From 195130f8153bb0fc733a242ce9d86d0056a4fccb Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 29 Feb 2024 17:43:11 -0500
Subject: [PATCH 001/667] Fix bug

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 46e17d4fd..851ae3560 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2429,7 +2429,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     // Compile the i-th ssm
     FFModel *ssm = get_ssm_model(i);
     im->compile_model_and_allocate_buffer(ssm);
-    assert(im->model_weights_loaders.find(llm) !=
+    assert(im->model_weights_loaders.find(ssm) !=
            im->model_weights_loaders.end());
     // Load model weights
     im->model_weights_loaders[ssm]->load_weights(ssm);

From eda4bd2faddb6f6899b684f7a02eee66fbdfdd38 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 14 Mar 2024 16:00:29 -0400
Subject: [PATCH 002/667] Add some comments on how to implement the new version
 of the SpecScheduler

---
 include/flexflow/batch_config.h    |  8 +--
 include/flexflow/request_manager.h | 44 +++++++++++---
 src/runtime/request_manager.cc     | 93 ++++++++++++++++++++++++++++++
 tests/inference_tests.sh           |  4 +-
 4 files changed, 135 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 5c126293c..71e56f78e 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -145,7 +145,7 @@ class BeamSearchBatchConfig : public BatchConfig {
   void print() const;
   void save_to_file(std::string const &filename) const;
   bool done() const;
-  int max_beam_depth_all_requests() const;
+  int max_beam_depth_all_requests() const; // Need to remove
   int current_depth_all_requests() const;
   int get_speculative_request_num() const;
 
@@ -154,11 +154,11 @@ class BeamSearchBatchConfig : public BatchConfig {
 
   // how many requests is in speculative phase
   int speculative_request_num = 0;
-  inline static int const MAX_BEAM_WIDTH = 3;
-  inline static int const MAX_BEAM_DEPTH = 8;
+  inline static int const MAX_BEAM_WIDTH = 3; // Need to remove
+  inline static int const MAX_BEAM_DEPTH = 8; // Need to remove
 
   // maximum tree branches for a request
-  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; // Need to remove
 
   int model_id;
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4763eb1ef..6ae267bc2 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -73,9 +73,11 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  std::vector<struct BeamTree> beam_trees;
+  std::vector<struct BeamTree> beam_trees; // Old version, delete after refactor
+  std::vector<struct TokenTree> token_trees; // New version
 };
 
+// The old version of beam tree
 // store the result of beam search
 struct BeamTree {
   struct treeLayer {
@@ -88,11 +90,36 @@ struct BeamTree {
   treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
 };
 
-// struct BeamTree_v2 {
-//   std::vector<BatchConfig::TokenId> tokens;
-//   std::vector<int> parent_ids;
-//   std::vector<float> probs;
-// };
+// The new version of BeamTree
+// Named as TokenTree, supports general tree structure.
+class TokenTree {
+  class Node {
+  public:
+    BatchConfig::TokenId id;
+    float unconditional_prob;
+    std::vector<std::shared_ptr<Node>> children;
+    std::shared_ptr<Node> parent;
+    Node(BatchConfig::TokenId id, float prob, std::shared_ptr<Node> parent)
+        : id(id), unconditional_prob(prob), parent(parent) {}
+  };
+
+  class TreeLayer {
+  public:
+    std::vector<std::shared_ptr<Node>> nodes;
+  };
+
+private:
+  std::vector<TreeLayer> layers;
+  // Do we need the root?
+  std::shared_ptr<Node> root;
+
+public:
+  TokenTree(BatchConfig::TokenId root_id, float root_prob)
+      : root(std::make_shared<Node>(root_id, root_prob, nullptr)) {
+    layers.push_back(TreeLayer());
+    layers[0].nodes.push_back(root);
+  }
+};
 
 class RequestManager {
 public:
@@ -104,7 +131,7 @@ class RequestManager {
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
 
-  static const RequestGuid INVALID_GUID = 0;
+  static RequestGuid const INVALID_GUID = 0;
   RequestManager();
   static RequestManager *get_request_manager();
   size_t get_num_processed_requests();
@@ -140,6 +167,7 @@ class RequestManager {
 
   void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
+  void serve_spec_infer_v2(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
   RequestGuid register_new_request(std::string const &prompt,
                                    int max_sequence_length);
@@ -263,7 +291,7 @@ class RequestManager {
   Status request_manager_status;
 
   // tree width in each speculative step, if not specified 1
-  std::vector<int> spec_infer_tree_width;
+  std::vector<int> spec_infer_tree_width; // Old version, delete after refactor
 
   // private fields
   std::unique_ptr<Tokenizer> tokenizer_;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 851ae3560..1f1f93183 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2503,6 +2503,99 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   }
 }
 
+/*static*/
+void RequestManager::serve_spec_infer_v2(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  {
+    // Compile the llm
+    im->compile_model_and_allocate_buffer(llm);
+    assert(im->model_weights_loaders.find(llm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[llm]->load_weights(llm);
+    // init operators
+    im->init_operators_inference(llm);
+  }
+  for (size_t i = 0; i < get_num_ssms(); i++) {
+    // Compile the i-th ssm
+    FFModel *ssm = get_ssm_model(i);
+    im->compile_model_and_allocate_buffer(ssm);
+    assert(im->model_weights_loaders.find(ssm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[ssm]->load_weights(ssm);
+    // init operators
+    im->init_operators_inference(ssm);
+  }
+
+  std::queue<std::pair<TreeVerifyBatchConfigFuture, InferenceResultFuture>>
+      batch_pipeline;
+  // Legion futures for inc_decoding and spec_infer
+  TreeVerifyBatchConfigFuture last_tree_bcf;
+  InferenceResultFuture last_tree_irf;
+  {
+    // Initialize futures for spec infer
+    TreeVerifyBatchConfig tree_bc;
+    InferenceResult tree_ir;
+    last_tree_bcf = Future::from_value<TreeVerifyBatchConfig>(tree_bc);
+    last_tree_irf = Future::from_value<InferenceResult>(tree_ir);
+  }
+  batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf));
+
+  while (!is_background_server_terminated()) {
+
+    if (batch_pipeline.size() >= 4) {
+      // Block here to avoid launching too many batches
+      auto const &batch = batch_pipeline.front();
+      batch.second.get_void_result();
+    }
+    // deque finished batches
+    while (batch_pipeline.size() > 1) {
+      auto const &batch = batch_pipeline.front();
+      if (batch.second.is_ready()) {
+        batch_pipeline.pop();
+      } else {
+        break;
+      }
+    }
+    auto const &next_batch = batch_pipeline.back();
+    BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
+        next_batch.first, next_batch.second, 0, ctx, runtime);
+    std::vector<BeamSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
+    for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {
+      beam_bcf_vec[ssm_id] = beam_bcf;
+    }
+    runtime->begin_trace(ctx, 12345 /*trace_id*/);
+
+    for (size_t i = 0; i < get_num_ssms(); i++) {
+      for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+           depth++) {
+        beam_bcf = beam_bcf_vec[i];
+
+        FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]);
+        assert(fm.get_future_map_domain().get_volume() == 1);
+        BeamInferenceResultFuture beam_irf = fm.get_future(0);
+        beam_bcf_vec[i] =
+            prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime);
+      }
+    }
+    // Token Tree Verification
+    {
+      TreeVerifyBatchConfigFuture tree_bcf =
+          prepare_next_batch_verify(beam_bcf_vec, ctx, runtime);
+      FutureMap fm = im->inference(llm, 0, tree_bcf);
+      assert(fm.get_future_map_domain().get_volume() == 1);
+      InferenceResultFuture tree_irf = fm.get_future(0);
+      batch_pipeline.push(std::make_pair(tree_bcf, tree_irf));
+      last_tree_bcf = tree_bcf;
+      last_tree_irf = tree_irf;
+    }
+    runtime->end_trace(ctx, 12345 /*trace_id*/);
+  }
+}
+
 void RequestManager::trigger_request_completion_future(
     RequestGuid const &guid) {
   const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh
index 895b74c79..3334939a1 100755
--- a/tests/inference_tests.sh
+++ b/tests/inference_tests.sh
@@ -10,9 +10,9 @@ cleanup() {
 cd "${BASH_SOURCE[0]%/*}"
 
 # Enable Python tests (on by default)
-PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-ON}
+PYTHON_INFERENCE_TESTS=${PYTHON_INFERENCE_TESTS:-OFF}
 # Enable C++ tests, (off by default)
-CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF}
+CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-ON}
 # Enable model parallelism tests in C++, if desired
 TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF}
 

From 95db61a5940e6f89a38f88a76bf9ba7120364558 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 17 Mar 2024 22:42:26 -0400
Subject: [PATCH 003/667] Add a modified API TreeSearchBatchConfig to replace
 BeamSearchBatchConfig

---
 include/flexflow/batch_config.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 71e56f78e..adee111b8 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -35,6 +35,7 @@ using InferenceResultFuture = Legion::Future;
 using BeamSearchBatchConfigFuture = Legion::Future;
 using TreeVerifyBatchConfigFuture = Legion::Future;
 using BeamInferenceResultFuture = Legion::Future;
+using TreeSearchBatchConfigFuture = Legion::Future;
 
 class BatchConfig {
 public:
@@ -189,6 +190,37 @@ class BeamSearchBatchConfig : public BatchConfig {
   size_t current_iteration;
 };
 
+class TreeSearchBatchConfig : public BatchConfig {
+public:
+  TreeSearchBatchConfig();
+  TreeSearchBatchConfig(int model_id);
+  TreeSearchBatchConfig(TreeSearchBatchConfig const &other, int model_id);
+  InferenceMode get_mode() const;
+
+  ~TreeSearchBatchConfig();
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  TreeSearchBatchConfig const &bc);
+  void print() const;
+  void save_to_file(std::string const &filename) const;
+  int current_depth_all_requests() const;
+  int get_speculative_request_num() const;
+
+  // how many requests is in speculative phase
+  int speculative_request_num = 0;
+  int model_id;
+
+  struct TreeSearchPerRequestInfo {
+    int current_depth = -1;
+
+    BatchConfig::TokenId tokens_arr[BatchConfig::MAX_NUM_TOKENS];
+    float probs_arr[BatchConfig::MAX_NUM_TOKENS];
+    int parent_id_arr[BatchConfig::MAX_NUM_TOKENS];
+  };
+
+  TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
+};
+
 struct BeamInferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId

From d0af3b8c486b9f648a0f56433e403de60cf9d0f5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 17 Mar 2024 22:44:03 -0400
Subject: [PATCH 004/667] Add new APIs using TreeSearchBatchConfig instead of
 BeamSearchBatchConfig in RequestManager

---
 include/flexflow/request_manager.h | 49 +++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6ae267bc2..1e716726b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -181,6 +181,7 @@ class RequestManager {
   // Methods to check and mark request completion
   bool is_request_completed(RequestGuid const &guid);
   void trigger_request_completion_future(RequestGuid const &guid);
+
   // Methods for preparing next batches
   BatchConfig prepare_next_batch(BatchConfig const &bc,
                                  InferenceResult const &result);
@@ -188,14 +189,6 @@ class RequestManager {
                                        InferenceResultFuture const &result,
                                        Legion::Context ctx,
                                        Legion::Runtime *runtime);
-  BeamSearchBatchConfig
-      prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
-                              BeamInferenceResult const &result);
-  BeamSearchBatchConfigFuture
-      prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
-                              BeamInferenceResultFuture const &result,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
   BeamSearchBatchConfig
       prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                               InferenceResult const &result,
@@ -206,6 +199,16 @@ class RequestManager {
                               int model_id,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
+
+  /* The APIs that need to be changed. */
+  BeamSearchBatchConfig
+      prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
+                              BeamInferenceResult const &result);
+  BeamSearchBatchConfigFuture
+      prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
+                              BeamInferenceResultFuture const &result,
+                              Legion::Context ctx,
+                              Legion::Runtime *runtime);
   TreeVerifyBatchConfig prepare_next_batch_verify(
       std::vector<BeamSearchBatchConfig> const &old_batches);
   TreeVerifyBatchConfigFuture prepare_next_batch_verify(
@@ -224,6 +227,36 @@ class RequestManager {
       traverse_beam_tree(BeamSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
+  /* The APIs that need to be changed. */
+
+  /* New APIs */
+  TreeSearchBatchConfig
+      prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
+                              BeamInferenceResult const &result);
+  TreeSearchBatchConfigFuture
+      prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
+                              BeamInferenceResultFuture const &result,
+                              Legion::Context ctx,
+                              Legion::Runtime *runtime);
+  TreeSearchBatchConfig TreeVerifyBatchConfig prepare_next_batch_verify(
+      std::vector<TreeSearchBatchConfig> const &old_batches);
+  TreeVerifyBatchConfigFuture prepare_next_batch_verify(
+      std::vector<TreeSearchBatchConfigFuture> const &old_batches,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+
+  void store_beam_metadata(TreeSearchBatchConfig const &old_bc,
+                           BeamInferenceResult const &result);
+  void update_beam_metadata(TreeSearchBatchConfig &new_bc,
+                            TreeSearchBatchConfig const &old_bc,
+                            Token &tree,
+                            int request_index);
+
+  std::vector<std::pair<BatchConfig::TokenId, int>>
+      traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
+                         int request_index,
+                         int first_token_depth_in_request);
+  /* New APIs */
 
   // remove guid after put the cached tree in request
   std::vector<std::pair<BatchConfig::TokenId, int>> merge_dfs_trees(

From d069dd58aa942b749d5c1c69c6f65e164a54877a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 24 Mar 2024 21:18:53 -0400
Subject: [PATCH 005/667] Rename BeamSearchBatchConfig to TreeSearchBatchConfig

---
 include/flexflow/batch_config.h               |  72 +------
 include/flexflow/config.h                     |   4 +-
 include/flexflow/ops/arg_topk.h               |   8 +-
 include/flexflow/ops/beam_topk.h              |   8 +-
 .../ops/spec_inc_multihead_self_attention.h   |  14 +-
 include/flexflow/request_manager.h            |  78 ++++---
 inference/models/falcon.h                     |   4 +-
 inference/models/llama.h                      |   4 +-
 inference/models/mpt.h                        |   4 +-
 inference/models/opt.h                        |   4 +-
 inference/models/starcoder.h                  |   4 +-
 src/c/flexflow_c.cc                           | 200 +++++++++---------
 src/ops/arg_topk.cc                           |  10 +-
 src/ops/beam_topk.cc                          |  12 +-
 src/ops/beam_topk.cu                          |  30 +--
 src/ops/inc_multihead_self_attention.cpp      |   8 +-
 src/ops/inc_multihead_self_attention.cu       |   4 +-
 src/ops/spec_inc_multihead_self_attention.cc  |   4 +-
 src/ops/spec_inc_multihead_self_attention.cpp |  32 +--
 src/ops/spec_inc_multihead_self_attention.cu  |  10 +-
 src/runtime/batch_config.cc                   |   2 +-
 src/runtime/beam_search_batch_config.cc       |  34 +--
 src/runtime/inference_manager.cc              |   6 +-
 src/runtime/model.cc                          | 112 +++++-----
 src/runtime/request_manager.cc                |  93 ++++----
 src/runtime/request_manager.cpp               |  12 +-
 src/runtime/request_manager.cu                |  12 +-
 27 files changed, 375 insertions(+), 410 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index adee111b8..62829da55 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -131,65 +131,6 @@ struct InferenceResult {
   BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
 };
 
-class BeamSearchBatchConfig : public BatchConfig {
-public:
-  BeamSearchBatchConfig();
-  BeamSearchBatchConfig(int model_id);
-  BeamSearchBatchConfig(size_t beam_width, size_t target_iterations);
-  BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id);
-  InferenceMode get_mode() const;
-
-  ~BeamSearchBatchConfig();
-
-  friend std::ostream &operator<<(std::ostream &os,
-                                  BeamSearchBatchConfig const &bc);
-  void print() const;
-  void save_to_file(std::string const &filename) const;
-  bool done() const;
-  int max_beam_depth_all_requests() const; // Need to remove
-  int current_depth_all_requests() const;
-  int get_speculative_request_num() const;
-
-  size_t beam_width;
-  size_t target_iterations;
-
-  // how many requests is in speculative phase
-  int speculative_request_num = 0;
-  inline static int const MAX_BEAM_WIDTH = 3; // Need to remove
-  inline static int const MAX_BEAM_DEPTH = 8; // Need to remove
-
-  // maximum tree branches for a request
-  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3; // Need to remove
-
-  int model_id;
-
-  struct BeamSearchPerRequestInfo {
-    int beam_size;
-    int current_depth = -1;
-    int max_depth = MAX_BEAM_DEPTH;
-
-    BatchConfig::TokenId
-        tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int sub_request_num;
-  };
-
-  struct BeamSearchPerTokenInfo {
-    int sub_request_index;
-  };
-
-  BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS];
-  BeamSearchPerTokenInfo
-      beamTokenInfo[MAX_NUM_TOKENS +
-                    MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS];
-
-  int sub_requests[MAX_NUM_REQUESTS];
-
-private:
-  size_t current_iteration;
-};
-
 class TreeSearchBatchConfig : public BatchConfig {
 public:
   TreeSearchBatchConfig();
@@ -206,16 +147,15 @@ class TreeSearchBatchConfig : public BatchConfig {
   int current_depth_all_requests() const;
   int get_speculative_request_num() const;
 
+  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+
   // how many requests is in speculative phase
   int speculative_request_num = 0;
   int model_id;
 
   struct TreeSearchPerRequestInfo {
     int current_depth = -1;
-
-    BatchConfig::TokenId tokens_arr[BatchConfig::MAX_NUM_TOKENS];
-    float probs_arr[BatchConfig::MAX_NUM_TOKENS];
-    int parent_id_arr[BatchConfig::MAX_NUM_TOKENS];
+    int num_tokens_in_layer;
   };
 
   TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
@@ -225,11 +165,11 @@ struct BeamInferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId
       token_ids[MAX_NUM_TOKENS *
-                BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+                TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
   float probs[MAX_NUM_TOKENS *
-              BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+              TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
   int parent_id[MAX_NUM_TOKENS *
-                BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+                TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 17a3f59e2..f9d901323 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -81,8 +81,8 @@ struct FFHandler {
   // request info + token info + topolopgy mask info
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-      sizeof(BeamSearchBatchConfig::beamTokenInfo) +
-      sizeof(BeamSearchBatchConfig::beamRequestsInfo) +
+      sizeof(TreeSearchBatchConfig::beamTokenInfo) +
+      sizeof(TreeSearchBatchConfig::beamRequestsInfo) +
       sizeof(BatchConfig::causalMask) +
       sizeof(TreeVerifyBatchConfig::committed_tokens) +
       sizeof(BatchConfig::request_completed);
diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 3822a5e41..2b8f858ec 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -22,7 +22,7 @@ class ArgTopK : public Op {
   using Input = ParallelTensor;
   ArgTopK(FFModel &model,
           LayerID const &layer_guid,
-          const ParallelTensor input,
+          ParallelTensor const input,
           int k,
           bool sorted,
           bool speculative_decoding,
@@ -30,7 +30,7 @@ class ArgTopK : public Op {
   ArgTopK(FFModel &model,
           LayerID const &layer_guid,
           ArgTopK const &other,
-          const ParallelTensor input);
+          ParallelTensor const input);
   ArgTopK(FFModel &model,
           Params const &params,
           Input const input,
@@ -89,14 +89,14 @@ class ArgTopK : public Op {
                              int length,
                              int k,
                              bool sorted,
-                             BeamSearchBatchConfig const *bc,
+                             TreeSearchBatchConfig const *bc,
                              ffStream_t stream);
   static void forward_kernel_wrapper(ArgTopKMeta const *m,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &prob,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     BeamSearchBatchConfig const *bc);
+                                     TreeSearchBatchConfig const *bc);
   Params get_params() const;
 
 public:
diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h
index 9466ba2a3..5427ccd0d 100644
--- a/include/flexflow/ops/beam_topk.h
+++ b/include/flexflow/ops/beam_topk.h
@@ -30,12 +30,12 @@ class BeamTopK : public Op {
   using Params = BeamTopKParams;
   using Input = ParallelTensor;
   BeamTopK(FFModel &model,
-           const ParallelTensor input,
+           ParallelTensor const input,
            LayerID const &_layer_guid,
            int max_beam_width,
            bool sorted,
            char const *name);
-  BeamTopK(FFModel &model, BeamTopK const &other, const ParallelTensor input);
+  BeamTopK(FFModel &model, BeamTopK const &other, ParallelTensor const input);
   BeamTopK(FFModel &model,
            Params const &params,
            Input const input,
@@ -82,7 +82,7 @@ class BeamTopK : public Op {
                              CostMetrics &cost_metrics) const override;
   template <typename DT>
   static void forward_kernel(BeamTopKMeta const *m,
-                             BeamSearchBatchConfig const *bc,
+                             TreeSearchBatchConfig const *bc,
                              DT const *input_ptr,
                              float *output_ptr,
                              int *indices_ptr,
@@ -92,7 +92,7 @@ class BeamTopK : public Op {
                              bool sorted,
                              ffStream_t stream);
   static void forward_kernel_wrapper(BeamTopKMeta const *m,
-                                     BeamSearchBatchConfig const *bc,
+                                     TreeSearchBatchConfig const *bc,
                                      GenericTensorAccessorR const &input,
                                      float *output_ptr,
                                      int *indices_ptr,
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index a0d01092b..6ba52fe52 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -26,7 +26,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
 
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 LayerID const &layer_guid,
-                                const ParallelTensor _input,
+                                ParallelTensor const _input,
                                 int _embed_dim,
                                 int _num_q_heads,
                                 int _num_kv_heads,
@@ -44,8 +44,8 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool allocate_weights,
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
-                                const ParallelTensor _input,
-                                const ParallelTensor _weight,
+                                ParallelTensor const _input,
+                                ParallelTensor const _weight,
                                 int _embed_dim,
                                 int _num_q_heads,
                                 int _num_kv_heads,
@@ -64,7 +64,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 SpecIncMultiHeadSelfAttention const &other,
-                                const ParallelTensor input,
+                                ParallelTensor const input,
                                 bool allocate_weights);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 Params const &params,
@@ -109,7 +109,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
 
   static void
       inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m,
-                               BeamSearchBatchConfig const *bc,
+                               TreeSearchBatchConfig const *bc,
                                int shard_id,
                                GenericTensorAccessorR const &input,
                                GenericTensorAccessorR const &weight,
@@ -140,8 +140,8 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 
 public:
   Realm::RegionInstance beam_search_reserve_inst;
-  BeamSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos;
-  BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos;
+  TreeSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos;
+  TreeSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos;
   bool *request_completed;
   BatchConfig::BitMask *causalMask;
 };
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1e716726b..9037707ea 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -81,13 +81,13 @@ struct Request {
 // store the result of beam search
 struct BeamTree {
   struct treeLayer {
-    BeamSearchBatchConfig::TokenId
-        tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int parent_ids[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    TreeSearchBatchConfig::TokenId
+        tokens[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    int parent_ids[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+    float probs[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
     int nodes_num_this_layer = 0;
   };
-  treeLayer treeLayers[BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1];
+  treeLayer treeLayers[TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1];
 };
 
 // The new version of BeamTree
@@ -189,7 +189,16 @@ class RequestManager {
                                        InferenceResultFuture const &result,
                                        Legion::Context ctx,
                                        Legion::Runtime *runtime);
-  BeamSearchBatchConfig
+  /* The APIs that need to be changed. */
+  TreeSearchBatchConfig
+      prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
+                              BeamInferenceResult const &result);
+  BeamSearchBatchConfigFuture
+      prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
+                              BeamInferenceResultFuture const &result,
+                              Legion::Context ctx,
+                              Legion::Runtime *runtime);
+  TreeSearchBatchConfig
       prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                               InferenceResult const &result,
                               int model_id);
@@ -200,31 +209,22 @@ class RequestManager {
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
 
-  /* The APIs that need to be changed. */
-  BeamSearchBatchConfig
-      prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
-                              BeamInferenceResult const &result);
-  BeamSearchBatchConfigFuture
-      prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
-                              BeamInferenceResultFuture const &result,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
   TreeVerifyBatchConfig prepare_next_batch_verify(
-      std::vector<BeamSearchBatchConfig> const &old_batches);
+      std::vector<TreeSearchBatchConfig> const &old_batches);
   TreeVerifyBatchConfigFuture prepare_next_batch_verify(
       std::vector<BeamSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void store_beam_metadata(BeamSearchBatchConfig const &old_bc,
+  void store_beam_metadata(TreeSearchBatchConfig const &old_bc,
                            BeamInferenceResult const &result);
-  void update_beam_metadata(BeamSearchBatchConfig &new_bc,
-                            BeamSearchBatchConfig const &old_bc,
+  void update_beam_metadata(TreeSearchBatchConfig &new_bc,
+                            TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
                             int request_index);
 
   std::vector<std::pair<BatchConfig::TokenId, int>>
-      traverse_beam_tree(BeamSearchBatchConfig const &old_bc,
+      traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
   /* The APIs that need to be changed. */
@@ -238,22 +238,32 @@ class RequestManager {
                               BeamInferenceResultFuture const &result,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
-  TreeSearchBatchConfig TreeVerifyBatchConfig prepare_next_batch_verify(
+  TreeSearchBatchConfig
+      prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
+                              InferenceResult const &result,
+                              int model_id);
+  TreeSearchBatchConfigFuture
+      prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
+                              InferenceResultFuture const &result,
+                              int model_id,
+                              Legion::Context ctx,
+                              Legion::Runtime *runtime);
+  TreeSearchBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
   TreeVerifyBatchConfigFuture prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void store_beam_metadata(TreeSearchBatchConfig const &old_bc,
+  void store_spec_metadata(TreeSearchBatchConfig const &old_bc,
                            BeamInferenceResult const &result);
-  void update_beam_metadata(TreeSearchBatchConfig &new_bc,
+  void update_spec_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             Token &tree,
                             int request_index);
 
   std::vector<std::pair<BatchConfig::TokenId, int>>
-      traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
+      traverse_spec_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
   /* New APIs */
@@ -298,17 +308,33 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  static BeamSearchBatchConfig prepare_next_batch_beam_task(
+  /* APIs to modify */
+  static TreeSearchBatchConfig prepare_next_batch_beam_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  static BeamSearchBatchConfig prepare_next_batch_init_task(
+  static TreeSearchBatchConfig prepare_next_batch_init_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
+  /* APIs to modify */
+
+  /* New APIs */
+  static TreeSearchBatchConfig prepare_next_batch_spec_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+
+  static TreeSearchBatchConfig prepare_next_batch_init_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  /* New APIs */
 
   static TreeVerifyBatchConfig prepare_next_batch_verify_task(
       Legion::Task const *task,
diff --git a/inference/models/falcon.h b/inference/models/falcon.h
index fce2dade3..ccbe6ae79 100644
--- a/inference/models/falcon.h
+++ b/inference/models/falcon.h
@@ -61,8 +61,8 @@ class FALCON {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
     }
 
     void print() const {
diff --git a/inference/models/llama.h b/inference/models/llama.h
index ba1f0236f..3d8a5d1df 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -51,8 +51,8 @@ class LLAMA {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
     }
 
     void print() const {
diff --git a/inference/models/mpt.h b/inference/models/mpt.h
index 08597e1d7..7cfec2687 100644
--- a/inference/models/mpt.h
+++ b/inference/models/mpt.h
@@ -48,8 +48,8 @@ class MPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
     }
 
     void print() const {
diff --git a/inference/models/opt.h b/inference/models/opt.h
index 7c736a26d..14a1f087d 100644
--- a/inference/models/opt.h
+++ b/inference/models/opt.h
@@ -56,8 +56,8 @@ class OPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
     }
 
     void print() const {
diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h
index 0e9577d56..f19db4a1f 100644
--- a/inference/models/starcoder.h
+++ b/inference/models/starcoder.h
@@ -53,8 +53,8 @@ class STARCODER {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
     }
 
     void print() const {}
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 9ad58695a..5fbe7b19f 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -35,7 +35,9 @@ class FFCObjectWrapper {
     t_.impl = const_cast<void *>(static_cast<void const *>(t));                \
     return t_;                                                                 \
   }                                                                            \
-  static T unwrap(T_ t_) { return static_cast<T>(t_.impl); }                   \
+  static T unwrap(T_ t_) {                                                     \
+    return static_cast<T>(t_.impl);                                            \
+  }                                                                            \
   static const T unwrap_const(const T_ t_) {                                   \
     return static_cast<const T>(t_.impl);                                      \
   }
@@ -62,7 +64,7 @@ class FFCObjectWrapper {
   FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t,
                         TreeVerifyBatchConfig *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t,
-                        BeamSearchBatchConfig *);
+                        TreeSearchBatchConfig *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *);
@@ -253,56 +255,56 @@ void flexflow_model_zero_gradients(flexflow_model_t handle_) {
 }
 
 flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
+                                         flexflow_tensor_t const x_,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
   Tensor tensor = handle->exp(x, name);
   DEBUG_PRINT("[Exp] new Tensor %p, x %p, name %s", tensor, x, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
+                                         flexflow_tensor_t const x_,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
   Tensor tensor = handle->sin(x, name);
   DEBUG_PRINT("[Sin] new Tensor %p, x %p, name %s", tensor, x, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
+                                         flexflow_tensor_t const x_,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
   Tensor tensor = handle->cos(x, name);
   DEBUG_PRINT("[Cos] new Tensor %p, x %p, name %s", tensor, x, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
-                                         const flexflow_tensor_t y_,
+                                         flexflow_tensor_t const x_,
+                                         flexflow_tensor_t const y_,
                                          bool inplace_a,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->add(x, y, inplace_a, name);
   DEBUG_PRINT("[Add] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_,
-                                              const flexflow_tensor_t x_,
-                                              const flexflow_tensor_t y_,
+                                              flexflow_tensor_t const x_,
+                                              flexflow_tensor_t const y_,
                                               bool inplace_a,
                                               char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->subtract(x, y, inplace_a, name);
   DEBUG_PRINT(
       "[Subtract] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
@@ -310,13 +312,13 @@ flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_,
-                                              const flexflow_tensor_t x_,
-                                              const flexflow_tensor_t y_,
+                                              flexflow_tensor_t const x_,
+                                              flexflow_tensor_t const y_,
                                               bool inplace_a,
                                               char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->multiply(x, y, inplace_a, name);
   DEBUG_PRINT(
       "[Multiply] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
@@ -324,13 +326,13 @@ flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_,
-                                            const flexflow_tensor_t x_,
-                                            const flexflow_tensor_t y_,
+                                            flexflow_tensor_t const x_,
+                                            flexflow_tensor_t const y_,
                                             bool inplace_a,
                                             char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->divide(x, y, inplace_a, name);
   DEBUG_PRINT(
       "[Divide] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
@@ -338,33 +340,33 @@ flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
-                                         const flexflow_tensor_t y_,
+                                         flexflow_tensor_t const x_,
+                                         flexflow_tensor_t const y_,
                                          bool inplace_a,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->max(x, y, inplace_a, name);
   DEBUG_PRINT("[Max] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle_,
-                                         const flexflow_tensor_t x_,
-                                         const flexflow_tensor_t y_,
+                                         flexflow_tensor_t const x_,
+                                         flexflow_tensor_t const y_,
                                          bool inplace_a,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor x = FFCObjectWrapper::unwrap_const(x_);
-  const Tensor y = FFCObjectWrapper::unwrap_const(y_);
+  Tensor const x = FFCObjectWrapper::unwrap_const(x_);
+  Tensor const y = FFCObjectWrapper::unwrap_const(y_);
   Tensor tensor = handle->min(x, y, inplace_a, name);
   DEBUG_PRINT("[Min] new Tensor %p, x %p, y %p, name %s", tensor, x, y, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 int *axes,
                                                 int n,
                                                 bool keepdims,
@@ -385,21 +387,21 @@ flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_,
-                                           const flexflow_tensor_t input_,
+                                           flexflow_tensor_t const input_,
                                            char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
   Tensor tensor = handle->rsqrt(input, name);
   DEBUG_PRINT("[Rsqrt] new Tensor %p, input %p, name %s", tensor, input, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
 flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_,
-                                         const flexflow_tensor_t input_,
+                                         flexflow_tensor_t const input_,
                                          float const exponent,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
   Tensor tensor = handle->pow(input, exponent, name);
   DEBUG_PRINT("[Pow] new Tensor %p, input %p, exponent %f, name %s",
               tensor,
@@ -410,13 +412,13 @@ flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input_,
+                                          flexflow_tensor_t const input_,
                                           int *dims,
                                           int n,
                                           bool keepdims,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
   std::vector<int> dims_vec;
   char cbuffer[256];
   char *cbuffer_ptr = cbuffer;
@@ -441,7 +443,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_,
 
 flexflow_tensor_t
     flexflow_model_add_conv2d(flexflow_model_t handle_,
-                              const flexflow_tensor_t input_,
+                              flexflow_tensor_t const input_,
                               int out_channels,
                               int kernel_h,
                               int kernel_w,
@@ -457,7 +459,7 @@ flexflow_tensor_t
                               flexflow_initializer_t bias_initializer_,
                               char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap_const(input_);
+  Tensor const input = FFCObjectWrapper::unwrap_const(input_);
   Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
@@ -505,7 +507,7 @@ flexflow_tensor_t
 
 flexflow_tensor_t
     flexflow_model_add_embedding(flexflow_model_t handle_,
-                                 const flexflow_tensor_t input_,
+                                 flexflow_tensor_t const input_,
                                  int num_entries,
                                  int out_dim,
                                  enum AggrMode aggr,
@@ -514,7 +516,7 @@ flexflow_tensor_t
                                  flexflow_initializer_t kernel_initializer_,
                                  char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap_const(input_);
+  Tensor const input = FFCObjectWrapper::unwrap_const(input_);
   Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
@@ -588,7 +590,7 @@ flexflow_tensor_t
 }
 
 flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 bool relu,
                                                 char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -608,7 +610,7 @@ flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 int n,
                                                 int *axes,
                                                 bool elementwise_affine,
@@ -616,7 +618,7 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_,
                                                 bool use_bias,
                                                 char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
   std::vector<int> axes_vec;
   for (int i = 0; i < n; i++) {
     axes_vec.push_back(axes[i]);
@@ -640,9 +642,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle_,
 
 flexflow_tensor_t *
     flexflow_model_add_residual_layer_norm(flexflow_model_t handle_,
-                                           const flexflow_tensor_t input_,
-                                           const flexflow_tensor_t residual1_,
-                                           const flexflow_tensor_t residual2_,
+                                           flexflow_tensor_t const input_,
+                                           flexflow_tensor_t const residual1_,
+                                           flexflow_tensor_t const residual2_,
                                            bool use_two_residuals,
                                            int n,
                                            int *axes,
@@ -651,9 +653,9 @@ flexflow_tensor_t *
                                            bool use_bias,
                                            char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
-  const Tensor residual1 = FFCObjectWrapper::unwrap(residual1_);
-  const Tensor residual2 =
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
+  Tensor const residual1 = FFCObjectWrapper::unwrap(residual1_);
+  Tensor const residual2 =
       use_two_residuals ? FFCObjectWrapper::unwrap(residual2_) : nullptr;
   Tensor tensor_outputs[2];
   std::vector<int> axes_vec;
@@ -699,8 +701,8 @@ flexflow_tensor_t *
 
 flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
-    const flexflow_tensor_t residual_,
+    flexflow_tensor_t const input_,
+    flexflow_tensor_t const residual_,
     int n,
     int *axes,
     bool elementwise_affine,
@@ -708,8 +710,8 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
     bool use_bias,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap(input_);
-  const Tensor residual = FFCObjectWrapper::unwrap(residual_);
+  Tensor const input = FFCObjectWrapper::unwrap(input_);
+  Tensor const residual = FFCObjectWrapper::unwrap(residual_);
   Tensor tensor_outputs[2];
   std::vector<int> axes_vec;
   for (int i = 0; i < n; i++) {
@@ -746,12 +748,12 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
 
 flexflow_tensor_t
     flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input1_,
-                                          const flexflow_tensor_t input2_,
+                                          flexflow_tensor_t const input1_,
+                                          flexflow_tensor_t const input2_,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input1 = FFCObjectWrapper::unwrap(input1_);
-  const Tensor input2 = FFCObjectWrapper::unwrap(input2_);
+  Tensor const input1 = FFCObjectWrapper::unwrap(input1_);
+  Tensor const input2 = FFCObjectWrapper::unwrap(input2_);
   Tensor tensor =
       handle->sigmoid_silu_multi(input1, input2, input1->data_type, name);
   DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s",
@@ -763,8 +765,8 @@ flexflow_tensor_t
 }
 
 flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_,
-                                                  const flexflow_tensor_t a_,
-                                                  const flexflow_tensor_t b_,
+                                                  flexflow_tensor_t const a_,
+                                                  flexflow_tensor_t const b_,
                                                   int a_seq_length_dim,
                                                   int b_seq_length_dim) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -778,7 +780,7 @@ flexflow_tensor_t flexflow_model_add_batch_matmul(flexflow_model_t handle_,
 
 flexflow_tensor_t flexflow_model_add_dense(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int out_dim,
     enum ActiMode activation /* AC_MODE_NONE */,
     bool use_bias /* true */,
@@ -790,7 +792,7 @@ flexflow_tensor_t flexflow_model_add_dense(
     float kernel_reg_lambda,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  const Tensor input = FFCObjectWrapper::unwrap_const(input_);
+  Tensor const input = FFCObjectWrapper::unwrap_const(input_);
   Layer *shared_op = FFCObjectWrapper::unwrap(shared_op_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
@@ -896,8 +898,8 @@ flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_,
-                                            const flexflow_tensor_t input_,
-                                            const flexflow_tensor_t index_,
+                                            flexflow_tensor_t const input_,
+                                            flexflow_tensor_t const index_,
                                             int dim,
                                             char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -914,7 +916,7 @@ flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_,
-                                             const flexflow_tensor_t input_,
+                                             flexflow_tensor_t const input_,
                                              int dim,
                                              char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -926,7 +928,7 @@ flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_,
-                                               const flexflow_tensor_t input_,
+                                               flexflow_tensor_t const input_,
                                                int n,
                                                int *perm,
                                                char const *name) {
@@ -946,7 +948,7 @@ flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_,
-                                             const flexflow_tensor_t input_,
+                                             flexflow_tensor_t const input_,
                                              int n,
                                              int *shape,
                                              char const *name) {
@@ -966,7 +968,7 @@ flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_,
-                                             const flexflow_tensor_t input_,
+                                             flexflow_tensor_t const input_,
                                              int axis,
                                              char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -982,7 +984,7 @@ flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle_,
 
 flexflow_tensor_t
     flexflow_model_add_scalar_multiply(flexflow_model_t handle_,
-                                       const flexflow_tensor_t input_,
+                                       flexflow_tensor_t const input_,
                                        float const scalar,
                                        bool inplace,
                                        char const *name) {
@@ -998,7 +1000,7 @@ flexflow_tensor_t
 }
 
 flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 float const scalar,
                                                 bool inplace,
                                                 char const *name) {
@@ -1014,7 +1016,7 @@ flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 float const scalar,
                                                 bool inplace,
                                                 char const *name) {
@@ -1032,7 +1034,7 @@ flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle_,
 
 flexflow_tensor_t
     flexflow_model_add_scalar_truediv(flexflow_model_t handle_,
-                                      const flexflow_tensor_t input_,
+                                      flexflow_tensor_t const input_,
                                       float const scalar,
                                       bool inplace,
                                       char const *name) {
@@ -1049,7 +1051,7 @@ flexflow_tensor_t
 }
 
 flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input_,
+                                          flexflow_tensor_t const input_,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1059,7 +1061,7 @@ flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_,
-                                              const flexflow_tensor_t input_,
+                                              flexflow_tensor_t const input_,
                                               char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1070,7 +1072,7 @@ flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input_,
+                                          flexflow_tensor_t const input_,
                                           bool inplace,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -1081,7 +1083,7 @@ flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_,
-                                             const flexflow_tensor_t input_,
+                                             flexflow_tensor_t const input_,
                                              char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1092,7 +1094,7 @@ flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input_,
+                                          flexflow_tensor_t const input_,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1102,7 +1104,7 @@ flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_,
-                                         const flexflow_tensor_t input_,
+                                         flexflow_tensor_t const input_,
                                          bool inplace,
                                          char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -1113,7 +1115,7 @@ flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_,
-                                             const flexflow_tensor_t input_,
+                                             flexflow_tensor_t const input_,
                                              float rate,
                                              unsigned long long seed,
                                              char const *name) {
@@ -1131,9 +1133,9 @@ flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle_,
 
 flexflow_tensor_t flexflow_model_add_multihead_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t query_,
-    const flexflow_tensor_t key_,
-    const flexflow_tensor_t value_,
+    flexflow_tensor_t const query_,
+    flexflow_tensor_t const key_,
+    flexflow_tensor_t const value_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -1186,7 +1188,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -1229,7 +1231,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -1273,7 +1275,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -1317,7 +1319,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
 
 flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -1362,7 +1364,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -1408,7 +1410,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -1453,7 +1455,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
 }
 
 flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
-                                              const flexflow_tensor_t input_,
+                                              flexflow_tensor_t const input_,
                                               float eps,
                                               int dim,
                                               char const *name) {
@@ -1465,8 +1467,8 @@ flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
 
 flexflow_tensor_t *
     flexflow_model_add_residual_rms_norm(flexflow_model_t handle_,
-                                         const flexflow_tensor_t input1_,
-                                         const flexflow_tensor_t input2_,
+                                         flexflow_tensor_t const input1_,
+                                         flexflow_tensor_t const input2_,
                                          float eps,
                                          int dim,
                                          char const *name) {
@@ -1486,7 +1488,7 @@ flexflow_tensor_t *
 }
 
 flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
-                                               const flexflow_tensor_t input_,
+                                               flexflow_tensor_t const input_,
                                                int k,
                                                bool sorted,
                                                bool speculative_decoding,
@@ -1499,7 +1501,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 int max_beam_size,
                                                 bool sorted,
                                                 char const *name) {
@@ -1510,7 +1512,7 @@ flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
-                                              const flexflow_tensor_t input_,
+                                              flexflow_tensor_t const input_,
                                               float top_p,
                                               char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -1520,7 +1522,7 @@ flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
 }
 
 flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
-                                            const flexflow_tensor_t input_,
+                                            flexflow_tensor_t const input_,
                                             bool beam_search,
                                             char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
@@ -2545,14 +2547,14 @@ void flexflow_tree_verify_batch_config_destroy(
 
 flexflow_beam_search_batch_config_t
     flexflow_beam_search_batch_config_create(void) {
-  BeamSearchBatchConfig *config = new BeamSearchBatchConfig();
+  TreeSearchBatchConfig *config = new TreeSearchBatchConfig();
   DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config);
   return FFCObjectWrapper::wrap(config);
 }
 
 void flexflow_beam_search_batch_config_destroy(
     flexflow_beam_search_batch_config_t handle_) {
-  BeamSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
+  TreeSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
   DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle);
   delete handle;
 }
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 780a77450..a18bd404d 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -48,7 +48,7 @@ using PCG::Node;
 // For an input tensor, computes the top k entries in each row
 // (resp. vector along the last dimension). Thus,
 // values.shape = indices.shape = input.shape[:-1] + [k]
-Tensor FFModel::arg_top_k(const Tensor input,
+Tensor FFModel::arg_top_k(Tensor const input,
                           int k,
                           bool sorted,
                           bool speculative_decoding,
@@ -130,7 +130,7 @@ bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) {
 
 ArgTopK::ArgTopK(FFModel &model,
                  LayerID const &_layer_guid,
-                 const ParallelTensor _input,
+                 ParallelTensor const _input,
                  int _k,
                  bool _sorted,
                  bool _speculative_decoding,
@@ -167,7 +167,7 @@ ArgTopK::ArgTopK(FFModel &model,
 ArgTopK::ArgTopK(FFModel &model,
                  LayerID const &layer_guid,
                  ArgTopK const &other,
-                 const ParallelTensor input)
+                 ParallelTensor const input)
     : ArgTopK(model,
               layer_guid,
               input,
@@ -411,8 +411,8 @@ BeamInferenceResult ArgTopK::inference_speculative_task(
     Runtime *runtime) {
   assert(regions.size() == 3);
   assert(task->regions.size() == 3);
-  BeamSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+  TreeSearchBatchConfig const &bc =
+      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
   if (bc.num_active_tokens() == 0) {
     // Directly return for empty batch config
     BeamInferenceResult ir;
diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc
index d2054cacb..5aa6cf5b3 100644
--- a/src/ops/beam_topk.cc
+++ b/src/ops/beam_topk.cc
@@ -48,7 +48,7 @@ using PCG::Node;
 // For an input tensor, computes the top k entries in each row
 // (resp. vector along the last dimension). Thus,
 // values.shape = indices.shape = input.shape[:-1] + [k]
-Tensor FFModel::beam_top_k(const Tensor input,
+Tensor FFModel::beam_top_k(Tensor const input,
                            int max_beam_width,
                            bool sorted,
                            char const *name) {
@@ -122,7 +122,7 @@ bool operator==(BeamTopKParams const &lhs, BeamTopKParams const &rhs) {
 }
 
 BeamTopK::BeamTopK(FFModel &model,
-                   const ParallelTensor _input,
+                   ParallelTensor const _input,
                    LayerID const &_layer_guid,
                    int _max_beam_width,
                    bool _sorted,
@@ -153,7 +153,7 @@ BeamTopK::BeamTopK(FFModel &model,
 
 BeamTopK::BeamTopK(FFModel &model,
                    BeamTopK const &other,
-                   const ParallelTensor input)
+                   ParallelTensor const input)
     : BeamTopK(model,
                input,
                other.layer_guid,
@@ -163,7 +163,7 @@ BeamTopK::BeamTopK(FFModel &model,
 
 BeamTopK::BeamTopK(FFModel &model,
                    BeamTopKParams const &params,
-                   const ParallelTensor input,
+                   ParallelTensor const input,
                    char const *name)
     : BeamTopK(model,
                input,
@@ -351,8 +351,8 @@ BeamInferenceResult
   assert(task->regions.size() == 4);
 
   BeamTopKMeta *m = *((BeamTopKMeta **)task->local_args);
-  BeamSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+  TreeSearchBatchConfig const &bc =
+      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
 
   if (bc.num_tokens == 0) {
     BeamInferenceResult ir;
diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu
index a958786be..48f057f98 100644
--- a/src/ops/beam_topk.cu
+++ b/src/ops/beam_topk.cu
@@ -290,7 +290,7 @@ __device__ void mergeBeamShards(int num_shards,
     // Initialize the heap as a min-heap.
     for (int slot = 0; slot < heap_size; slot++) {
       // int beam = (slot % max_heap_size) / k;
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                      ((slot % max_heap_size) / k)];
       min_heap.assign(slot, {slot, (entries[slot].value * prob)});
       if (verbose && batch_index == 0) {
@@ -307,7 +307,7 @@ __device__ void mergeBeamShards(int num_shards,
       auto const entry = entries[shard];
       auto const root = min_heap.root();
 
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                      ((shard % max_heap_size) / k)];
       if (verbose && batch_index == 0) {
         printf("shard %d, index %d, value %.15f, prob %.15f\n",
@@ -347,11 +347,11 @@ __device__ void mergeBeamShards(int num_shards,
       int shard_index = max_element.index;
       top_k_indices[rank] = entries[shard_index].index;
       top_k_parents[rank] =
-          parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+          parent_id[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                     ((shard_index % max_heap_size) / k)];
       int next_shard_index = shard_index + num_shards;
 
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                      ((next_shard_index % max_heap_size) / k)];
       // if (batch_index == 0) {
       //   printf("next_shard_index %d, value %.15f, prob %.15f\n",
@@ -370,7 +370,7 @@ __device__ void mergeBeamShards(int num_shards,
     int shard_index = max_element.index;
     top_k_indices[last_k] = entries[shard_index].index;
     top_k_parents[last_k] =
-        parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+        parent_id[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                   ((shard_index % max_heap_size) / k)];
   }
 }
@@ -379,9 +379,9 @@ template <typename T>
 __global__ void
     mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) {
   using T_ACC = T;
-  const int64_t i = blockIdx.x;
+  int64_t const i = blockIdx.x;
   for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
-    const int64_t index = i * N + j;
+    int64_t const index = i * N + j;
     Y[index] = static_cast<T_ACC>(X[index]) * static_cast<T_ACC>(rstd[i]);
   }
 }
@@ -434,9 +434,9 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input,
            gpu_block_start_index[batch_index] +
                (sub_request_id * token_nums * length),
            batch_input,
-           request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id,
+           request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id,
            static_cast<float>(
-               acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+               acc_probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                          sub_request_id]),
            thread_count,
            request_id);
@@ -539,7 +539,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
 
   // a data structure for prob, parent_id,
   int max_total_requests =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests();
+      TreeSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests();
   int parent_ids[max_total_requests];
   DT acc_probs[max_total_requests];
 
@@ -559,9 +559,9 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
     assert(sub_requests[i] > 0);
     // process sub requests
     for (int j = 0; j < sub_requests[i]; j++) {
-      parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j;
+      parent_ids[req_index * TreeSearchBatchConfig::MAX_BEAM_WIDTH + j] = j;
       // beam_slots[i].parent_id[j];
-      acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] =
+      acc_probs[req_index * TreeSearchBatchConfig::MAX_BEAM_WIDTH + j] =
           bc->beamRequestsInfo[i].probs[j];
       // std::cout << "probbbb req: " << i << ", sub req probability : "
       //           << bc->beamRequestsInfo[i].probs[j] << ", sub request id " <<
@@ -667,7 +667,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
 
 /*static*/
 void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m,
-                                      BeamSearchBatchConfig const *bc,
+                                      TreeSearchBatchConfig const *bc,
                                       GenericTensorAccessorR const &input,
                                       float *output_ptr,
                                       int *indices_ptr,
@@ -728,9 +728,9 @@ BeamTopKMeta::BeamTopKMeta(FFHandler handler,
   int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
   int max_requests_per_batch = BatchConfig::max_requests_per_batch();
   size_t parent_id_size =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
+      TreeSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
   size_t acc_probs_size =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
+      TreeSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
   size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch;
   size_t request_id_size = max_tokens_per_batch * max_requests_per_batch;
   size_t tokens_per_request_size =
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index d60386f92..55be42fa4 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -967,13 +967,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       }
       case BEAM_SEARCH_MODE: {
         key_cache_size = num_q_heads * kProjSize *
-                         BeamSearchBatchConfig::max_requests_per_batch() *
+                         TreeSearchBatchConfig::max_requests_per_batch() *
                          BatchConfig::max_sequence_length() *
-                         BeamSearchBatchConfig::MAX_BEAM_WIDTH;
+                         TreeSearchBatchConfig::MAX_BEAM_WIDTH;
         value_cache_size = num_q_heads * vProjSize *
-                           BeamSearchBatchConfig::max_requests_per_batch() *
+                           TreeSearchBatchConfig::max_requests_per_batch() *
                            BatchConfig::max_sequence_length() *
-                           BeamSearchBatchConfig::MAX_BEAM_WIDTH;
+                           TreeSearchBatchConfig::MAX_BEAM_WIDTH;
         break;
       }
       default:
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 42933cee2..ec045fd2a 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1350,11 +1350,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case TREE_VERIFY_MODE: {
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
-                         BeamSearchBatchConfig::max_requests_per_batch() *
+                         TreeSearchBatchConfig::max_requests_per_batch() *
                          (BatchConfig::max_sequence_length() +
                           BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
         value_cache_size = num_q_heads * vProjSize *
-                           BeamSearchBatchConfig::max_requests_per_batch() *
+                           TreeSearchBatchConfig::max_requests_per_batch() *
                            (BatchConfig::max_sequence_length() +
                             BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
         break;
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 9c6ed0e0b..dc23f4525 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -736,8 +736,8 @@ void SpecIncMultiHeadSelfAttention::inference_task(
     Runtime *runtime) {
   assert(task->regions.size() == regions.size());
 
-  BeamSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+  TreeSearchBatchConfig const &bc =
+      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
   if (bc.num_tokens == 0) {
     return;
   }
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index b1687d12a..676fad935 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -38,8 +38,8 @@ __global__ void spec_store_kv_cache(
     DT *vCache_ptr,
     BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
-    BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
-    BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
+    TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
+    TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
     int qProjSize,
     int kProjSize,
     int vProjSize,
@@ -139,7 +139,7 @@ __global__ void spec_store_kv_cache(
 
 template <typename DT>
 void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                            BeamSearchBatchConfig const *bc,
+                            TreeSearchBatchConfig const *bc,
                             hipStream_t stream) {
   int num_tokens = bc->num_active_tokens();
   int curr_depth = bc->beamRequestsInfo[0].current_depth;
@@ -164,7 +164,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                        m->vProjSize,
                        num_tokens,
                        BatchConfig::max_sequence_length(),
-                       BeamSearchBatchConfig::MAX_BEAM_WIDTH,
+                       TreeSearchBatchConfig::MAX_BEAM_WIDTH,
                        /*root*/ curr_depth == 0,
                        m->hidden_size);
   }
@@ -189,7 +189,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix,
 
 template <typename DT>
 void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                              BeamSearchBatchConfig const *bc,
+                              TreeSearchBatchConfig const *bc,
                               int shard_id,
                               DT *output_ptr,
                               DT const *bias_ptr,
@@ -458,7 +458,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                      BeamSearchBatchConfig const *bc,
+                      TreeSearchBatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
                       DT const *weight_ptr,
@@ -483,14 +483,14 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
       hipMemcpyAsync(m->beam_token_infos,
                      &(bc->beamTokenInfo),
                      max_tokens_per_batch * bc->MAX_BEAM_WIDTH *
-                         sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo),
+                         sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo),
                      hipMemcpyHostToDevice,
                      stream));
   checkCUDA(hipMemcpyAsync(
       m->beam_request_infos,
       &(bc->beamRequestsInfo),
       bc->max_requests_per_batch() *
-          sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo),
+          sizeof(TreeSearchBatchConfig::BeamSearchPerRequestInfo),
       hipMemcpyHostToDevice,
       stream));
   // phase 1: Implement kernel to compute KQV for input tokens
@@ -517,7 +517,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 /*static*/
 void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     SpecIncMultiHeadSelfAttentionMeta const *m,
-    BeamSearchBatchConfig const *bc,
+    TreeSearchBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
     GenericTensorAccessorR const &weight,
@@ -619,16 +619,16 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
   {
     int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
     size_t beam_tokeninfo_size =
-        max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH;
-    size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch();
+        max_tokens_per_batch * TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+    size_t requestinfo_size = TreeSearchBatchConfig::max_requests_per_batch();
     size_t beam_requestinfo_size =
-        BeamSearchBatchConfig::max_requests_per_batch();
+        TreeSearchBatchConfig::max_requests_per_batch();
     size_t total_size =
         requestinfo_size * sizeof(BatchConfig::PerRequestInfo) +
         beam_tokeninfo_size *
-            sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) +
+            sizeof(TreeSearchBatchConfig::BeamSearchPerTokenInfo) +
         beam_requestinfo_size *
-            sizeof(BeamSearchBatchConfig::
+            sizeof(TreeSearchBatchConfig::
                        BeamSearchPerRequestInfo); // more components will
                                                   // be added here later
 
@@ -637,7 +637,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                              total_size);
     beam_token_infos =
         gpu_mem_allocator
-            .allocate_instance<BeamSearchBatchConfig::BeamSearchPerTokenInfo>(
+            .allocate_instance<TreeSearchBatchConfig::BeamSearchPerTokenInfo>(
                 beam_tokeninfo_size);
     // offset += beam_tokeninfo_size *
     //           sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo);
@@ -647,7 +647,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
     // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo);
     beam_request_infos =
         gpu_mem_allocator
-            .allocate_instance<BeamSearchBatchConfig::BeamSearchPerRequestInfo>(
+            .allocate_instance<TreeSearchBatchConfig::BeamSearchPerRequestInfo>(
                 beam_requestinfo_size);
     // offset += beam_requestinfo_size *
     //           sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo);
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 2d80ed222..a12bc6895 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -49,7 +49,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     int per_head_size,
     int hidden_size,
     BatchConfig::PerRequestInfo *request_infos,
-    BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos,
+    TreeSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos,
     BatchConfig::BitMask *causalMask,
     bool *request_completed) {
 
@@ -315,8 +315,8 @@ __global__ void spec_inc_store_kv_cache(
     DT *vCache_ptr,
     BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
-    BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
-    BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
+    TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
+    TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
     BatchConfig::BitMask *causalMask,
     int qProjSize,
     int kProjSize,
@@ -700,7 +700,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                      BeamSearchBatchConfig const *bc,
+                      TreeSearchBatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
                       DT const *weight_ptr,
@@ -742,7 +742,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 /*static*/
 void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     SpecIncMultiHeadSelfAttentionMeta const *m,
-    BeamSearchBatchConfig const *bc,
+    TreeSearchBatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
     GenericTensorAccessorR const &weight,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index c432208ec..7a573e8c0 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -47,7 +47,7 @@ BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) {
   if (bc->get_mode() == INC_DECODING_MODE) {
     assert(Future(future).get_untyped_size() == sizeof(BatchConfig));
   } else if (bc->get_mode() == BEAM_SEARCH_MODE) {
-    assert(Future(future).get_untyped_size() == sizeof(BeamSearchBatchConfig));
+    assert(Future(future).get_untyped_size() == sizeof(TreeSearchBatchConfig));
   } else if (bc->get_mode() == TREE_VERIFY_MODE) {
     assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig));
   } else {
diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc
index ff7bf1a81..64e12ae21 100644
--- a/src/runtime/beam_search_batch_config.cc
+++ b/src/runtime/beam_search_batch_config.cc
@@ -26,13 +26,13 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig");
 
-BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() {
+TreeSearchBatchConfig::TreeSearchBatchConfig() : BatchConfig() {
   this->beam_width = DEFAULT_BEAM_WIDTH;
   this->target_iterations = DEFAULT_TARGET_ITERATIONS;
   current_iteration = 0;
 }
 
-BeamSearchBatchConfig::BeamSearchBatchConfig(int model_id) : BatchConfig() {
+TreeSearchBatchConfig::TreeSearchBatchConfig(int model_id) : BatchConfig() {
   this->model_id = model_id;
   std::cout << "==================\n"
             << "Register Batch Config with Model " << this->model_id
@@ -40,7 +40,7 @@ BeamSearchBatchConfig::BeamSearchBatchConfig(int model_id) : BatchConfig() {
   current_iteration = 0;
 }
 
-BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width,
+TreeSearchBatchConfig::TreeSearchBatchConfig(size_t beam_width,
                                              size_t target_iterations)
     : BatchConfig() {
   this->beam_width = beam_width;
@@ -48,7 +48,7 @@ BeamSearchBatchConfig::BeamSearchBatchConfig(size_t beam_width,
   current_iteration = 0;
 }
 
-BeamSearchBatchConfig::BeamSearchBatchConfig(BeamSearchBatchConfig const &other,
+TreeSearchBatchConfig::TreeSearchBatchConfig(TreeSearchBatchConfig const &other,
                                              int model_id)
     : BatchConfig() {
   this->beam_width = other.beam_width;
@@ -57,20 +57,20 @@ BeamSearchBatchConfig::BeamSearchBatchConfig(BeamSearchBatchConfig const &other,
   current_iteration = 0;
 }
 
-BeamSearchBatchConfig::~BeamSearchBatchConfig() {}
+TreeSearchBatchConfig::~TreeSearchBatchConfig() {}
 
-InferenceMode BeamSearchBatchConfig::get_mode() const {
+InferenceMode TreeSearchBatchConfig::get_mode() const {
   return BEAM_SEARCH_MODE;
 }
 
-bool BeamSearchBatchConfig::done() const {
+bool TreeSearchBatchConfig::done() const {
   assert(current_iteration <= target_iterations);
   return current_iteration == target_iterations;
 }
 
-int BeamSearchBatchConfig::max_beam_depth_all_requests() const {
+int TreeSearchBatchConfig::max_beam_depth_all_requests() const {
   int max_depth_all_requests = 0;
-  for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) {
+  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
     if (!request_completed[i] &&
         beamRequestsInfo[i].max_depth > max_depth_all_requests) {
       /* printf("\treq %i has max_depth=%i. Increasing max_depth_all_requests "
@@ -81,17 +81,17 @@ int BeamSearchBatchConfig::max_beam_depth_all_requests() const {
       max_depth_all_requests = beamRequestsInfo[i].max_depth;
     }
   }
-  assert(max_depth_all_requests <= BeamSearchBatchConfig::MAX_BEAM_DEPTH);
+  assert(max_depth_all_requests <= TreeSearchBatchConfig::MAX_BEAM_DEPTH);
   return max_depth_all_requests;
 }
 
-int BeamSearchBatchConfig::get_speculative_request_num() const {
+int TreeSearchBatchConfig::get_speculative_request_num() const {
   return speculative_request_num;
 }
 
-int BeamSearchBatchConfig::current_depth_all_requests() const {
+int TreeSearchBatchConfig::current_depth_all_requests() const {
   int current_depth = 0;
-  for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) {
+  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
     if (!request_completed[i] &&
         beamRequestsInfo[i].current_depth > current_depth) {
       /* printf("\treq %i has current_depth=%i. Increasing "
@@ -102,11 +102,11 @@ int BeamSearchBatchConfig::current_depth_all_requests() const {
       current_depth = beamRequestsInfo[i].current_depth;
     }
   }
-  assert(current_depth <= BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1);
+  assert(current_depth <= TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1);
   return current_depth;
 }
 
-std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) {
+std::ostream &operator<<(std::ostream &os, TreeSearchBatchConfig const &bc) {
   os << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << bc.get_mode()
      << ") @@@@@@@@@@@@@@" << std::endl;
   // Max values
@@ -181,11 +181,11 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) {
   return os;
 }
 
-void BeamSearchBatchConfig::print() const {
+void TreeSearchBatchConfig::print() const {
   std::cout << *this << std::endl;
 }
 
-void BeamSearchBatchConfig::save_to_file(std::string const &filename) const {
+void TreeSearchBatchConfig::save_to_file(std::string const &filename) const {
   std::ofstream outputFile(filename);
   if (outputFile.is_open()) {
     outputFile << *this << std::endl;
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 2a94df8b4..9d73ca938 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -312,10 +312,10 @@ FutureMap InferenceManager::inference(FFModel *model,
     return inference(model, index, bcf);
   } else if (bc.get_mode() == BEAM_SEARCH_MODE) {
     BatchConfig const *bc_ptr = &bc;
-    BeamSearchBatchConfig const *bsbc_ptr =
-        static_cast<BeamSearchBatchConfig const *>(bc_ptr);
+    TreeSearchBatchConfig const *bsbc_ptr =
+        static_cast<TreeSearchBatchConfig const *>(bc_ptr);
     BeamSearchBatchConfigFuture bcf =
-        Future::from_value<BeamSearchBatchConfig>(*bsbc_ptr);
+        Future::from_value<TreeSearchBatchConfig>(*bsbc_ptr);
     return inference(model, index, bcf);
   } else if (bc.get_mode() == TREE_VERIFY_MODE) {
     BatchConfig const *bc_ptr = &bc;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 40f758282..7af8acfc0 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -93,10 +93,10 @@ Op::Op(FFModel &model,
        int numWeights,
        bool allocate_weights,
        int numOutputs,
-       const ParallelTensor input1,
-       const ParallelTensor input2,
-       const ParallelTensor input3,
-       const ParallelTensor input4)
+       ParallelTensor const input1,
+       ParallelTensor const input2,
+       ParallelTensor const input3,
+       ParallelTensor const input4)
     : Op(model,
          otype,
          dtype,
@@ -116,10 +116,10 @@ Op::Op(FFModel &model,
        int _numInputs,
        int _numWeights,
        int _numOutputs,
-       const ParallelTensor _input1,
-       const ParallelTensor _input2,
-       const ParallelTensor _input3,
-       const ParallelTensor _input4)
+       ParallelTensor const _input1,
+       ParallelTensor const _input2,
+       ParallelTensor const _input3,
+       ParallelTensor const _input4)
     : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++),
       numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs),
       profiling(model.config.profiling),
@@ -1035,9 +1035,9 @@ void Op::register_output_parallel_dims(
                                      operation);
 }
 
-int Op::get_output_to_input_dim_mapping(const ParallelTensor output,
+int Op::get_output_to_input_dim_mapping(ParallelTensor const output,
                                         int output_dim,
-                                        const ParallelTensor input) {
+                                        ParallelTensor const input) {
   int output_idx = -1, input_idx = -1;
   for (int i = 0; i < numOutputs; i++) {
     if (output == outputs[i]) {
@@ -1070,9 +1070,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output,
   return -1;
 }
 
-int Op::get_output_to_weight_dim_mapping(const ParallelTensor output,
+int Op::get_output_to_weight_dim_mapping(ParallelTensor const output,
                                          int output_dim,
-                                         const ParallelTensor weight) {
+                                         ParallelTensor const weight) {
   int output_idx = -1, weight_idx = -1;
   for (int i = 0; i < numOutputs; i++) {
     if (output == outputs[i]) {
@@ -1706,7 +1706,7 @@ Tensor FFModel::create_tensor(int numdim,
 }
 
 ParallelTensor FFModel::create_parallel_tensor(int numdim,
-                                               const ParallelDim dims[],
+                                               ParallelDim const dims[],
                                                DataType data_type,
                                                Op const *op,
                                                int idx,
@@ -1739,7 +1739,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim,
 
 ParallelTensor
     FFModel::create_parallel_tensor_legion_ordering(int numdim,
-                                                    const ParallelDim dims[],
+                                                    ParallelDim const dims[],
                                                     DataType data_type,
                                                     Op const *op,
                                                     int idx,
@@ -1789,7 +1789,7 @@ Tensor FFModel::create_tensor(int const dims[],
 }
 
 template <int NDIM>
-ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[],
+ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[],
                                                DataType data_type,
                                                Op const *owner_op,
                                                int owner_idx,
@@ -1870,7 +1870,7 @@ Parameter FFModel::create_weight(int numdim,
 }
 
 template <int NDIM>
-ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[],
+ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[],
                                                   DataType data_type,
                                                   Op const *owner_op,
                                                   bool create_grad,
@@ -1901,7 +1901,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[],
 }
 
 ParallelParameter FFModel::create_parallel_weight(int numdim,
-                                                  const ParallelDim dims[],
+                                                  ParallelDim const dims[],
                                                   DataType data_type,
                                                   Op const *owner_op,
                                                   bool create_grad,
@@ -1921,7 +1921,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim,
 
 ParallelParameter FFModel::create_parallel_weight_legion_ordering(
     int numdim,
-    const ParallelDim dims[],
+    ParallelDim const dims[],
     DataType data_type,
     Op const *owner_op,
     bool create_grad,
@@ -2135,7 +2135,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight,
 }
 
 bool FFModel::get_parallel_tensor_from_tensor(
-    const Tensor tensor, ParallelTensor &parallel_tensor) const {
+    Tensor const tensor, ParallelTensor &parallel_tensor) const {
   // check if tensor->parallel_tensor is already set
   if (tensor->parallel_tensor != nullptr) {
     parallel_tensor = tensor->parallel_tensor;
@@ -2172,7 +2172,7 @@ bool FFModel::get_parallel_tensor_from_tensor(
 }
 
 void FFModel::create_disjoint_partition(int num_dims,
-                                        const ParallelDim dims[],
+                                        ParallelDim const dims[],
                                         IndexSpace const &part_is,
                                         LogicalRegion const &region,
                                         LogicalPartition &part) {
@@ -2195,7 +2195,7 @@ void FFModel::create_disjoint_partition(int num_dims,
 
 template <int NDIM, int TDIM>
 void FFModel::create_disjoint_partition_with_dim2(
-    const ParallelDim dims[],
+    ParallelDim const dims[],
     IndexSpaceT<TDIM> const &part_is,
     LogicalRegion const &region,
     LogicalPartition &part) {
@@ -2228,7 +2228,7 @@ void FFModel::create_disjoint_partition_with_dim2(
 }
 
 void FFModel::create_aliased_partition(int num_dims,
-                                       const ParallelDim dims[],
+                                       ParallelDim const dims[],
                                        int aliased_dim,
                                        IndexSpace const &part_is,
                                        LogicalRegion const &region,
@@ -2252,7 +2252,7 @@ void FFModel::create_aliased_partition(int num_dims,
 
 template <int NDIM, int TDIM>
 void FFModel::create_aliased_partition_with_dim2(
-    const ParallelDim dims[],
+    ParallelDim const dims[],
     int aliased_dim,
     IndexSpaceT<TDIM> const &part_is,
     LogicalRegion const &region,
@@ -2289,7 +2289,7 @@ void FFModel::create_aliased_partition_with_dim2(
 }
 
 template <int NDIM>
-void FFModel::create_disjoint_partition(const ParallelTensor tensor,
+void FFModel::create_disjoint_partition(ParallelTensor const tensor,
                                         IndexSpaceT<NDIM> const &part_is,
                                         LogicalPartition &part_fwd,
                                         LogicalPartition &part_bwd) {
@@ -2337,7 +2337,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor,
 
 template <int NDIM, int TDIM>
 void FFModel::create_data_parallel_partition_with_diff_dims(
-    const ParallelTensor tensor,
+    ParallelTensor const tensor,
     IndexSpaceT<TDIM> const &part_is,
     LogicalPartition &part_fwd,
     LogicalPartition &part_bwd) {
@@ -2719,7 +2719,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const {
   return get_task_is(view);
 }
 
-IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) {
+IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) {
   MachineView view;
   view.ndims = 0;
   for (int i = 0; i < tensor->num_dims; i++) {
@@ -3308,7 +3308,7 @@ bool FFModel::is_mlp_block(int layer_idx) const {
 }
 
 void FFModel::create_operators_from_layers() {
-  std::map<const Tensor, ParallelTensor> tensors_to_parallel_tensors;
+  std::map<Tensor const, ParallelTensor> tensors_to_parallel_tensors;
   // for (auto const &l : layers) {
   for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) {
     auto const &l = layers[layer_idx];
@@ -4061,38 +4061,38 @@ void FFIterationConfig::reset() {
 
 // Default Config Parameters
 struct DefaultConfig {
-  const static int epochs = 1;
+  static int const epochs = 1;
   // const static int iterations = 1;
-  const static int batchSize = 64;
-  const static bool profiling = false;
-  const static bool inference_debugging = false;
+  static int const batchSize = 64;
+  static bool const profiling = false;
+  static bool const inference_debugging = false;
   constexpr static float learningRate = 0.01f;
   constexpr static float weightDecay = 0.0001f;
-  const static size_t workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB
-  const static int numNodes = 1;
-  const static int workersPerNode = 0;
-  const static int cpusPerNode = 0;
-  const static size_t searchBudget = -1;
-  const static size_t simulatorWorkSpaceSize =
+  static size_t const workSpaceSize = (size_t)128 * 1024 * 1024; // 128 MB
+  static int const numNodes = 1;
+  static int const workersPerNode = 0;
+  static int const cpusPerNode = 0;
+  static size_t const searchBudget = -1;
+  static size_t const simulatorWorkSpaceSize =
       (size_t)2 * 1024 * 1024 * 1024; // 2 GB
   constexpr static float searchAlpha = 1.2f;
-  const static bool searchOverlapBackwardUpdate = false;
-  const static size_t offloadReserveSpaceSize =
+  static bool const searchOverlapBackwardUpdate = false;
+  static size_t const offloadReserveSpaceSize =
       (size_t)8 * 1024 * 1024 * 1024; // 8 GB
-  const static bool cpuOffload = false;
-  const static bool onlyDataParallel = true;
-  const static bool enableSampleParallel = true;
-  const static bool enableParameterParallel = false;
-  const static bool enableAttributeParallel = false;
-  const static bool enableInplaceOptimizations = false;
-  const static bool allowTensorOpMathConversion = false;
-  const static int machine_model_version = 0;
-  const static int simulator_segment_size = 16777216; // 16 MB
-  const static int simulator_max_num_segments = 1;
-  const static int base_optimize_threshold = 10;
-  const static bool enable_control_replication = true;
+  static bool const cpuOffload = false;
+  static bool const onlyDataParallel = true;
+  static bool const enableSampleParallel = true;
+  static bool const enableParameterParallel = false;
+  static bool const enableAttributeParallel = false;
+  static bool const enableInplaceOptimizations = false;
+  static bool const allowTensorOpMathConversion = false;
+  static int const machine_model_version = 0;
+  static int const simulator_segment_size = 16777216; // 16 MB
+  static int const simulator_max_num_segments = 1;
+  static int const base_optimize_threshold = 10;
+  static bool const enable_control_replication = true;
   // The default python data loader type is 2 to enable control replication
-  const static int python_data_loader_type = 2;
+  static int const python_data_loader_type = 2;
 };
 
 FFConfig::FFConfig() {
@@ -4474,7 +4474,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant<
-          BeamSearchBatchConfig,
+          TreeSearchBatchConfig,
           RequestManager::prepare_next_batch_beam_task>(
           registrar, "RequestManager Prepare Next Batch (Beam) Task");
     } else {
@@ -4482,7 +4482,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
         registrar.global_registration = false;
       }
       runtime
-          ->register_task_variant<BeamSearchBatchConfig,
+          ->register_task_variant<TreeSearchBatchConfig,
                                   RequestManager::prepare_next_batch_beam_task>(
               registrar);
     }
@@ -4496,7 +4496,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant<
-          BeamSearchBatchConfig,
+          TreeSearchBatchConfig,
           RequestManager::prepare_next_batch_init_task>(
           registrar, "RequestManager Prepare Next Batch (Init Beam) Task");
     } else {
@@ -4504,7 +4504,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
         registrar.global_registration = false;
       }
       runtime
-          ->register_task_variant<BeamSearchBatchConfig,
+          ->register_task_variant<TreeSearchBatchConfig,
                                   RequestManager::prepare_next_batch_init_task>(
               registrar);
     }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1f1f93183..909e52d5e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -97,7 +97,7 @@ int RequestManager::get_max_sequence_length() {
 }
 
 void RequestManager::push_spec_infer_tree_width(int tree_width) {
-  assert(tree_width <= BeamSearchBatchConfig::MAX_BEAM_WIDTH);
+  assert(tree_width <= TreeSearchBatchConfig::MAX_BEAM_WIDTH);
   spec_infer_tree_width.emplace_back(tree_width);
 }
 
@@ -171,7 +171,7 @@ size_t RequestManager::get_num_ssms() {
 RequestManager::RequestGuid
     RequestManager::register_new_request(std::vector<TokenId> const &prompt,
                                          int max_sequence_length) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   // Add a new request
   Request request;
@@ -206,7 +206,7 @@ RequestManager::RequestGuid
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
   }
 
@@ -231,7 +231,7 @@ RequestManager::RequestGuid
 RequestManager::RequestGuid
     RequestManager::register_new_request(std::string const &prompt,
                                          int max_sequence_length) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   // Add a new request
   Request request;
   request.status = Request::PENDING;
@@ -270,7 +270,7 @@ RequestManager::RequestGuid
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
   }
 
@@ -294,7 +294,7 @@ RequestManager::RequestGuid
 }
 
 bool RequestManager::is_request_completed(RequestGuid const &guid) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   assert(all_requests.find(guid) != all_requests.end());
   Request const &request = all_requests[guid];
   // return request.tokens.size() >= request.max_sequence_length;
@@ -306,7 +306,7 @@ GenerationResult
   // First get the future of the request
   std::future<void> future;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     assert(request_to_promise.find(guid) != request_to_promise.end());
     future = request_to_promise[guid]->get_future();
   }
@@ -314,7 +314,7 @@ GenerationResult
   future.get();
   // Get the generation result
   {
-    const std::lock_guard<std::mutex> lock(request_queue_mutex);
+    std::lock_guard<std::mutex> const lock(request_queue_mutex);
     assert(request_generation_results.find(guid) !=
            request_generation_results.end());
     return request_generation_results[guid];
@@ -352,7 +352,7 @@ BatchConfig RequestManager::prepare_next_batch_task(
 
 BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
                                                InferenceResult const &result) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   // Step 1: append result from previous iteration to request's tokens
   for (int i = 0; i < old_bc.num_tokens; i++) {
@@ -560,7 +560,7 @@ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init(
   return runtime->execute_task(ctx, launcher);
 }
 
-BeamSearchBatchConfig RequestManager::prepare_next_batch_init_task(
+TreeSearchBatchConfig RequestManager::prepare_next_batch_init_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -574,17 +574,17 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_init_task(
   return rm->prepare_next_batch_init(bc, result, model_id);
 }
 
-BeamSearchBatchConfig
+TreeSearchBatchConfig
     RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                                             InferenceResult const &result,
                                             int model_id) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
   }
 
   // Step 1: use result to update requests
-  BeamSearchBatchConfig new_bc;
+  TreeSearchBatchConfig new_bc;
   new_bc.num_tokens = 0;
   new_bc.model_id = model_id;
   int result_index = 0;
@@ -768,9 +768,9 @@ BeamSearchBatchConfig
                 ? spec_infer_tree_width[ssm_decoding_steps]
                 : 1;
         new_bc.beamRequestsInfo[i].max_depth =
-            std::min(new_max_depth, BeamSearchBatchConfig::MAX_BEAM_DEPTH);
+            std::min(new_max_depth, TreeSearchBatchConfig::MAX_BEAM_DEPTH);
         for (int j = 0;
-             j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+             j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
              j++) {
           new_bc.beamRequestsInfo[i].parent_id[j] = 0;
           new_bc.beamRequestsInfo[i].probs[j] = 1;
@@ -844,7 +844,7 @@ BeamSearchBatchConfig
               ? spec_infer_tree_width[ssm_decoding_steps]
               : 1;
       new_bc.beamRequestsInfo[i].max_depth = 0;
-      for (int j = 0; j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      for (int j = 0; j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
            j++) {
         new_bc.beamRequestsInfo[i].parent_id[j] = 0;
         new_bc.beamRequestsInfo[i].probs[j] = 1;
@@ -869,7 +869,7 @@ BeamSearchBatchConfig
   }
 
   // Step 2: Initialize new request
-  for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) {
+  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
     if (new_bc.request_completed[i]) {
       if (!pending_request_queue.empty() &&
           new_bc.num_tokens < get_max_tokens_per_batch()) {
@@ -902,11 +902,11 @@ BeamSearchBatchConfig
                 : 1;
         new_bc.beamRequestsInfo[i].current_depth = 1;
         new_bc.beamRequestsInfo[i].max_depth =
-            std::min(BeamSearchBatchConfig::MAX_BEAM_DEPTH,
+            std::min(TreeSearchBatchConfig::MAX_BEAM_DEPTH,
                      get_max_tokens_per_batch() -
                          new_bc.requestsInfo[i].num_tokens_in_batch - 1);
         for (int j = 0;
-             j < BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+             j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
              j++) {
           new_bc.beamRequestsInfo[i].parent_id[j] = 0;
           new_bc.beamRequestsInfo[i].probs[j] = 1;
@@ -996,30 +996,29 @@ BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
   return runtime->execute_task(ctx, launcher);
 }
 
-BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
+TreeSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
     Runtime *runtime) {
   RequestManager *rm = *((RequestManager **)task->args);
-  BeamSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+  TreeSearchBatchConfig const &bc =
+      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
   BeamInferenceResult const &result =
       Future(task->futures[1]).get_result<BeamInferenceResult>();
   return rm->prepare_next_batch_beam(bc, result);
 }
 
 // update beam search metadata
-BeamSearchBatchConfig
-    RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
+TreeSearchBatchConfig
+    RequestManager::prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
                                             BeamInferenceResult const &result) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_beam ###############\n";
   }
   if (verbose) {
-    std::cout << "print all results"
-              << "\n";
+    std::cout << "print all results" << "\n";
     for (int i = 0; i < 40; i++) {
       std::cout << result.token_ids[i] << ", ";
     }
@@ -1032,7 +1031,7 @@ BeamSearchBatchConfig
   store_beam_metadata(old_bc, result);
 
   // Step 2: preparing the next batch for existing requests
-  BeamSearchBatchConfig new_bc;
+  TreeSearchBatchConfig new_bc;
   new_bc.model_id = old_bc.model_id;
   // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n";
   int num_generation_tokens = 0;
@@ -1087,7 +1086,7 @@ BeamSearchBatchConfig
           old_bc.beamRequestsInfo[i].beam_size;
 
       assert(new_bc.beamRequestsInfo[i].sub_request_num <=
-                 BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
+                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
              "exceed maximum nodes per layer");
 
       if (request.status == Request::RUNNING) {
@@ -1213,7 +1212,7 @@ BeamSearchBatchConfig
           old_bc.beamRequestsInfo[i].sub_request_num;
 
       assert(new_bc.beamRequestsInfo[i].sub_request_num <=
-                 BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
+                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
              "exceed maximum nodes per layer");
 
       // update the parentid, accumalated_probs, depth, and token_ids
@@ -1318,16 +1317,16 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task(
     Context ctx,
     Runtime *runtime) {
   RequestManager *rm = *((RequestManager **)task->args);
-  std::vector<BeamSearchBatchConfig> old_batches;
+  std::vector<TreeSearchBatchConfig> old_batches;
   for (auto const &bcf : task->futures) {
-    old_batches.push_back(Future(bcf).get_result<BeamSearchBatchConfig>());
+    old_batches.push_back(Future(bcf).get_result<TreeSearchBatchConfig>());
   }
   return rm->prepare_next_batch_verify(old_batches);
 }
 
 TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
-    std::vector<BeamSearchBatchConfig> const &old_batches) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+    std::vector<TreeSearchBatchConfig> const &old_batches) {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   if (verbose) {
     std::cout
@@ -1345,7 +1344,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
     if (old_batches.at(0).request_completed[i]) {
       continue;
     } else if (old_batches.at(0).request_running[i]) {
-      max_prompt_load_size -= (BeamSearchBatchConfig::MAX_BEAM_DEPTH + 1);
+      max_prompt_load_size -= (TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1);
     } else {
       max_prompt_load_size -= 1;
     }
@@ -1618,7 +1617,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
   return new_bc;
 }
 
-void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc,
+void RequestManager::store_beam_metadata(TreeSearchBatchConfig const &old_bc,
                                          BeamInferenceResult const &result) {
   // step1 store the outputs
   if (old_bc.num_tokens <= 0) {
@@ -1677,8 +1676,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc,
       if (depth == 1) {
         // store the last input into the tree;
         if (verbose) {
-          std::cout << "try to store the input"
-                    << "\n";
+          std::cout << "try to store the input" << "\n";
         }
 
         request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] =
@@ -1729,8 +1727,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc,
 }
 
 // for updating the beam search metadata in requests in incremental phase
-void RequestManager::update_beam_metadata(BeamSearchBatchConfig &new_bc,
-                                          BeamSearchBatchConfig const &old_bc,
+void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
+                                          TreeSearchBatchConfig const &old_bc,
                                           BeamTree &tree,
                                           int request_index) {
 
@@ -1926,7 +1924,7 @@ bool PreOrder(
     int current_depth,
     int beam_width,
     int id,
-    std::vector<std::pair<BeamSearchBatchConfig::TokenId, int>> &serializedTree,
+    std::vector<std::pair<TreeSearchBatchConfig::TokenId, int>> &serializedTree,
     bool verbose) {
   // terminate
   if (current_depth >= max_depth) {
@@ -1935,8 +1933,7 @@ bool PreOrder(
     if (verbose) {
       std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id]
                 << "\n";
-      std::cout << "return true"
-                << "\n";
+      std::cout << "return true" << "\n";
     }
     return true;
   }
@@ -1991,7 +1988,7 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
             &inputSerializedTree,
         std::vector<std::pair<BatchConfig::TokenId, int>> const
             &outputSerializedTree) {
-  std::vector<std::pair<BeamSearchBatchConfig::TokenId, int>> verifiedTree;
+  std::vector<std::pair<TreeSearchBatchConfig::TokenId, int>> verifiedTree;
   // verifiedTree.push_back(inputSerializedTree.at(0));
   std::vector<std::pair<int, int>> new_committed_tokens =
       std::vector<std::pair<int, int>>();
@@ -2145,7 +2142,7 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 }
 
 std::vector<std::pair<BatchConfig::TokenId, int>>
-    RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc,
+    RequestManager::traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
                                        int request_index,
                                        int first_token_depth_in_request) {
   if (verbose) {
@@ -2477,7 +2474,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     runtime->begin_trace(ctx, 12345 /*trace_id*/);
 
     for (size_t i = 0; i < get_num_ssms(); i++) {
-      for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      for (int depth = 0; depth < TreeSearchBatchConfig::MAX_BEAM_DEPTH;
            depth++) {
         beam_bcf = beam_bcf_vec[i];
 
@@ -2570,7 +2567,7 @@ void RequestManager::serve_spec_infer_v2(FFModel *llm) {
     runtime->begin_trace(ctx, 12345 /*trace_id*/);
 
     for (size_t i = 0; i < get_num_ssms(); i++) {
-      for (int depth = 0; depth < BeamSearchBatchConfig::MAX_BEAM_DEPTH;
+      for (int depth = 0; depth < TreeSearchBatchConfig::MAX_BEAM_DEPTH;
            depth++) {
         beam_bcf = beam_bcf_vec[i];
 
@@ -2598,7 +2595,7 @@ void RequestManager::serve_spec_infer_v2(FFModel *llm) {
 
 void RequestManager::trigger_request_completion_future(
     RequestGuid const &guid) {
-  const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+  std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
   assert(request_to_promise.find(guid) != request_to_promise.end());
   // Set the completion promise in case other threads are waiting
   request_to_promise[guid]->set_value();
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index fadbf80d6..d4c9f89a3 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -91,25 +91,25 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == BEAM_SEARCH_MODE) {
-    BeamSearchBatchConfig const *beam_batch_config =
-        static_cast<BeamSearchBatchConfig const *>(batch_config);
+    TreeSearchBatchConfig const *beam_batch_config =
+        static_cast<TreeSearchBatchConfig const *>(batch_config);
 
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
                              &(beam_batch_config->beamTokenInfo),
-                             sizeof(BeamSearchBatchConfig::beamTokenInfo),
+                             sizeof(TreeSearchBatchConfig::beamTokenInfo),
                              hipMemcpyHostToDevice,
                              stream));
 
-    total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo);
+    total_copy_size += sizeof(TreeSearchBatchConfig::beamTokenInfo);
 
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
                              &(beam_batch_config->beamRequestsInfo),
-                             sizeof(BeamSearchBatchConfig::beamRequestsInfo),
+                             sizeof(TreeSearchBatchConfig::beamRequestsInfo),
                              hipMemcpyHostToDevice,
                              stream));
-    total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo);
+    total_copy_size += sizeof(TreeSearchBatchConfig::beamRequestsInfo);
 
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 8380d6be7..54e389a6b 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -98,25 +98,25 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == BEAM_SEARCH_MODE) {
-    BeamSearchBatchConfig const *beam_batch_config =
-        static_cast<BeamSearchBatchConfig const *>(batch_config);
+    TreeSearchBatchConfig const *beam_batch_config =
+        static_cast<TreeSearchBatchConfig const *>(batch_config);
 
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
         &(beam_batch_config->beamTokenInfo),
-        sizeof(BeamSearchBatchConfig::beamTokenInfo),
+        sizeof(TreeSearchBatchConfig::beamTokenInfo),
         cudaMemcpyHostToDevice,
         stream));
 
-    total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo);
+    total_copy_size += sizeof(TreeSearchBatchConfig::beamTokenInfo);
 
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
         &(beam_batch_config->beamRequestsInfo),
-        sizeof(BeamSearchBatchConfig::beamRequestsInfo),
+        sizeof(TreeSearchBatchConfig::beamRequestsInfo),
         cudaMemcpyHostToDevice,
         stream));
-    total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo);
+    total_copy_size += sizeof(TreeSearchBatchConfig::beamRequestsInfo);
 
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,

From 446b293336b909a14b3b2a78df4e82512d386c63 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 24 Mar 2024 22:04:26 -0400
Subject: [PATCH 006/667] Modified some methods of TreeSearchBatchConfig to
 comply with the new data structure.

---
 include/flexflow/batch_config.h         |   7 +-
 include/flexflow/ffconst.h              |   2 +-
 include/flexflow/request_manager.h      |   4 +-
 src/runtime/beam_search_batch_config.cc | 166 ++++++++----------------
 4 files changed, 59 insertions(+), 120 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index b9996aba4..31b8db646 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -145,18 +145,19 @@ class TreeSearchBatchConfig : public BatchConfig {
                                   TreeSearchBatchConfig const &bc);
   void print() const;
   void save_to_file(std::string const &filename) const;
-  int current_depth_all_requests() const;
+  int current_depth() const;
   int get_speculative_request_num() const;
 
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+  inline static int const MAX_TREE_DEPTH = 16;
 
   // how many requests is in speculative phase
   int speculative_request_num = 0;
+  int current_depth = 0;
   int model_id;
 
   struct TreeSearchPerRequestInfo {
-    int current_depth = -1;
-    int num_tokens_in_layer;
+    int num_tokens_in_layer = 0;
   };
 
   TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 512645e62..296f80311 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -68,7 +68,7 @@ enum MetricsType {
 
 enum InferenceMode {
   INC_DECODING_MODE = 2001,
-  BEAM_SEARCH_MODE = 2002,
+  TREE_SEARCH_MODE = 2002,
   TREE_VERIFY_MODE = 2003,
 };
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b583d7e60..647783e07 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -191,7 +191,7 @@ class RequestManager {
                                        InferenceResultFuture const &result,
                                        Legion::Context ctx,
                                        Legion::Runtime *runtime);
-  /* The APIs that need to be changed. */
+  /* Old APIs for reference */
   TreeSearchBatchConfig
       prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
                               BeamInferenceResult const &result);
@@ -229,7 +229,7 @@ class RequestManager {
       traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
-  /* The APIs that need to be changed. */
+  /* Old APIs for reference */
 
   /* New APIs */
   TreeSearchBatchConfig
diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc
index 64e12ae21..0011060a4 100644
--- a/src/runtime/beam_search_batch_config.cc
+++ b/src/runtime/beam_search_batch_config.cc
@@ -24,157 +24,95 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig");
+LegionRuntime::Logger::Category log_tree_bc("TreeSearchBatchConfig");
 
-TreeSearchBatchConfig::TreeSearchBatchConfig() : BatchConfig() {
-  this->beam_width = DEFAULT_BEAM_WIDTH;
-  this->target_iterations = DEFAULT_TARGET_ITERATIONS;
-  current_iteration = 0;
-}
+TreeSearchBatchConfig::TreeSearchBatchConfig() : BatchConfig() {}
 
-TreeSearchBatchConfig::TreeSearchBatchConfig(int model_id) : BatchConfig() {
-  this->model_id = model_id;
+TreeSearchBatchConfig::TreeSearchBatchConfig(int model_id)
+    : BatchConfig(), model_id(model_id) {
   std::cout << "==================\n"
             << "Register Batch Config with Model " << this->model_id
             << std::endl;
-  current_iteration = 0;
-}
-
-TreeSearchBatchConfig::TreeSearchBatchConfig(size_t beam_width,
-                                             size_t target_iterations)
-    : BatchConfig() {
-  this->beam_width = beam_width;
-  this->target_iterations = target_iterations;
-  current_iteration = 0;
 }
 
+/* Why do we need this? */
 TreeSearchBatchConfig::TreeSearchBatchConfig(TreeSearchBatchConfig const &other,
                                              int model_id)
-    : BatchConfig() {
-  this->beam_width = other.beam_width;
-  this->target_iterations = other.target_iterations;
-  this->model_id = model_id;
-  current_iteration = 0;
-}
+    : BatchConfig(), model_id(model_id) {}
 
 TreeSearchBatchConfig::~TreeSearchBatchConfig() {}
 
 InferenceMode TreeSearchBatchConfig::get_mode() const {
-  return BEAM_SEARCH_MODE;
-}
-
-bool TreeSearchBatchConfig::done() const {
-  assert(current_iteration <= target_iterations);
-  return current_iteration == target_iterations;
-}
-
-int TreeSearchBatchConfig::max_beam_depth_all_requests() const {
-  int max_depth_all_requests = 0;
-  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
-    if (!request_completed[i] &&
-        beamRequestsInfo[i].max_depth > max_depth_all_requests) {
-      /* printf("\treq %i has max_depth=%i. Increasing max_depth_all_requests "
-             "from %i\n",
-             i,
-             beamRequestsInfo[i].max_depth,
-             max_depth_all_requests); */
-      max_depth_all_requests = beamRequestsInfo[i].max_depth;
-    }
-  }
-  assert(max_depth_all_requests <= TreeSearchBatchConfig::MAX_BEAM_DEPTH);
-  return max_depth_all_requests;
+  return TREE_SEARCH_MODE;
 }
 
 int TreeSearchBatchConfig::get_speculative_request_num() const {
   return speculative_request_num;
 }
 
-int TreeSearchBatchConfig::current_depth_all_requests() const {
-  int current_depth = 0;
-  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
-    if (!request_completed[i] &&
-        beamRequestsInfo[i].current_depth > current_depth) {
-      /* printf("\treq %i has current_depth=%i. Increasing "
-             "current_depth_all_requests from %i\n",
-             i,
-             beamRequestsInfo[i].current_depth,
-             current_depth); */
-      current_depth = beamRequestsInfo[i].current_depth;
-    }
-  }
-  assert(current_depth <= TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1);
-  return current_depth;
-}
-
-std::ostream &operator<<(std::ostream &os, TreeSearchBatchConfig const &bc) {
-  os << "@@@@@@@@@@@@@@ BeamSearchBatchConfig (mode " << bc.get_mode()
-     << ") @@@@@@@@@@@@@@" << std::endl;
+std::ostream &
+    operator<<(std::ostream &os,
+               TreeSearchBatchConfig const &tree_search_batch_config) {
+  os << "@@@@@@@@@@@@@@ TreeSearchBatchConfig (mode "
+     << tree_search_batch_config.get_mode() << ") @@@@@@@@@@@@@@" << std::endl;
   // Max values
-  os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl;
-  os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl;
-  os << "Max sequence length: " << bc.max_sequence_length() << std::endl;
+  os << "Max number of requests: "
+     << tree_search_batch_config.max_requests_per_batch() << std::endl;
+  os << "Max number of tokens: "
+     << tree_search_batch_config.max_tokens_per_batch() << std::endl;
+  os << "Max sequence length: "
+     << tree_search_batch_config.max_sequence_length() << std::endl;
   // Current values
-  os << "Number of tokens: " << bc.num_active_tokens() << std::endl;
-  os << "Number of requests: " << bc.num_active_requests() << std::endl;
-  // BeamSearch-specific
-  os << "Model ID: " << bc.model_id << std::endl;
-  os << "Max Beam Depth (all requests): " << bc.max_beam_depth_all_requests()
+  os << "Number of tokens: " << tree_search_batch_config.num_active_tokens()
      << std::endl;
-  os << "Current depth (all requests): " << bc.current_depth_all_requests()
+  os << "Number of requests: " << tree_search_batch_config.num_active_requests()
+     << std::endl;
+  // Tree Search-specific
+  os << "Model ID: " << tree_search_batch_config.model_id << std::endl;
+  os << "Max tree depth: " << TreeSearchBatchConfig::MAX_TREE_DEPTH
+     << std::endl;
+  os << "Max num branch: "
+     << TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES << std::endl;
+  os << "Current depth: " << tree_search_batch_config.current_depth
      << std::endl;
-  os << "Beam width: " << bc.beam_width << std::endl;
-  os << "Target Iterations: " << bc.target_iterations << std::endl;
-  os << "Current Iterations: " << bc.current_iteration << std::endl;
 
   os << "Per-request info:\n";
-  for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-    if (!bc.request_completed[i]) {
+  for (int i = 0; i < tree_search_batch_config.max_requests_per_batch(); i++) {
+    if (!tree_search_batch_config.request_completed[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
-         << bc.requestsInfo[i].first_token_depth_in_request << std::endl;
+         << tree_search_batch_config.requestsInfo[i]
+                .first_token_depth_in_request
+         << std::endl;
       os << "    First token offset in batch: "
-         << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
-      os << "    Number of tokens in batch: "
-         << bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-      os << "    GUID: " << bc.requestsInfo[i].request_guid << std::endl;
-      os << "    Max sequence length: "
-         << bc.requestsInfo[i].max_sequence_length << std::endl;
-      os << "    Request completed: " << bc.request_completed[i] << std::endl;
-      os << "    Request running: " << bc.request_running[i] << std::endl;
-      os << "    Beam Search Specific: " << std::endl;
-      os << "        beam_size: " << bc.beamRequestsInfo[i].beam_size
+         << tree_search_batch_config.requestsInfo[i].first_token_offset_in_batch
          << std::endl;
-      os << "        current_depth: " << bc.beamRequestsInfo[i].current_depth
+      os << "    Number of tokens in batch: "
+         << tree_search_batch_config.requestsInfo[i].num_tokens_in_batch
          << std::endl;
-      os << "        max_depth: " << bc.beamRequestsInfo[i].max_depth
+      os << "    GUID: "
+         << tree_search_batch_config.requestsInfo[i].request_guid << std::endl;
+      os << "    Max sequence length: "
+         << tree_search_batch_config.requestsInfo[i].max_sequence_length
          << std::endl;
-      os << "        tokens: ";
-      for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) {
-        os << bc.beamRequestsInfo[i].tokens[j] << ", ";
-      }
-      os << std::endl;
-      os << "        probs: ";
-      for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) {
-        os << bc.beamRequestsInfo[i].probs[j] << ", ";
-      }
-      os << std::endl;
-      os << "        parent_id: ";
-      for (int j = 0; j < bc.MAX_BEAM_WIDTH; j++) {
-        os << bc.beamRequestsInfo[i].parent_id[j] << ", ";
-      }
-      os << std::endl;
+      os << "    Request completed: "
+         << tree_search_batch_config.request_completed[i] << std::endl;
+      os << "    Request running: "
+         << tree_search_batch_config.request_running[i] << std::endl;
+      os << "    Tree Search Specific: " << std::endl;
+      os << "        Number of tokens in the current batch: " os << std::endl;
     }
   }
 
   os << "Per-token info:\n";
-  for (int i = 0; i < bc.num_tokens; i++) {
+  for (int i = 0; i < tree_search_batch_config.num_tokens; i++) {
     os << "  Token " << i << ":\n";
     os << "    Absolute depth in request: "
-       << bc.tokensInfo[i].abs_depth_in_request << std::endl;
-    os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
-    os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
-    os << "    Beam Search Specific: " << std::endl;
-    os << "        beam_size: " << bc.beamTokenInfo[i].sub_request_index
+       << tree_search_batch_config.tokensInfo[i].abs_depth_in_request
+       << std::endl;
+    os << "    Request index: "
+       << tree_search_batch_config.tokensInfo[i].request_index << std::endl;
+    os << "    Token id: " << tree_search_batch_config.tokensInfo[i].token_id
        << std::endl;
   }
   os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl;

From 146467bc68cae3aa4204efa0faa65c2b6cd82baa Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 24 Mar 2024 22:05:44 -0400
Subject: [PATCH 007/667] Rename file.

---
 .../{beam_search_batch_config.cc => tree_search_batch_config.cc}  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/runtime/{beam_search_batch_config.cc => tree_search_batch_config.cc} (100%)

diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
similarity index 100%
rename from src/runtime/beam_search_batch_config.cc
rename to src/runtime/tree_search_batch_config.cc

From 03173ad1709169f5119c2bc0843e97a8cbe5e4a3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 25 Mar 2024 13:27:39 -0400
Subject: [PATCH 008/667] Add some descriptions about the APIs we are going to
 implement

---
 include/flexflow/request_manager.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 647783e07..a621c811e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -232,38 +232,52 @@ class RequestManager {
   /* Old APIs for reference */
 
   /* New APIs */
+  // Given the last speculation result, prepare the next speculation batch.
   TreeSearchBatchConfig
       prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
                               BeamInferenceResult const &result);
+  // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
                               BeamInferenceResultFuture const &result,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
+  // Given the verification result, prepare the first speculation batch.
   TreeSearchBatchConfig
       prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                               InferenceResult const &result,
                               int model_id);
+  // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
                               InferenceResultFuture const &result,
                               int model_id,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
+  // Given the speculation result, prepare the verification batch.
   TreeSearchBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
+  // A wrapper function.
   TreeVerifyBatchConfigFuture prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
+  // This function takes the small model inference results and the last
+  // speculation batch config and use the information to update the token tree
+  // stored in RequestManager::all_requests.
   void store_spec_metadata(TreeSearchBatchConfig const &old_bc,
                            BeamInferenceResult const &result);
+  // Put the last layer of the token tree stored in RequestManager::all_requests
+  // into new_bc::beamRequestsInfo .
   void update_spec_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             Token &tree,
                             int request_index);
 
+  // This function takes the tree stored in the token trees in
+  // RequestManager::all_requests, and convert them into serialized version.
+  // Called by prepare_next_batch_verify().
   std::vector<std::pair<BatchConfig::TokenId, int>>
       traverse_spec_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
@@ -310,19 +324,21 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  /* APIs to modify */
+  /* Old APIs for reference */
+  // A wrapper function.
   static TreeSearchBatchConfig prepare_next_batch_beam_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
+  // A wrapper function.
   static TreeSearchBatchConfig prepare_next_batch_init_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
-  /* APIs to modify */
+  /* Old APIs for reference */
 
   /* New APIs */
   static TreeSearchBatchConfig prepare_next_batch_spec_task(

From 366b7b9bcce65ea9184da0993eadb8c81ba1bf44 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 25 Mar 2024 13:33:04 -0400
Subject: [PATCH 009/667] Rename BeamInferenceResult to SsmInferenceResult

---
 include/flexflow/batch_config.h    |  4 ++--
 include/flexflow/ops/arg_topk.h    |  2 +-
 include/flexflow/ops/argmax.h      |  6 +++---
 include/flexflow/ops/beam_topk.h   |  2 +-
 include/flexflow/request_manager.h |  8 ++++----
 src/ops/arg_topk.cc                |  6 +++---
 src/ops/argmax.cc                  | 14 +++++++-------
 src/ops/beam_topk.cc               |  6 +++---
 src/runtime/model.cc               | 15 ++++++++-------
 src/runtime/request_manager.cc     |  8 ++++----
 10 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 31b8db646..fb81d4f8d 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -28,7 +28,7 @@
 namespace FlexFlow {
 
 class InferenceResult;
-class BeamInferenceResult;
+class SsmInferenceResult;
 
 using BatchConfigFuture = Legion::Future;
 using InferenceResultFuture = Legion::Future;
@@ -163,7 +163,7 @@ class TreeSearchBatchConfig : public BatchConfig {
   TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
 };
 
-struct BeamInferenceResult {
+struct SsmInferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId
       token_ids[MAX_NUM_TOKENS *
diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 2b8f858ec..e0f328c59 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -64,7 +64,7 @@ class ArgTopK : public Op {
                      std::vector<Legion::PhysicalRegion> const &regions,
                      Legion::Context ctx,
                      Legion::Runtime *runtime);
-  static BeamInferenceResult inference_speculative_task(
+  static SsmInferenceResult inference_speculative_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h
index 298059e3e..d0c549136 100644
--- a/include/flexflow/ops/argmax.h
+++ b/include/flexflow/ops/argmax.h
@@ -34,10 +34,10 @@ class ArgMax : public Op {
   using Params = ArgMaxParams;
   using Input = ParallelTensor;
   ArgMax(FFModel &model,
-         const ParallelTensor input,
+         ParallelTensor const input,
          bool beam_search,
          char const *name);
-  ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input);
+  ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input);
   ArgMax(FFModel &model,
          Params const &params,
          Input const input,
@@ -66,7 +66,7 @@ class ArgMax : public Op {
                            std::vector<Legion::PhysicalRegion> const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
-  static BeamInferenceResult
+  static SsmInferenceResult
       inference_task_beam(Legion::Task const *task,
                           std::vector<Legion::PhysicalRegion> const &regions,
                           Legion::Context ctx,
diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h
index 5427ccd0d..32b103b5c 100644
--- a/include/flexflow/ops/beam_topk.h
+++ b/include/flexflow/ops/beam_topk.h
@@ -64,7 +64,7 @@ class BeamTopK : public Op {
                            std::vector<Legion::PhysicalRegion> const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
-  static BeamInferenceResult
+  static SsmInferenceResult
       inference_task(Legion::Task const *task,
                      std::vector<Legion::PhysicalRegion> const &regions,
                      Legion::Context ctx,
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a621c811e..f62fa44f0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -194,7 +194,7 @@ class RequestManager {
   /* Old APIs for reference */
   TreeSearchBatchConfig
       prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
-                              BeamInferenceResult const &result);
+                              SsmInferenceResult const &result);
   BeamSearchBatchConfigFuture
       prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
                               BeamInferenceResultFuture const &result,
@@ -219,7 +219,7 @@ class RequestManager {
       Legion::Runtime *runtime);
 
   void store_beam_metadata(TreeSearchBatchConfig const &old_bc,
-                           BeamInferenceResult const &result);
+                           SsmInferenceResult const &result);
   void update_beam_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
@@ -235,7 +235,7 @@ class RequestManager {
   // Given the last speculation result, prepare the next speculation batch.
   TreeSearchBatchConfig
       prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
-                              BeamInferenceResult const &result);
+                              SsmInferenceResult const &result);
   // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
@@ -267,7 +267,7 @@ class RequestManager {
   // speculation batch config and use the information to update the token tree
   // stored in RequestManager::all_requests.
   void store_spec_metadata(TreeSearchBatchConfig const &old_bc,
-                           BeamInferenceResult const &result);
+                           SsmInferenceResult const &result);
   // Put the last layer of the token tree stored in RequestManager::all_requests
   // into new_bc::beamRequestsInfo .
   void update_spec_metadata(TreeSearchBatchConfig &new_bc,
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index a18bd404d..614cc476d 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -404,7 +404,7 @@ InferenceResult
   return ir;
 }
 
-BeamInferenceResult ArgTopK::inference_speculative_task(
+SsmInferenceResult ArgTopK::inference_speculative_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -415,7 +415,7 @@ BeamInferenceResult ArgTopK::inference_speculative_task(
       Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
   if (bc.num_active_tokens() == 0) {
     // Directly return for empty batch config
-    BeamInferenceResult ir;
+    SsmInferenceResult ir;
     return ir;
   }
   ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
@@ -430,7 +430,7 @@ BeamInferenceResult ArgTopK::inference_speculative_task(
   int batch_size = bc.num_active_tokens();
   ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
 
-  BeamInferenceResult ir;
+  SsmInferenceResult ir;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
   download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index a52ce1886..2ead6a1e3 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -44,7 +44,7 @@ using Legion::TaskArgument;
 using Legion::TaskLauncher;
 using PCG::Node;
 
-Tensor FFModel::argmax(const Tensor input, bool beam_search, char const *name) {
+Tensor FFModel::argmax(Tensor const input, bool beam_search, char const *name) {
   Layer *li = new Layer(this,
                         OP_ARGMAX,
                         input->data_type,
@@ -106,7 +106,7 @@ bool operator==(ArgMaxParams const &lhs, ArgMaxParams const &rhs) {
 }
 
 ArgMax::ArgMax(FFModel &model,
-               const ParallelTensor _input,
+               ParallelTensor const _input,
                bool _beam_search,
                char const *name)
     : Op(model,
@@ -136,12 +136,12 @@ ArgMax::ArgMax(FFModel &model,
   }
 }
 
-ArgMax::ArgMax(FFModel &model, ArgMax const &other, const ParallelTensor input)
+ArgMax::ArgMax(FFModel &model, ArgMax const &other, ParallelTensor const input)
     : ArgMax(model, input, other.beam_search, other.name) {}
 
 ArgMax::ArgMax(FFModel &model,
                ArgMaxParams const &params,
-               const ParallelTensor input,
+               ParallelTensor const input,
                char const *name)
     : ArgMax(model, input, params.beam_search, params.name) {}
 
@@ -332,7 +332,7 @@ FutureMap ArgMax::inference(FFModel const &ff,
   }
 }
 
-BeamInferenceResult
+SsmInferenceResult
     ArgMax::inference_task_beam(Task const *task,
                                 std::vector<PhysicalRegion> const &regions,
                                 Context ctx,
@@ -342,7 +342,7 @@ BeamInferenceResult
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     // Directly return for empty batch config
-    BeamInferenceResult ir;
+    SsmInferenceResult ir;
     return ir;
   }
   ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args);
@@ -355,7 +355,7 @@ BeamInferenceResult
   GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO(
       DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime);
   ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size);
-  BeamInferenceResult ir;
+  SsmInferenceResult ir;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   download_tensor(m->probs, ir.probs, batch_size);
diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc
index 5aa6cf5b3..d018ff59e 100644
--- a/src/ops/beam_topk.cc
+++ b/src/ops/beam_topk.cc
@@ -341,7 +341,7 @@ FutureMap BeamTopK::inference(FFModel const &ff,
   return runtime->execute_index_space(ctx, launcher);
 }
 
-BeamInferenceResult
+SsmInferenceResult
     BeamTopK::inference_task(Task const *task,
                              std::vector<PhysicalRegion> const &regions,
                              Context ctx,
@@ -355,7 +355,7 @@ BeamInferenceResult
       Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
 
   if (bc.num_tokens == 0) {
-    BeamInferenceResult ir;
+    SsmInferenceResult ir;
     return ir;
   }
 
@@ -391,7 +391,7 @@ BeamInferenceResult
                                    length,
                                    m->sorted);
 
-  BeamInferenceResult ir;
+  SsmInferenceResult ir;
 
   download_tensor<int>(index_ptr, ir.token_ids, batch_size * m->max_beam_width);
   download_tensor<float>(value_ptr, ir.probs, batch_size * m->max_beam_width);
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 7af8acfc0..4b8984324 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -6038,14 +6038,14 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<BeamInferenceResult,
+      Runtime::preregister_task_variant<SsmInferenceResult,
                                         ArgTopK::inference_speculative_task>(
           registrar, "ArgTopK Speculative Inference Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<BeamInferenceResult,
+      runtime->register_task_variant<SsmInferenceResult,
                                      ArgTopK::inference_speculative_task>(
           registrar);
     }
@@ -6070,15 +6070,16 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<BeamInferenceResult,
+      Runtime::preregister_task_variant<SsmInferenceResult,
                                         BeamTopK::inference_task>(
           registrar, "BeamTopK Inference Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<BeamInferenceResult,
-                                     BeamTopK::inference_task>(registrar);
+      runtime
+          ->register_task_variant<SsmInferenceResult, BeamTopK::inference_task>(
+              registrar);
     }
   }
   // Sampling task
@@ -6133,14 +6134,14 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<BeamInferenceResult,
+      Runtime::preregister_task_variant<SsmInferenceResult,
                                         ArgMax::inference_task_beam>(
           registrar, "ArgMax Inference Task Beam");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<BeamInferenceResult,
+      runtime->register_task_variant<SsmInferenceResult,
                                      ArgMax::inference_task_beam>(registrar);
     }
   }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b8edfdda0..c41dba57d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1017,15 +1017,15 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
   RequestManager *rm = *((RequestManager **)task->args);
   TreeSearchBatchConfig const &bc =
       Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
-  BeamInferenceResult const &result =
-      Future(task->futures[1]).get_result<BeamInferenceResult>();
+  SsmInferenceResult const &result =
+      Future(task->futures[1]).get_result<SsmInferenceResult>();
   return rm->prepare_next_batch_beam(bc, result);
 }
 
 // update beam search metadata
 TreeSearchBatchConfig
     RequestManager::prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
-                                            BeamInferenceResult const &result) {
+                                            SsmInferenceResult const &result) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_beam ###############\n";
@@ -1633,7 +1633,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
 }
 
 void RequestManager::store_beam_metadata(TreeSearchBatchConfig const &old_bc,
-                                         BeamInferenceResult const &result) {
+                                         SsmInferenceResult const &result) {
   // step1 store the outputs
   if (old_bc.num_tokens <= 0) {
     return;

From 35384c3483ee0a51d21f23553b32b59afb52c040 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 25 Mar 2024 13:36:16 -0400
Subject: [PATCH 010/667] Rename BeamSearchBatchConfigFuture to
 TreeSearchBatchConfigFuture, and rename BeamInferenceFuture to
 SsmInferenceResultFuture

---
 include/flexflow/batch_config.h    |  5 ++---
 include/flexflow/request_manager.h | 12 ++++++------
 src/runtime/inference_manager.cc   |  2 +-
 src/runtime/request_manager.cc     | 22 +++++++++++-----------
 4 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index fb81d4f8d..d16b1227a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -32,10 +32,9 @@ class SsmInferenceResult;
 
 using BatchConfigFuture = Legion::Future;
 using InferenceResultFuture = Legion::Future;
-using BeamSearchBatchConfigFuture = Legion::Future;
-using TreeVerifyBatchConfigFuture = Legion::Future;
-using BeamInferenceResultFuture = Legion::Future;
 using TreeSearchBatchConfigFuture = Legion::Future;
+using TreeVerifyBatchConfigFuture = Legion::Future;
+using SsmInferenceResultFuture = Legion::Future;
 
 class BatchConfig {
 public:
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f62fa44f0..73531f8a4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -195,16 +195,16 @@ class RequestManager {
   TreeSearchBatchConfig
       prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
                               SsmInferenceResult const &result);
-  BeamSearchBatchConfigFuture
-      prepare_next_batch_beam(BeamSearchBatchConfigFuture const &old_bc,
-                              BeamInferenceResultFuture const &result,
+  TreeSearchBatchConfigFuture
+      prepare_next_batch_beam(TreeSearchBatchConfigFuture const &old_bc,
+                              SsmInferenceResultFuture const &result,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
   TreeSearchBatchConfig
       prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                               InferenceResult const &result,
                               int model_id);
-  BeamSearchBatchConfigFuture
+  TreeSearchBatchConfigFuture
       prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
                               InferenceResultFuture const &result,
                               int model_id,
@@ -214,7 +214,7 @@ class RequestManager {
   TreeVerifyBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
   TreeVerifyBatchConfigFuture prepare_next_batch_verify(
-      std::vector<BeamSearchBatchConfigFuture> const &old_batches,
+      std::vector<TreeSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
@@ -239,7 +239,7 @@ class RequestManager {
   // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
-                              BeamInferenceResultFuture const &result,
+                              SsmInferenceResultFuture const &result,
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
   // Given the verification result, prepare the first speculation batch.
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 9d73ca938..cf8a7aa89 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -314,7 +314,7 @@ FutureMap InferenceManager::inference(FFModel *model,
     BatchConfig const *bc_ptr = &bc;
     TreeSearchBatchConfig const *bsbc_ptr =
         static_cast<TreeSearchBatchConfig const *>(bc_ptr);
-    BeamSearchBatchConfigFuture bcf =
+    TreeSearchBatchConfigFuture bcf =
         Future::from_value<TreeSearchBatchConfig>(*bsbc_ptr);
     return inference(model, index, bcf);
   } else if (bc.get_mode() == TREE_VERIFY_MODE) {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c41dba57d..d7508c978 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -557,7 +557,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
-BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_init(
+TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_init(
     TreeVerifyBatchConfigFuture const &old_bc,
     InferenceResultFuture const &result,
     int model_id,
@@ -995,9 +995,9 @@ TreeSearchBatchConfig
 }
 
 /***** Beam Search Phase *****/
-BeamSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
-    BeamSearchBatchConfigFuture const &old_bc,
-    BeamInferenceResultFuture const &result,
+TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
+    TreeSearchBatchConfigFuture const &old_bc,
+    SsmInferenceResultFuture const &result,
     Context ctx,
     Runtime *runtime) {
 
@@ -1311,7 +1311,7 @@ TreeSearchBatchConfig
 /***** Verify Phase *****/
 
 TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify(
-    std::vector<BeamSearchBatchConfigFuture> const &old_batches,
+    std::vector<TreeSearchBatchConfigFuture> const &old_batches,
     Context ctx,
     Runtime *runtime) {
 
@@ -2480,9 +2480,9 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
       }
     }
     auto const &next_batch = batch_pipeline.back();
-    BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
+    TreeSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
         next_batch.first, next_batch.second, 0, ctx, runtime);
-    std::vector<BeamSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
+    std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
     for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {
       beam_bcf_vec[ssm_id] = beam_bcf;
     }
@@ -2495,7 +2495,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
 
         FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]);
         assert(fm.get_future_map_domain().get_volume() == 1);
-        BeamInferenceResultFuture beam_irf = fm.get_future(0);
+        SsmInferenceResultFuture beam_irf = fm.get_future(0);
         beam_bcf_vec[i] =
             prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime);
       }
@@ -2573,9 +2573,9 @@ void RequestManager::serve_spec_infer_v2(FFModel *llm) {
       }
     }
     auto const &next_batch = batch_pipeline.back();
-    BeamSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
+    TreeSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
         next_batch.first, next_batch.second, 0, ctx, runtime);
-    std::vector<BeamSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
+    std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
     for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {
       beam_bcf_vec[ssm_id] = beam_bcf;
     }
@@ -2588,7 +2588,7 @@ void RequestManager::serve_spec_infer_v2(FFModel *llm) {
 
         FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]);
         assert(fm.get_future_map_domain().get_volume() == 1);
-        BeamInferenceResultFuture beam_irf = fm.get_future(0);
+        SsmInferenceResultFuture beam_irf = fm.get_future(0);
         beam_bcf_vec[i] =
             prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime);
       }

From b5561e18925f23bcf6fe161952ed3c1f30304eed Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 10:18:29 -0400
Subject: [PATCH 011/667] Rename
 TreeSearchBatchConfig::TreeSearchPerRequestInfo::num_tokens_in_layer to
 num_tokens_at_depth; Write another version of the data structure of TokenTree

---
 include/flexflow/batch_config.h    |  2 +-
 include/flexflow/request_manager.h | 49 ++++++------------------------
 src/runtime/request_manager.cc     | 14 +++++----
 3 files changed, 18 insertions(+), 47 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index d16b1227a..c6fe18752 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -156,7 +156,7 @@ class TreeSearchBatchConfig : public BatchConfig {
   int model_id;
 
   struct TreeSearchPerRequestInfo {
-    int num_tokens_in_layer = 0;
+    int num_tokens_at_depth = 0;
   };
 
   TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 73531f8a4..acf2287e7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -73,52 +73,21 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  std::vector<struct BeamTree> beam_trees; // Old version, delete after refactor
   std::vector<struct TokenTree> token_trees; // New version
 };
 
-// The old version of beam tree
-// store the result of beam search
-struct BeamTree {
-  struct treeLayer {
-    TreeSearchBatchConfig::TokenId
-        tokens[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int parent_ids[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    float probs[TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-    int nodes_num_this_layer = 0;
-  };
-  treeLayer treeLayers[TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1];
+struct TokenTreeNode {
+  BatchConfig::TokenId id;
+  float joint_prob;
+  int parent_pos;
 };
 
-// The new version of BeamTree
-// Named as TokenTree, supports general tree structure.
-class TokenTree {
-  class Node {
-  public:
-    BatchConfig::TokenId id;
-    float unconditional_prob;
-    std::vector<std::shared_ptr<Node>> children;
-    std::shared_ptr<Node> parent;
-    Node(BatchConfig::TokenId id, float prob, std::shared_ptr<Node> parent)
-        : id(id), unconditional_prob(prob), parent(parent) {}
-  };
-
-  class TreeLayer {
-  public:
-    std::vector<std::shared_ptr<Node>> nodes;
-  };
-
-private:
-  std::vector<TreeLayer> layers;
-  // Do we need the root?
-  std::shared_ptr<Node> root;
+struct TreeLayer {
+  std::vector<TokenTreeNode> nodes;
+};
 
-public:
-  TokenTree(BatchConfig::TokenId root_id, float root_prob)
-      : root(std::make_shared<Node>(root_id, root_prob, nullptr)) {
-    layers.push_back(TreeLayer());
-    layers[0].nodes.push_back(root);
-  }
+class TokenTree {
+  std::vector<TreeLayer> tree_layers;
 };
 
 class RequestManager {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d7508c978..64fc9d4d9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1024,21 +1024,23 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
 
 // update beam search metadata
 TreeSearchBatchConfig
-    RequestManager::prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
+    RequestManager::prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
                                             SsmInferenceResult const &result) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
-    std::cout << "\n############### prepare_next_batch_beam ###############\n";
+    std::cout << "\n############### prepare_next_batch_spec ###############\n";
   }
   if (verbose) {
     std::cout << "print all results" << "\n";
     for (int i = 0; i < 40; i++) {
       std::cout << result.token_ids[i] << ", ";
     }
-    std::cout << "Current Beam Depth: "
-              << old_bc.beamRequestsInfo[0].current_depth << "\n";
-    std::cout << "Current sub request num: "
-              << old_bc.beamRequestsInfo[0].sub_request_num << "\n";
+    std::cout << "Current tree depth: " << old_bc.current_depth << "\n";
+    std::cout << "Number of tokens in each requests: " << std::endl;
+    for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
+      std::cout << i << "\t" << old_bc.tree_requests_info[i].num_tokens_at_depth
+                << std::endl;
+    }
   }
   // Step 1: Store result to the beam tree struct
   store_beam_metadata(old_bc, result);

From e205c695b9c4ee0fdf91411d7c7eb25399f6601e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 10:29:55 -0400
Subject: [PATCH 012/667] Add an indicator of whether a token is pruned in
 TokenTreeNode

---
 include/flexflow/request_manager.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index acf2287e7..6ebc7ca94 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -80,6 +80,7 @@ struct TokenTreeNode {
   BatchConfig::TokenId id;
   float joint_prob;
   int parent_pos;
+  bool pruned = false;
 };
 
 struct TreeLayer {

From 2581a58f5c080adf49568fd96fd9892142e32d94 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 10:33:45 -0400
Subject: [PATCH 013/667] Add a priority queue token_tree_node_pool in
 RequestManager. It is used to store shared_ptr of TokenTreeNodes. It is a min
 heap with the joint_prob of a token being a key.

---
 include/flexflow/request_manager.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6ebc7ca94..e6565b67e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -355,6 +355,9 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
+  // This is a helper data structure to store help the pruning of the token
+  // trees across different requests.
+  std::priority_queue<std::shared_ptr<TokenTreeNode>> token_tree_node_pool;
 
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,

From 830f483438d833f6610f92b43a60f8616e50e750 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 10:57:18 -0400
Subject: [PATCH 014/667] Rename task ID from
 RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID to RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
 other changes are caused by formatting.

---
 include/flexflow/model.h       | 186 ++++++++++++++++-----------------
 src/mapper/mapper.cc           | 114 ++++++++++----------
 src/runtime/model.cc           |   2 +-
 src/runtime/request_manager.cc |   2 +-
 4 files changed, 152 insertions(+), 152 deletions(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 95be9ab58..6ad60267f 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -246,7 +246,7 @@ enum TaskIDs {
   RM_LOAD_BATCH_CONFIG_TASK_ID,
   RM_PREPARE_NEXT_BATCH_TASK_ID,
   RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
-  RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
+  RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
   RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
   RM_BACKGROUND_SERVING_TASK_ID,
   // Custom tasks
@@ -407,74 +407,74 @@ class FFModel {
   bool cpu_offload;
   // C++ APIs for constructing models
   // Add an exp layer
-  Tensor exp(const Tensor x, char const *name = NULL);
+  Tensor exp(Tensor const x, char const *name = NULL);
   // Add an add layer
-  Tensor add(const Tensor x,
-             const Tensor y,
+  Tensor add(Tensor const x,
+             Tensor const y,
              bool inplace_a = false,
              char const *name = NULL);
   // Add a subtract layer
-  Tensor subtract(const Tensor x,
-                  const Tensor y,
+  Tensor subtract(Tensor const x,
+                  Tensor const y,
                   bool inplace_a = false,
                   char const *name = NULL);
   // Add a multiply layer
-  Tensor multiply(const Tensor x,
-                  const Tensor y,
+  Tensor multiply(Tensor const x,
+                  Tensor const y,
                   bool inplace_a = false,
                   char const *name = NULL);
   // Add a divide layer
-  Tensor divide(const Tensor x,
-                const Tensor y,
+  Tensor divide(Tensor const x,
+                Tensor const y,
                 bool inplace_a = false,
                 char const *name = NULL);
   // Add a max layer
-  Tensor max(const Tensor x,
-             const Tensor y,
+  Tensor max(Tensor const x,
+             Tensor const y,
              bool inplace_a = false,
              char const *name = NULL);
   // Add a min layer
-  Tensor min(const Tensor x,
-             const Tensor y,
+  Tensor min(Tensor const x,
+             Tensor const y,
              bool inplace_a = false,
              char const *name = NULL);
   // Add a rsqrt layer
-  Tensor rsqrt(const Tensor x, bool inplace = true, char const *name = NULL);
+  Tensor rsqrt(Tensor const x, bool inplace = true, char const *name = NULL);
   // Add a pow layer
-  Tensor pow(const Tensor x,
+  Tensor pow(Tensor const x,
              float const exponent,
              bool inplace = true,
              char const *name = NULL);
   // Add a scalar multiply layer
-  Tensor scalar_multiply(const Tensor x,
+  Tensor scalar_multiply(Tensor const x,
                          float const scalar,
                          bool inplace = true,
                          char const *name = NULL);
-  Tensor scalar_add(const Tensor x,
+  Tensor scalar_add(Tensor const x,
                     float const scalar,
                     bool inplace = true,
                     char const *name = NULL);
-  Tensor scalar_sub(const Tensor x,
+  Tensor scalar_sub(Tensor const x,
                     float const scalar,
                     bool inplace = true,
                     char const *name = NULL);
-  Tensor scalar_truediv(const Tensor x,
+  Tensor scalar_truediv(Tensor const x,
                         float const scalar,
                         bool inplace = true,
                         char const *name = NULL);
   // Add a sin layer
-  Tensor sin(const Tensor x, char const *name = NULL);
+  Tensor sin(Tensor const x, char const *name = NULL);
   // Add a cos layer
-  Tensor cos(const Tensor x, char const *name = NULL);
+  Tensor cos(Tensor const x, char const *name = NULL);
   // Add an activation layer
-  Tensor relu(const Tensor x, bool inplace = true, char const *name = NULL);
-  Tensor identity(const Tensor x, char const *name = NULL);
-  Tensor gelu(const Tensor x, char const *name = NULL);
-  Tensor sigmoid(const Tensor x, char const *name = NULL);
-  Tensor tanh(const Tensor x, char const *name = NULL);
-  Tensor elu(const Tensor x, bool inplace = true, char const *name = NULL);
+  Tensor relu(Tensor const x, bool inplace = true, char const *name = NULL);
+  Tensor identity(Tensor const x, char const *name = NULL);
+  Tensor gelu(Tensor const x, char const *name = NULL);
+  Tensor sigmoid(Tensor const x, char const *name = NULL);
+  Tensor tanh(Tensor const x, char const *name = NULL);
+  Tensor elu(Tensor const x, bool inplace = true, char const *name = NULL);
   // Add a 2D convolutional layer
-  Tensor conv2d(const Tensor input,
+  Tensor conv2d(Tensor const input,
                 int outChannels,
                 int kernelH,
                 int kernelW,
@@ -490,12 +490,12 @@ class FFModel {
                 Initializer *bias_initializer = NULL,
                 char const *name = NULL);
   // Add a dropout layer
-  Tensor dropout(const Tensor input,
+  Tensor dropout(Tensor const input,
                  float rate,
                  unsigned long long seed = 0,
                  char const *name = NULL);
   // Add an embedding layer
-  Tensor embedding(const Tensor input,
+  Tensor embedding(Tensor const input,
                    int num_entries,
                    int outDim,
                    AggrMode aggr,
@@ -504,13 +504,13 @@ class FFModel {
                    Initializer *kernel_initializer = NULL,
                    char const *name = NULL);
   // Add a gather layer
-  Tensor gather(const Tensor input,
-                const Tensor index,
+  Tensor gather(Tensor const input,
+                Tensor const index,
                 int dim,
                 char const *name = NULL);
   // Add a group_by layer
-  void group_by(const Tensor data,
-                const Tensor assign,
+  void group_by(Tensor const data,
+                Tensor const assign,
                 Tensor *outputs,
                 int n,
                 float alpha,
@@ -532,7 +532,7 @@ class FFModel {
                         float lambda_bal,
                         char const *name = NULL);
   // Add a 2D pooling layer
-  Tensor pool2d(const Tensor input,
+  Tensor pool2d(Tensor const input,
                 int kernelH,
                 int kernelW,
                 int strideH,
@@ -543,7 +543,7 @@ class FFModel {
                 ActiMode activation = AC_MODE_NONE,
                 char const *name = NULL);
   // Add a layer_norm layer
-  Tensor layer_norm(const Tensor input,
+  Tensor layer_norm(Tensor const input,
                     std::vector<int> const &axes,
                     bool elementwise_affine,
                     float eps,
@@ -551,9 +551,9 @@ class FFModel {
                     DataType data_type = DT_NONE,
                     char const *name = NULL);
   // Add a layer_norm layer with residual(s)
-  void residual_layer_norm(const Tensor input,
-                           const Tensor residual1,
-                           const Tensor residual2,
+  void residual_layer_norm(Tensor const input,
+                           Tensor const residual1,
+                           Tensor const residual2,
                            Tensor *outputs,
                            bool use_two_residuals,
                            std::vector<int> const &axes,
@@ -563,8 +563,8 @@ class FFModel {
                            DataType data_type = DT_NONE,
                            char const *name = NULL);
   // Add a add_bias_residual_layer_norm layer
-  void add_bias_residual_layer_norm(const Tensor input,
-                                    const Tensor residual,
+  void add_bias_residual_layer_norm(Tensor const input,
+                                    Tensor const residual,
                                     Tensor *outputs,
                                     std::vector<int> const &axes,
                                     bool elementwise_affine,
@@ -573,41 +573,41 @@ class FFModel {
                                     DataType data_type = DT_NONE,
                                     char const *name = NULL);
   // Add a sigmoid_silu_multi layer
-  Tensor sigmoid_silu_multi(const Tensor input1,
-                            const Tensor input2,
+  Tensor sigmoid_silu_multi(Tensor const input1,
+                            Tensor const input2,
                             DataType data_type = DT_NONE,
                             char const *name = NULL);
   // Add a batch_norm layer
   Tensor
-      batch_norm(const Tensor input, bool relu = true, char const *name = NULL);
+      batch_norm(Tensor const input, bool relu = true, char const *name = NULL);
   // Add a batch_matmul layer
-  Tensor batch_matmul(const Tensor A,
-                      const Tensor B,
+  Tensor batch_matmul(Tensor const A,
+                      Tensor const B,
                       int a_seq_length_dim = -1,
                       int b_seq_length_dim = -1,
                       char const *name = nullptr);
   // Add a root mean square layer
-  Tensor rms_norm(const Tensor input,
+  Tensor rms_norm(Tensor const input,
                   float eps,
                   int dim,
                   DataType data_type = DT_NONE,
                   char const *name = NULL);
   // Add a residual root mean square layer
-  void residual_rms_norm(const Tensor input1,
-                         const Tensor input2,
+  void residual_rms_norm(Tensor const input1,
+                         Tensor const input2,
                          Tensor *outputs,
                          float eps,
                          int dim,
                          DataType data_type = DT_NONE,
                          char const *name = NULL);
   // Add a beam search top k layer
-  Tensor beam_top_k(const Tensor input,
+  Tensor beam_top_k(Tensor const input,
                     int max_beam_size,
                     bool sorted,
                     char const *name = NULL);
 
   // Add a dense layer
-  Tensor dense(const Tensor input,
+  Tensor dense(Tensor const input,
                int outDim,
                ActiMode activation = AC_MODE_NONE,
                bool use_bias = true,
@@ -619,7 +619,7 @@ class FFModel {
                float regularizer_lambda = 0.0,
                char const *name = NULL);
   // Add a cast layer
-  Tensor cast(const Tensor input, DataType dtype, char const *name = nullptr);
+  Tensor cast(Tensor const input, DataType dtype, char const *name = nullptr);
   // Add a concat layer
   Tensor
       concat(int n, Tensor const *tensors, int axis, char const *name = NULL);
@@ -634,58 +634,58 @@ class FFModel {
       int experts_internal_dim_size = 0, // hidden dimension for internal layers
       char const *name = NULL);
   // Add a mean layer
-  Tensor mean(const Tensor input,
+  Tensor mean(Tensor const input,
               std::vector<int> const &dims,
               bool keepdims,
               char const *name);
   // Add a moe layer (wrapping topk, group_by and aggregate operators)
-  Tensor moe(const Tensor input,
+  Tensor moe(Tensor const input,
              int num_exp,
              int num_select,
              int expert_hidden_size,
              float alpha,
              float lambda);
   // Add a split layer
-  void split(const Tensor input,
+  void split(Tensor const input,
              Tensor *outputs,
              std::vector<int> const &split,
              int axis,
              char const *name = NULL);
   // Add a flat layer
-  Tensor flat(const Tensor input, char const *name = NULL);
+  Tensor flat(Tensor const input, char const *name = NULL);
   // Add a softmax layer
-  Tensor softmax(const Tensor input,
+  Tensor softmax(Tensor const input,
                  int dim = -1,
                  DataType data_type = DT_NONE,
                  char const *name = NULL);
   // Create input tensors and constants
-  Tensor transpose(const Tensor input,
+  Tensor transpose(Tensor const input,
                    std::vector<int> const &perm,
                    char const *name = NULL);
-  Tensor reduce_sum(const Tensor input,
+  Tensor reduce_sum(Tensor const input,
                     std::vector<int> const &axes,
                     bool keepdims = false,
                     char const *name = nullptr);
-  Tensor reshape(const Tensor input,
+  Tensor reshape(Tensor const input,
                  std::vector<int> const &shape,
                  char const *name = NULL);
-  Tensor reverse(const Tensor input, int axis, char const *name = NULL);
-  void top_k(const Tensor input,
+  Tensor reverse(Tensor const input, int axis, char const *name = NULL);
+  void top_k(Tensor const input,
              Tensor *outputs,
              int k,
              bool sorted,
              char const *name = NULL);
-  Tensor arg_top_k(const Tensor input,
+  Tensor arg_top_k(Tensor const input,
                    // Tensor *outputs,
                    int k,
                    bool sorted,
                    bool speculative_decoding,
                    char const *name = NULL);
-  Tensor argmax(const Tensor input, bool beam_search, char const *name = NULL);
-  Tensor sampling(const Tensor input, float top_p, char const *name = NULL);
-  Tensor multihead_attention(const Tensor query,
-                             const Tensor key,
-                             const Tensor value,
+  Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL);
+  Tensor sampling(Tensor const input, float top_p, char const *name = NULL);
+  Tensor multihead_attention(Tensor const query,
+                             Tensor const key,
+                             Tensor const value,
                              int embed_dim,
                              int num_heads,
                              int kdim = 0,
@@ -697,7 +697,7 @@ class FFModel {
                              DataType data_type = DT_NONE,
                              Initializer *kernel_initializer = NULL,
                              char const *name = NULL);
-  Tensor inc_multihead_self_attention(const Tensor input,
+  Tensor inc_multihead_self_attention(Tensor const input,
                                       int embed_dim,
                                       int num_heads,
                                       int kdim = 0,
@@ -715,7 +715,7 @@ class FFModel {
                                       bool position_bias = false,
                                       char const *name = NULL);
   Tensor
-      spec_inc_multihead_self_attention(const Tensor input,
+      spec_inc_multihead_self_attention(Tensor const input,
                                         int embed_dim,
                                         int num_heads,
                                         int kdim = 0,
@@ -733,7 +733,7 @@ class FFModel {
                                         bool position_bias = false,
                                         char const *name = NULL);
   Tensor inc_multihead_self_attention_verify(
-      const Tensor input,
+      Tensor const input,
       int embed_dim,
       int num_heads,
       int kdim = 0,
@@ -750,7 +750,7 @@ class FFModel {
       bool qk_prod_scaling = true,
       bool position_bias = false,
       char const *name = NULL);
-  Tensor inc_multiquery_self_attention(const Tensor input,
+  Tensor inc_multiquery_self_attention(Tensor const input,
                                        int embed_dim,
                                        int num_q_heads,
                                        int num_kv_heads,
@@ -769,7 +769,7 @@ class FFModel {
                                        bool position_bias = false,
                                        char const *name = NULL);
   Tensor
-      spec_inc_multiquery_self_attention(const Tensor input,
+      spec_inc_multiquery_self_attention(Tensor const input,
                                          int embed_dim,
                                          int num_q_heads,
                                          int num_kv_heads,
@@ -788,7 +788,7 @@ class FFModel {
                                          bool position_bias = false,
                                          char const *name = NULL);
   Tensor inc_multiquery_self_attention_verify(
-      const Tensor input,
+      Tensor const input,
       int embed_dim,
       int num_q_heads,
       int num_kv_heads,
@@ -820,7 +820,7 @@ class FFModel {
                                        bool create_grad = true);
   ParallelTensor
       create_parallel_tensor_legion_ordering(int num_dim,
-                                             const ParallelDim dims[],
+                                             ParallelDim const dims[],
                                              DataType data_type,
                                              Op const *owner_op = NULL,
                                              int owner_idx = 0,
@@ -833,7 +833,7 @@ class FFModel {
                        int owner_idx = 0,
                        bool create_grad = true);
   ParallelTensor create_parallel_tensor(int num_dim,
-                                        const ParallelDim dims[],
+                                        ParallelDim const dims[],
                                         DataType data_type,
                                         Op const *owner_op = NULL,
                                         int owner_idx = 0,
@@ -846,7 +846,7 @@ class FFModel {
                        int owner_idx = 0,
                        bool create_grad = true);
   template <int NDIM>
-  ParallelTensor create_parallel_tensor(const ParallelDim dims[],
+  ParallelTensor create_parallel_tensor(ParallelDim const dims[],
                                         DataType data_type,
                                         Op const *owner_op = NULL,
                                         int owner_idx = 0,
@@ -870,7 +870,7 @@ class FFModel {
       ParameterSyncType sync_type = ParameterSyncType::NONE);
   template <int NDIM>
   ParallelParameter create_parallel_weight(
-      const ParallelDim dims[],
+      ParallelDim const dims[],
       DataType data_type,
       Op const *owner_op = NULL,
       bool create_grad = true,
@@ -878,7 +878,7 @@ class FFModel {
       ParameterSyncType sync_type = ParameterSyncType::NONE);
   ParallelParameter create_parallel_weight(
       int numdim,
-      const ParallelDim dims[],
+      ParallelDim const dims[],
       DataType data_type,
       Op const *owner_op = NULL,
       bool create_grad = true,
@@ -886,7 +886,7 @@ class FFModel {
       ParameterSyncType sync_type = ParameterSyncType::NONE);
   ParallelParameter create_parallel_weight_legion_ordering(
       int numdim,
-      const ParallelDim dims[],
+      ParallelDim const dims[],
       DataType data_type,
       Op const *owner_op = NULL,
       bool create_grad = true,
@@ -895,7 +895,7 @@ class FFModel {
 
   void map_tensor(ParallelTensor tensor, Op const *parallel_op);
   void map_weight(ParallelTensor tensor, Op const *parallel_op);
-  bool get_parallel_tensor_from_tensor(const Tensor tensor,
+  bool get_parallel_tensor_from_tensor(Tensor const tensor,
                                        ParallelTensor &parallel_tensor) const;
 
   template <int NDIM>
@@ -936,7 +936,7 @@ class FFModel {
   // Internal PCG::Node creation APIs
   // ========================================
   template <typename T>
-  PCG::Node get_or_create_node(const typename T::Input &input,
+  PCG::Node get_or_create_node(typename T::Input const &input,
                                typename T::Params const &params) {
     using Params = typename T::Params;
 
@@ -966,50 +966,50 @@ class FFModel {
     return this->new_node(op);
   }
 
-  PCG::Node get_or_create_noop_node(const ParallelTensor input);
+  PCG::Node get_or_create_noop_node(ParallelTensor const input);
   PCG::Node get_or_create_input_node(ParallelTensorShape const &);
   PCG::Node get_or_create_fused_parallel_node(
-      const ParallelTensor input,
+      ParallelTensor const input,
       std::vector<ParallelOpInfo> const &parallel_ops);
-  PCG::Node get_or_create_parallel_op_node(const ParallelTensor input,
+  PCG::Node get_or_create_parallel_op_node(ParallelTensor const input,
                                            ParallelOpInfo const &);
   // ========================================
   // Internal APIs that should not be invoked from applications
   // ========================================
   void create_disjoint_partition(int num_dims,
-                                 const ParallelDim dims[],
+                                 ParallelDim const dims[],
                                  Legion::IndexSpace const &part_is,
                                  Legion::LogicalRegion const &region,
                                  Legion::LogicalPartition &part);
   template <int NDIM, int TDIM>
   void create_disjoint_partition_with_dim2(
-      const ParallelDim dims[],
+      ParallelDim const dims[],
       Legion::IndexSpaceT<TDIM> const &part_is,
       Legion::LogicalRegion const &region,
       Legion::LogicalPartition &part);
   void create_aliased_partition(int num_dims,
-                                const ParallelDim dims[],
+                                ParallelDim const dims[],
                                 int aliased_dim,
                                 Legion::IndexSpace const &part_is,
                                 Legion::LogicalRegion const &region,
                                 Legion::LogicalPartition &part);
   template <int NDIM, int TDIM>
   void create_aliased_partition_with_dim2(
-      const ParallelDim dims[],
+      ParallelDim const dims[],
       int aliased_dim,
       Legion::IndexSpaceT<TDIM> const &part_is,
       Legion::LogicalRegion const &region,
       Legion::LogicalPartition &part);
 
   template <int NDIM>
-  void create_disjoint_partition(const ParallelTensor tensor,
+  void create_disjoint_partition(ParallelTensor const tensor,
                                  Legion::IndexSpaceT<NDIM> const &part_is,
                                  Legion::LogicalPartition &part_fwd,
                                  Legion::LogicalPartition &part_bwd);
 
   template <int NDIM, int TDIM>
   void create_data_parallel_partition_with_diff_dims(
-      const ParallelTensor tensor,
+      ParallelTensor const tensor,
       Legion::IndexSpaceT<TDIM> const &task_is,
       Legion::LogicalPartition &part_fwd,
       Legion::LogicalPartition &part_bwd);
@@ -1097,7 +1097,7 @@ class FFModel {
   Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc);
   Legion::IndexSpace get_or_create_task_is(MachineView const &view);
   Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain);
-  Legion::IndexSpace get_or_create_task_is(const ParallelTensor);
+  Legion::IndexSpace get_or_create_task_is(ParallelTensor const);
   Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
   Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
   Legion::IndexSpace get_task_is(MachineView const &view) const;
diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index d7aac4e37..e1d048017 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -33,7 +33,7 @@ FFShardingFunctor::~FFShardingFunctor(void) {}
 
 ShardID FFShardingFunctor::shard(DomainPoint const &point,
                                  Domain const &full_space,
-                                 const size_t total_shards) {
+                                 size_t const total_shards) {
   assert(point.get_dim() == full_space.get_dim());
   int device_id = machine_view.start_device_id;
   for (int i = 0; i < point.get_dim(); i++) {
@@ -259,7 +259,7 @@ Mapper::MapperSyncModel FFMapper::get_mapper_sync_model(void) const {
   return SERIALIZED_REENTRANT_MAPPER_MODEL;
 }
 
-void FFMapper::select_task_options(const MapperContext ctx,
+void FFMapper::select_task_options(MapperContext const ctx,
                                    Task const &task,
                                    TaskOptions &output) {
   unsigned long long task_hash = compute_task_hash(task);
@@ -285,7 +285,7 @@ void FFMapper::select_task_options(const MapperContext ctx,
   }
   if ((task.task_id == RM_PREPARE_NEXT_BATCH_TASK_ID) ||
       (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) ||
-      (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
+      (task.task_id == RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID) ||
       (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) ||
       (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) {
     output.initial_proc = all_cpus[0];
@@ -374,7 +374,7 @@ void FFMapper::select_task_options(const MapperContext ctx,
   assert(task.is_index_space);
 }
 
-void FFMapper::slice_task(const MapperContext ctx,
+void FFMapper::slice_task(MapperContext const ctx,
                           Task const &task,
                           SliceTaskInput const &input,
                           SliceTaskOutput &output) {
@@ -480,14 +480,14 @@ void FFMapper::slice_task(const MapperContext ctx,
   }
 }
 
-void FFMapper::premap_task(const MapperContext ctx,
+void FFMapper::premap_task(MapperContext const ctx,
                            Task const &task,
                            PremapTaskInput const &input,
                            PremapTaskOutput &output) {
   assert(false);
 }
 
-void FFMapper::map_task(const MapperContext ctx,
+void FFMapper::map_task(MapperContext const ctx,
                         Task const &task,
                         MapTaskInput const &input,
                         MapTaskOutput &output) {
@@ -663,13 +663,13 @@ void FFMapper::map_task(const MapperContext ctx,
   } // for idx
 }
 
-void FFMapper::replicate_task(const MapperContext ctx,
+void FFMapper::replicate_task(MapperContext const ctx,
                               Task const &task,
                               ReplicateTaskInput const &input,
                               ReplicateTaskOutput &output) {
   // Should only be replicated for the top-level task
   assert((task.get_depth() == 0) && (task.regions.size() == 0));
-  const Processor::Kind target_kind = task.target_proc.kind();
+  Processor::Kind const target_kind = task.target_proc.kind();
   VariantID vid;
   {
     std::vector<VariantID> variant_ids;
@@ -685,7 +685,7 @@ void FFMapper::replicate_task(const MapperContext ctx,
   procs.only_kind(target_kind);
   for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end();
        it++) {
-    const AddressSpace space = it->address_space();
+    AddressSpace const space = it->address_space();
     if (handled[space]) {
       continue;
     }
@@ -696,21 +696,21 @@ void FFMapper::replicate_task(const MapperContext ctx,
   assert(count == total_nodes);
 }
 
-void FFMapper::select_task_variant(const MapperContext ctx,
+void FFMapper::select_task_variant(MapperContext const ctx,
                                    Task const &task,
                                    SelectVariantInput const &input,
                                    SelectVariantOutput &output) {
   assert(false);
 }
 
-void FFMapper::postmap_task(const MapperContext ctx,
+void FFMapper::postmap_task(MapperContext const ctx,
                             Task const &task,
                             PostMapInput const &input,
                             PostMapOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_task_sources(const MapperContext ctx,
+void FFMapper::select_task_sources(MapperContext const ctx,
                                    Task const &task,
                                    SelectTaskSrcInput const &input,
                                    SelectTaskSrcOutput &output) {
@@ -795,26 +795,26 @@ void FFMapper::default_policy_select_sources(
 }
 
 void FFMapper::create_task_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Task const &task,
     CreateTaskTemporaryInput const &input,
     CreateTaskTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::speculate(const MapperContext ctx,
+void FFMapper::speculate(MapperContext const ctx,
                          Task const &task,
                          SpeculativeOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Task const &task,
                                 TaskProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Task const &task,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
@@ -843,7 +843,7 @@ void FFMapper::select_sharding_functor(const MapperContext ctx,
   }
 }
 
-void FFMapper::map_inline(const MapperContext ctx,
+void FFMapper::map_inline(MapperContext const ctx,
                           InlineMapping const &inline_op,
                           MapInlineInput const &input,
                           MapInlineOutput &output) {
@@ -944,7 +944,7 @@ void FFMapper::map_inline(const MapperContext ctx,
   }
 }
 
-void FFMapper::select_inline_sources(const MapperContext ctx,
+void FFMapper::select_inline_sources(MapperContext const ctx,
                                      InlineMapping const &inline_op,
                                      SelectInlineSrcInput const &input,
                                      SelectInlineSrcOutput &output) {
@@ -954,27 +954,27 @@ void FFMapper::select_inline_sources(const MapperContext ctx,
 }
 
 void FFMapper::create_inline_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     InlineMapping const &inline_op,
     CreateInlineTemporaryInput const &input,
     CreateInlineTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 InlineMapping const &inline_op,
                                 InlineProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::map_copy(const MapperContext ctx,
+void FFMapper::map_copy(MapperContext const ctx,
                         Copy const &copy,
                         MapCopyInput const &input,
                         MapCopyOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_copy_sources(const MapperContext ctx,
+void FFMapper::select_copy_sources(MapperContext const ctx,
                                    Copy const &copy,
                                    SelectCopySrcInput const &input,
                                    SelectCopySrcOutput &output) {
@@ -982,26 +982,26 @@ void FFMapper::select_copy_sources(const MapperContext ctx,
 }
 
 void FFMapper::create_copy_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Copy const &copy,
     CreateCopyTemporaryInput const &input,
     CreateCopyTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::speculate(const MapperContext ctx,
+void FFMapper::speculate(MapperContext const ctx,
                          Copy const &copy,
                          SpeculativeOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Copy const &copy,
                                 CopyProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Copy const &copy,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
@@ -1009,14 +1009,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx,
   assert(false);
 }
 
-void FFMapper::map_close(const MapperContext ctx,
+void FFMapper::map_close(MapperContext const ctx,
                          Close const &close,
                          MapCloseInput const &input,
                          MapCloseOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_close_sources(const MapperContext ctx,
+void FFMapper::select_close_sources(MapperContext const ctx,
                                     Close const &close,
                                     SelectCloseSrcInput const &input,
                                     SelectCloseSrcOutput &output) {
@@ -1024,20 +1024,20 @@ void FFMapper::select_close_sources(const MapperContext ctx,
 }
 
 void FFMapper::create_close_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Close const &close,
     CreateCloseTemporaryInput const &input,
     CreateCloseTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Close const &close,
                                 CloseProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Close const &close,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
@@ -1045,26 +1045,26 @@ void FFMapper::select_sharding_functor(const MapperContext ctx,
   assert(false);
 }
 
-void FFMapper::map_acquire(const MapperContext ctx,
+void FFMapper::map_acquire(MapperContext const ctx,
                            Acquire const &acquire,
                            MapAcquireInput const &input,
                            MapAcquireOutput &output) {
   assert(false);
 }
 
-void FFMapper::speculate(const MapperContext ctx,
+void FFMapper::speculate(MapperContext const ctx,
                          Acquire const &acquire,
                          SpeculativeOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Acquire const &acquire,
                                 AcquireProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Acquire const &acquire,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
@@ -1072,14 +1072,14 @@ void FFMapper::select_sharding_functor(const MapperContext ctx,
   assert(false);
 }
 
-void FFMapper::map_release(const MapperContext ctx,
+void FFMapper::map_release(MapperContext const ctx,
                            Release const &release,
                            MapReleaseInput const &input,
                            MapReleaseOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_release_sources(const MapperContext ctx,
+void FFMapper::select_release_sources(MapperContext const ctx,
                                       Release const &release,
                                       SelectReleaseSrcInput const &input,
                                       SelectReleaseSrcOutput &output) {
@@ -1087,26 +1087,26 @@ void FFMapper::select_release_sources(const MapperContext ctx,
 }
 
 void FFMapper::create_release_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Release const &release,
     CreateReleaseTemporaryInput const &input,
     CreateReleaseTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::speculate(const MapperContext ctx,
+void FFMapper::speculate(MapperContext const ctx,
                          Release const &release,
                          SpeculativeOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Release const &release,
                                 ReleaseProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Release const &release,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
@@ -1114,21 +1114,21 @@ void FFMapper::select_sharding_functor(const MapperContext ctx,
 }
 
 void FFMapper::select_partition_projection(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Partition const &partition,
     SelectPartitionProjectionInput const &input,
     SelectPartitionProjectionOutput &output) {
   assert(false);
 }
 
-void FFMapper::map_partition(const MapperContext ctx,
+void FFMapper::map_partition(MapperContext const ctx,
                              Partition const &partition,
                              MapPartitionInput const &input,
                              MapPartitionOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_partition_sources(const MapperContext ctx,
+void FFMapper::select_partition_sources(MapperContext const ctx,
                                         Partition const &partition,
                                         SelectPartitionSrcInput const &input,
                                         SelectPartitionSrcOutput &output) {
@@ -1136,34 +1136,34 @@ void FFMapper::select_partition_sources(const MapperContext ctx,
 }
 
 void FFMapper::create_partition_temporary_instance(
-    const MapperContext ctx,
+    MapperContext const ctx,
     Partition const &partition,
     CreatePartitionTemporaryInput const &input,
     CreatePartitionTemporaryOutput &output) {
   assert(false);
 }
 
-void FFMapper::report_profiling(const MapperContext ctx,
+void FFMapper::report_profiling(MapperContext const ctx,
                                 Partition const &partition,
                                 PartitionProfilingInfo const &input) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Partition const &partition,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        Fill const &fill,
                                        SelectShardingFunctorInput const &input,
                                        SelectShardingFunctorOutput &output) {
   assert(false);
 }
 
-void FFMapper::configure_context(const MapperContext ctx,
+void FFMapper::configure_context(MapperContext const ctx,
                                  Task const &task,
                                  ContextConfigOutput &output) {
   // Increase max_window_size to allow Legion tracing to capture larger traces
@@ -1171,21 +1171,21 @@ void FFMapper::configure_context(const MapperContext ctx,
   // Use the default values and do nothing else
 }
 
-void FFMapper::select_tunable_value(const MapperContext ctx,
+void FFMapper::select_tunable_value(MapperContext const ctx,
                                     Task const &task,
                                     SelectTunableInput const &input,
                                     SelectTunableOutput &output) {
   assert(false);
 }
 
-void FFMapper::select_sharding_functor(const MapperContext ctx,
+void FFMapper::select_sharding_functor(MapperContext const ctx,
                                        MustEpoch const &epoch,
                                        SelectShardingFunctorInput const &input,
                                        MustEpochShardingFunctorOutput &output) {
   assert(false);
 }
 
-void FFMapper::map_must_epoch(const MapperContext ctx,
+void FFMapper::map_must_epoch(MapperContext const ctx,
                               MapMustEpochInput const &input,
                               MapMustEpochOutput &output) {
   // Directly assign each task to its target_proc
@@ -1196,13 +1196,13 @@ void FFMapper::map_must_epoch(const MapperContext ctx,
   assert(input.constraints.size() == 0);
 }
 
-void FFMapper::map_dataflow_graph(const MapperContext ctx,
+void FFMapper::map_dataflow_graph(MapperContext const ctx,
                                   MapDataflowGraphInput const &input,
                                   MapDataflowGraphOutput &output) {
   assert(false);
 }
 
-void FFMapper::memoize_operation(const MapperContext ctx,
+void FFMapper::memoize_operation(MapperContext const ctx,
                                  Mappable const &mappable,
                                  MemoizeInput const &input,
                                  MemoizeOutput &output) {
@@ -1216,7 +1216,7 @@ void FFMapper::memoize_operation(const MapperContext ctx,
 }
 
 // Mapping control and stealing
-void FFMapper::select_tasks_to_map(const MapperContext ctx,
+void FFMapper::select_tasks_to_map(MapperContext const ctx,
                                    SelectMappingInput const &input,
                                    SelectMappingOutput &output) {
   // Just map all the ready tasks
@@ -1227,13 +1227,13 @@ void FFMapper::select_tasks_to_map(const MapperContext ctx,
   }
 }
 
-void FFMapper::select_steal_targets(const MapperContext ctx,
+void FFMapper::select_steal_targets(MapperContext const ctx,
                                     SelectStealingInput const &input,
                                     SelectStealingOutput &output) {
   // Nothing to do, no stealing in FFMapper
 }
 
-void FFMapper::permit_steal_request(const MapperContext ctx,
+void FFMapper::permit_steal_request(MapperContext const ctx,
                                     StealRequestInput const &intput,
                                     StealRequestOutput &output) {
   // Nothing to do, no stealing in FFMapper
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 4b8984324..20c5e3f75 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4468,7 +4468,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
   }
   // RequestManager prepare_next_batch_beam
   {
-    TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
+    TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
                                    "RequestManager Prepare Next Batch (Beam)");
     registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 64fc9d4d9..1f0b9bf57 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1002,7 +1002,7 @@ TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
     Runtime *runtime) {
 
   RequestManager *rm = this;
-  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
+  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
                         TaskArgument(&rm, sizeof(RequestManager *)));
   launcher.add_future(old_bc);
   launcher.add_future(result);

From 0e58a73345f47d3c5f645f0f40abe47d48abe2b0 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 11:21:31 -0400
Subject: [PATCH 015/667] Modified function name

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1f0b9bf57..272e5cdfd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -995,7 +995,7 @@ TreeSearchBatchConfig
 }
 
 /***** Beam Search Phase *****/
-TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
+TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_spec(
     TreeSearchBatchConfigFuture const &old_bc,
     SsmInferenceResultFuture const &result,
     Context ctx,
@@ -1009,7 +1009,7 @@ TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_beam(
   return runtime->execute_task(ctx, launcher);
 }
 
-TreeSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
+TreeSearchBatchConfig RequestManager::prepare_next_batch_spec_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,

From 127d26fafc70823b2b05a6e2c7059326b6c56a35 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 11:22:52 -0400
Subject: [PATCH 016/667] Modified API name

---
 include/flexflow/request_manager.h | 4 ++--
 src/runtime/request_manager.cc     | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e6565b67e..6d0787560 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -188,8 +188,8 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void store_beam_metadata(TreeSearchBatchConfig const &old_bc,
-                           SsmInferenceResult const &result);
+  void store_ssm_inference_results(TreeSearchBatchConfig const &old_bc,
+                                   SsmInferenceResult const &result);
   void update_beam_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 272e5cdfd..e088bc0bb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1042,8 +1042,8 @@ TreeSearchBatchConfig
                 << std::endl;
     }
   }
-  // Step 1: Store result to the beam tree struct
-  store_beam_metadata(old_bc, result);
+  // Step 1: Store small model's inference result to the token tree struct
+  store_ssm_inference_results(old_bc, result);
 
   // Step 2: preparing the next batch for existing requests
   TreeSearchBatchConfig new_bc;
@@ -1634,8 +1634,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
   return new_bc;
 }
 
-void RequestManager::store_beam_metadata(TreeSearchBatchConfig const &old_bc,
-                                         SsmInferenceResult const &result) {
+void RequestManager::store_ssm_inference_results(
+    TreeSearchBatchConfig const &old_bc, SsmInferenceResult const &result) {
   // step1 store the outputs
   if (old_bc.num_tokens <= 0) {
     return;

From ae0aee4d0b23a338576e25badf88fd783524b805 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 27 Mar 2024 16:38:49 -0400
Subject: [PATCH 017/667] Some name changes in store_ssm_inference_results

---
 src/runtime/request_manager.cc | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e088bc0bb..7d6032a1e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1655,18 +1655,16 @@ void RequestManager::store_ssm_inference_results(
         old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid !=
             guid) {
 
-      // std::cout << "i is: " << i << "old guid" << guid << " new guid"
-      //           << old_bc.requestsInfo[old_bc.tokensInfo[i].request_index]
-      //                  .request_guid
-      //           << "\n";
-
-      int index = old_bc.tokensInfo[i - 1].request_index;
-      int beam_size = old_bc.beamRequestsInfo[index].beam_size;
+      int request_index = old_bc.tokensInfo[i - 1].request_index;
+      int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
 
-      // int leaf_node_num = old_bc.sub_requests[index];
       int leaf_node_num =
-          old_bc.beamRequestsInfo[index].sub_request_num * beam_size;
-      int depth = old_bc.beamRequestsInfo[index].current_depth;
+          old_bc.tree_requests_info[request_index].num_tokens_at_depth *
+          num_branches;
+      //   int leaf_node_num =
+      //       old_bc.beamRequestsInfo[request_index].sub_request_num *
+      //       num_branches;
+      int depth = old_bc.current_depth;
 
       // Each token yields (beam_width) results
       // int beam_width = old_bc.beamRequestsInfo[index].beam_size;
@@ -1675,18 +1673,19 @@ void RequestManager::store_ssm_inference_results(
       // index
       result_index +=
           (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) *
-          beam_size;
+          num_branches;
 
       if (verbose) {
         std::cout << "i = " << i << ", result index = " << result_index
                   << ", value: " << result.token_ids[result_index]
                   << ", leaf node num: " << leaf_node_num << ", depth" << depth
-                  << ", beam size: " << beam_size << "\n";
+                  << ", beam size: " << num_branches << "\n";
       }
 
-      Request &request = all_requests[old_bc.requestsInfo[index].request_guid];
+      Request &request =
+          all_requests[old_bc.requestsInfo[request_index].request_guid];
 
-      if (old_bc.requestsInfo[index].num_tokens_in_batch == 0) {
+      if (old_bc.requestsInfo[request_index].num_tokens_in_batch == 0) {
         continue;
       }
 

From f246523f44518ae69d651a5f15bd84fb80fcded2 Mon Sep 17 00:00:00 2001
From: Linshuhuai <1346678655@qq.com>
Date: Tue, 2 Apr 2024 14:46:03 -0700
Subject: [PATCH 018/667] setup new function

---
 include/flexflow/request_manager.h | 12 ++++++++++-
 src/runtime/request_manager.cc     | 33 ++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6d0787560..e9a056668 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -228,7 +228,7 @@ class RequestManager {
   TreeSearchBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
   // A wrapper function.
-  TreeVerifyBatchConfigFuture prepare_next_batch_verify(
+  TreeSearchBatchConfigFuture prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
@@ -324,11 +324,21 @@ class RequestManager {
       Legion::Runtime *runtime);
   /* New APIs */
 
+  /* Old APIs for reference */
   static TreeVerifyBatchConfig prepare_next_batch_verify_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
+  /* Old APIs for reference */
+
+  /* New APIs */
+  static TreeSearchBatchConfig prepare_next_batch_verify_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  /* New APIs */
 
 private:
   // configuration parameters
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7d6032a1e..61960b8c0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1339,6 +1339,21 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task(
   return rm->prepare_next_batch_verify(old_batches);
 }
 
+/* New APIs */
+TreeSearchBatchConfig RequestManager::prepare_next_batch_verify_task(
+    Legion::Task const *task,
+    std::vector<Legion::PhysicalRegion> const &regions,
+    Legion::Context ctx,
+    Legion::Runtime *runtime) {
+  RequestManager *rm = *((RequestManager **)task->args);
+  std::vector<TreeSearchBatchConfig> old_batches;
+  for (auto const &bcf : task->futures) {
+    old_batches.push_back(Future(bcf).get_result<TreeSearchBatchConfig>());
+  }
+  return rm->prepare_next_batch_verify(old_batches);
+}
+/* New APIs */
+
 TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
     std::vector<TreeSearchBatchConfig> const &old_batches) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
@@ -1634,6 +1649,24 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
   return new_bc;
 }
 
+/* New APIs */
+TreeSearchBatchConfig RequestManager::prepare_next_batch_verify(
+    std::vector<TreeSearchBatchConfig> const &old_batches) {
+  if (verbose) {
+    std::cout
+        << "\n############### prepare_next_batch_verify ###############\n";
+  }
+
+  assert(old_batches.size() > 0);
+
+  TreeVerifyBatchConfig new_bc;
+  new_bc.num_tokens_to_commit = 0;
+  new_bc.num_tokens = 0;
+
+  return new_bc;
+}
+/* New APIs */
+
 void RequestManager::store_ssm_inference_results(
     TreeSearchBatchConfig const &old_bc, SsmInferenceResult const &result) {
   // step1 store the outputs

From 582c8d722807a0f100d2ddb9e6b377d6f028891d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 3 Apr 2024 15:54:01 -0400
Subject: [PATCH 019/667] Add a constructor for TokenTreeNode; Refactoring
 store_ssm_inference_results

---
 include/flexflow/request_manager.h |  6 ++++-
 src/runtime/request_manager.cc     | 41 +++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6d0787560..796be50b6 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -76,11 +76,15 @@ struct Request {
   std::vector<struct TokenTree> token_trees; // New version
 };
 
-struct TokenTreeNode {
+class TokenTreeNode {
   BatchConfig::TokenId id;
   float joint_prob;
   int parent_pos;
   bool pruned = false;
+
+public:
+  TokenTreeNode(BatchConfig::TokenId id, float joint_prob, int parent_pos)
+      : id(id), joint_prob(joint_prob), parent_pos(parent_pos) {}
 };
 
 struct TreeLayer {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7d6032a1e..0b66749ee 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1640,16 +1640,51 @@ void RequestManager::store_ssm_inference_results(
   if (old_bc.num_tokens <= 0) {
     return;
   }
-  auto guid =
+  int depth = old_bc.current_depth;
+  int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+  int num_old_bc_tokens_processed = 0;
+
+  FlexFlow::RequestManager::RequestGuid guid =
       old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid;
-  auto start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
+  Request &request = all_requests[guid];
+  int request_index = old_bc.tokensInfo[0].request_index;
+  int start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
+
+  //   int result_index_begin = 0;
+  //   int result_index_end =
+  //       old_bc.tree_requests_info[request_index].num_tokens_at_depth *
+  //       num_branches; // Number of leaf tokens of the current request
   int result_index = 0;
 
   if (verbose) {
-    std::cout << "Store total of " << old_bc.num_tokens
+    std::cout << "Store total of " << old_bc.num_tokens * num_branches
               << " tokens in the current batch.\n";
   }
 
+  //   while (num_old_bc_tokens_processed < old_bc.num_tokens) {
+  //     // Process the tokens for the current request
+  //     for (int token_idx = 0;
+  //          token_idx < old_bc.tree_requests_info[request_index];
+  //          token_idx++) {
+  //       for (int token_result_idx = 0; token_result_idx < num_branches;
+  //            token_result_idx++) {
+  //         // Find parent joint probability
+  //         float parent_prob = request.token_trees.at(old_bc.model_id)
+  //                                 .tree_layers[depth - 1]
+  //                                 .nodes[result.parent_id[result_index]]
+  //                                 .joint_prob;
+  //         TokenTreeNode token_tree_node(result.token_ids[result_index],
+  //                                       result.probs[result_index] *
+  //                                       parent_prob,
+  //                                       result.parent_id[result_index]);
+  //         // Try to insert this
+  //         result_index++;
+  //       }
+  //     }
+
+  //     // Update request
+  //   }
+
   for (int i = 0; i <= old_bc.num_tokens; i++) {
     if (i == old_bc.num_tokens ||
         old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid !=

From ee588b26aae4b8e92a0d378c632cb8e907cf7384 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 10:40:55 -0400
Subject: [PATCH 020/667] Made some fields deprecated

---
 include/flexflow/request_manager.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a969febd5..ef477375a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -73,7 +73,8 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  std::vector<struct TokenTree> token_trees; // New version
+  // In the current version, we only use one speculator
+  std::vector<struct TokenTree> speculative_token_trees; // New version
 };
 
 class TokenTreeNode {
@@ -353,6 +354,7 @@ class RequestManager {
   Status request_manager_status;
 
   // tree width in each speculative step, if not specified 1
+  [[deprecated("This field will be removed")]]
   std::vector<int> spec_infer_tree_width; // Old version, delete after refactor
 
   // private fields
@@ -369,6 +371,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
+
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
   std::priority_queue<std::shared_ptr<TokenTreeNode>> token_tree_node_pool;
@@ -381,15 +384,15 @@ class RequestManager {
       committed_tokens;
 
   // Multi-model support
+  [[deprecated("Multiple SSMs is no longer supported")]]
   std::vector<FFModel *> ssm_models;
 
-  // Performance profiling
-  size_t num_processed_requests;
-
   // Background server handler
   Legion::Future background_server_handler;
 
-private:
+  // Performance profiling
+  size_t num_processed_requests;
+
   struct ProfileInfo {
     int llm_decoding_steps;
     int ssm_decoding_steps;

From 1f647016160ec32681ff7194e79bfe51b9c56ae5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 10:53:06 -0400
Subject: [PATCH 021/667] Modified the type of TreeLayer::nodes from
 std::vector to std::list to enable faster removal of elements

---
 include/flexflow/request_manager.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ef477375a..69dc60f54 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -89,7 +89,7 @@ class TokenTreeNode {
 };
 
 struct TreeLayer {
-  std::vector<TokenTreeNode> nodes;
+  std::list<TokenTreeNode> nodes;
 };
 
 class TokenTree {
@@ -357,7 +357,6 @@ class RequestManager {
   [[deprecated("This field will be removed")]]
   std::vector<int> spec_infer_tree_width; // Old version, delete after refactor
 
-  // private fields
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
   ModelType model_type;

From 84eccb037d2a0686ff1f1c80d090c6913c45a9b6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 11:20:00 -0400
Subject: [PATCH 022/667] Add APIs for pruning the last layer of the token
 tree; Add a compartor for TokenTreeNode

---
 include/flexflow/request_manager.h | 22 +++++++++++++++++++++-
 src/runtime/request_manager.cc     | 17 +++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 69dc60f54..e83096396 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -86,6 +86,17 @@ class TokenTreeNode {
 public:
   TokenTreeNode(BatchConfig::TokenId id, float joint_prob, int parent_pos)
       : id(id), joint_prob(joint_prob), parent_pos(parent_pos) {}
+  bool operator>(TokenTreeNode const &other) const {
+    return joint_prob > other.joint_prob;
+  }
+};
+
+// A comparator for shared_ptr<TokenTreeNode>
+struct CompareSharedTokenTreeNodePtr {
+  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
+                  std::shared_ptr<TokenTreeNode> const &rhs) const {
+    return *lhs > *rhs;
+  }
 };
 
 struct TreeLayer {
@@ -373,7 +384,10 @@ class RequestManager {
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
-  std::priority_queue<std::shared_ptr<TokenTreeNode>> token_tree_node_pool;
+  std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                      std::vector<std::shared_ptr<TokenTreeNode>>,
+                      CompareSharedTokenTreeNodePtr>
+      token_tree_node_pool;
 
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,
@@ -399,6 +413,12 @@ class RequestManager {
   };
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
+
+  void add_token_to_speculation_tree(RequestGuid guid,
+                                     BatchConfig::TokenId token_id,
+                                     int parent_pos,
+                                     float joint_prob);
+  void prune_last_layer_of_speculation_tree(RequestGuid guid);
 };
 
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e5feab690..2030efe6a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2715,4 +2715,21 @@ RequestManager *RequestManager::get_request_manager() {
   return request_manager_singleton;
 }
 
+void RequestManager::add_token_to_speculation_tree(
+    RequestGuid guid,
+    BatchConfig::TokenId token_id,
+    int parent_pos,
+    float joint_prob) {}
+
+void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
+  // Assume we only use a single small model for now
+  Request &request = all_requests[guid];
+  TreeLayer &last_layer = request.token_trees[0].tree_layers.back();
+  for (auto it = last_layer.nodes.begin(); it != last_layer.nodes.end(); ++it) {
+    if (it->pruned) {
+      last_layer.nodes.erase(it);
+    }
+  }
+}
+
 }; // namespace FlexFlow

From dd6c95d5d5a497b6957af6e2388543e8416f2a70 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 11:54:47 -0400
Subject: [PATCH 023/667] 1. Changed the type of TokenTreeLayer::nodes from
 std::list<TokenTreeNode> to std::list<shared_ptr<TokenTreeNode>>. 2. Modified
 the name of TreeLayer to TokenTreeLayer. 3. Add the implementation of two
 methods RequestManager::add_token_to_speculation_tree and
 RequestManager::prune_last_layer_of_speculation_tree.

---
 include/flexflow/request_manager.h | 10 +++----
 src/runtime/request_manager.cc     | 45 ++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e83096396..abe859e1d 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -78,12 +78,12 @@ struct Request {
 };
 
 class TokenTreeNode {
+public:
   BatchConfig::TokenId id;
   float joint_prob;
   int parent_pos;
   bool pruned = false;
 
-public:
   TokenTreeNode(BatchConfig::TokenId id, float joint_prob, int parent_pos)
       : id(id), joint_prob(joint_prob), parent_pos(parent_pos) {}
   bool operator>(TokenTreeNode const &other) const {
@@ -99,12 +99,12 @@ struct CompareSharedTokenTreeNodePtr {
   }
 };
 
-struct TreeLayer {
-  std::list<TokenTreeNode> nodes;
+struct TokenTreeLayer {
+  std::list<shared_ptr<TokenTreeNode>> nodes;
 };
 
-class TokenTree {
-  std::vector<TreeLayer> tree_layers;
+struct TokenTree {
+  std::vector<TokenTreeLayer> tree_layers;
 };
 
 class RequestManager {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2030efe6a..cd625f2c2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2719,14 +2719,53 @@ void RequestManager::add_token_to_speculation_tree(
     RequestGuid guid,
     BatchConfig::TokenId token_id,
     int parent_pos,
-    float joint_prob) {}
+    float joint_prob) {
+  // This function assumes that there are only one small model
+
+  // We maintain the size of the token tree node pool to not exceed
+  // BatchConfig::MAX_NUM_TOKENS
+  if (token_tree_node_pool.size() < BatchConfig::MAX_NUM_TOKENS) {
+    Request &request = all_requests[guid];
+    TokenTreeLayer &last_layer =
+        request.speculative_token_trees[0].tree_layers.back();
+    // Add to the last layer of the speculation tree
+    auto node_ptr =
+        std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
+    last_layer.nodes.push_back(node_ptr);
+    token_tree_node_pool.push(node_ptr);
+    return;
+  }
+
+  // The pool is full, check if the new node has a higher joint probability
+  // than the minimum node in the pool.
+  std::shared_ptr<TokenTreeNode> min_node_in_pool = token_tree_node_pool.top();
+  if (joint_prob < min_node_in_pool->joint_prob) {
+    // Insertion failed
+    return;
+  }
+
+  // Remove the minimum node from the pool, and set its pruned field to true
+  min_node_in_pool->pruned = true;
+  token_tree_node_pool.pop();
+  // Add the new node to the pool and the last layer of the speculation tree
+  auto node_ptr =
+      std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
+  token_tree_node_pool.push(node_ptr);
+  Request &request = all_requests[guid];
+  TokenTreeLayer &last_layer =
+      request.speculative_token_trees[0].tree_layers.back();
+  last_layer.nodes.push_back(node_ptr);
+  return;
+}
 
 void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
   // Assume we only use a single small model for now
+
   Request &request = all_requests[guid];
-  TreeLayer &last_layer = request.token_trees[0].tree_layers.back();
+  TokenTreeLayer &last_layer =
+      request.speculative_token_trees[0].tree_layers.back();
   for (auto it = last_layer.nodes.begin(); it != last_layer.nodes.end(); ++it) {
-    if (it->pruned) {
+    if ((*it)->pruned) {
       last_layer.nodes.erase(it);
     }
   }

From 23c3a2f27b14626c006114081bdcbc8f0cbee649 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 12:32:23 -0400
Subject: [PATCH 024/667] Change std::vector<struct TokenTree>
 speculative_token_trees to TokenTree speculative_token_tree because we only
 support one small model.

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 10 +++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index abe859e1d..a3e0f75da 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -74,7 +74,7 @@ struct Request {
   std::vector<BatchConfig::TokenId> tokens;
 
   // In the current version, we only use one speculator
-  std::vector<struct TokenTree> speculative_token_trees; // New version
+  TokenTree speculative_token_tree; // New version
 };
 
 class TokenTreeNode {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cd625f2c2..a6cfb09ca 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2720,14 +2720,12 @@ void RequestManager::add_token_to_speculation_tree(
     BatchConfig::TokenId token_id,
     int parent_pos,
     float joint_prob) {
-  // This function assumes that there are only one small model
-
   // We maintain the size of the token tree node pool to not exceed
   // BatchConfig::MAX_NUM_TOKENS
   if (token_tree_node_pool.size() < BatchConfig::MAX_NUM_TOKENS) {
     Request &request = all_requests[guid];
     TokenTreeLayer &last_layer =
-        request.speculative_token_trees[0].tree_layers.back();
+        request.speculative_token_tree.tree_layers.back();
     // Add to the last layer of the speculation tree
     auto node_ptr =
         std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
@@ -2753,17 +2751,15 @@ void RequestManager::add_token_to_speculation_tree(
   token_tree_node_pool.push(node_ptr);
   Request &request = all_requests[guid];
   TokenTreeLayer &last_layer =
-      request.speculative_token_trees[0].tree_layers.back();
+      request.speculative_token_tree.tree_layers.back();
   last_layer.nodes.push_back(node_ptr);
   return;
 }
 
 void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
-  // Assume we only use a single small model for now
-
   Request &request = all_requests[guid];
   TokenTreeLayer &last_layer =
-      request.speculative_token_trees[0].tree_layers.back();
+      request.speculative_token_tree.tree_layers.back();
   for (auto it = last_layer.nodes.begin(); it != last_layer.nodes.end(); ++it) {
     if ((*it)->pruned) {
       last_layer.nodes.erase(it);

From 3b8c2e169b39409af8f9575ed9a27683304e99fe Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 12:37:53 -0400
Subject: [PATCH 025/667] Modified the field std::vector<FFModel *> ssm_models
 to FFModel *ssm_model. Also changed the implementation of FFModel
 *get_ssm_model().

---
 include/flexflow/request_manager.h | 4 +++-
 src/runtime/request_manager.cc     | 5 ++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a3e0f75da..a11b2d125 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -151,7 +151,7 @@ class RequestManager {
                      int initLength,
                      int non_tree_size);
 
-  FFModel *get_ssm_model(int model_id);
+  FFModel *get_ssm_model();
 
   void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
@@ -399,6 +399,8 @@ class RequestManager {
   // Multi-model support
   [[deprecated("Multiple SSMs is no longer supported")]]
   std::vector<FFModel *> ssm_models;
+  // New version
+  FFModel *ssm_model;
 
   // Background server handler
   Legion::Future background_server_handler;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a6cfb09ca..420e71634 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -172,9 +172,8 @@ int RequestManager::register_ssm_model(FFModel *model) {
   return model_id;
 }
 
-FFModel *RequestManager::get_ssm_model(int model_id) {
-  assert(model_id < ssm_models.size());
-  return ssm_models[model_id];
+FFModel *RequestManager::get_ssm_model() {
+  return ssm_model;
 }
 
 size_t RequestManager::get_num_ssms() {

From c470332dc215671ca8a408d65955b071f3feb066 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 20:39:19 -0400
Subject: [PATCH 026/667] Recover support for multiple small models.

---
 include/flexflow/request_manager.h |  8 +++-----
 src/runtime/request_manager.cc     | 14 +++++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a11b2d125..12b9c36ad 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -74,7 +74,8 @@ struct Request {
   std::vector<BatchConfig::TokenId> tokens;
 
   // In the current version, we only use one speculator
-  TokenTree speculative_token_tree; // New version
+  //   TokenTree speculative_token_tree;
+  std::vector<TokenTree> speculative_token_trees;
 };
 
 class TokenTreeNode {
@@ -151,7 +152,7 @@ class RequestManager {
                      int initLength,
                      int non_tree_size);
 
-  FFModel *get_ssm_model();
+  FFModel *get_ssm_model(int model_id);
 
   void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
@@ -397,10 +398,7 @@ class RequestManager {
       committed_tokens;
 
   // Multi-model support
-  [[deprecated("Multiple SSMs is no longer supported")]]
   std::vector<FFModel *> ssm_models;
-  // New version
-  FFModel *ssm_model;
 
   // Background server handler
   Legion::Future background_server_handler;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 420e71634..27a788406 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -172,8 +172,9 @@ int RequestManager::register_ssm_model(FFModel *model) {
   return model_id;
 }
 
-FFModel *RequestManager::get_ssm_model() {
-  return ssm_model;
+FFModel *RequestManager::get_ssm_model(int model_id) {
+  assert(model_id >= 0 && model_id < ssm_models.size());
+  return ssm_models[model_id];
 }
 
 size_t RequestManager::get_num_ssms() {
@@ -2719,12 +2720,14 @@ void RequestManager::add_token_to_speculation_tree(
     BatchConfig::TokenId token_id,
     int parent_pos,
     float joint_prob) {
+  // This method assumes only one small model is used for speculation
+
   // We maintain the size of the token tree node pool to not exceed
   // BatchConfig::MAX_NUM_TOKENS
   if (token_tree_node_pool.size() < BatchConfig::MAX_NUM_TOKENS) {
     Request &request = all_requests[guid];
     TokenTreeLayer &last_layer =
-        request.speculative_token_tree.tree_layers.back();
+        request.speculative_token_trees[0].tree_layers.back();
     // Add to the last layer of the speculation tree
     auto node_ptr =
         std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
@@ -2750,15 +2753,16 @@ void RequestManager::add_token_to_speculation_tree(
   token_tree_node_pool.push(node_ptr);
   Request &request = all_requests[guid];
   TokenTreeLayer &last_layer =
-      request.speculative_token_tree.tree_layers.back();
+      request.speculative_token_trees[0].tree_layers.back();
   last_layer.nodes.push_back(node_ptr);
   return;
 }
 
 void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
+  // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
   TokenTreeLayer &last_layer =
-      request.speculative_token_tree.tree_layers.back();
+      request.speculative_token_trees[0].tree_layers.back();
   for (auto it = last_layer.nodes.begin(); it != last_layer.nodes.end(); ++it) {
     if ((*it)->pruned) {
       last_layer.nodes.erase(it);

From df13914efaa03f89429965b10a8fec4c4bba5e2a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 21:25:17 -0400
Subject: [PATCH 027/667] Simplified the structure of TokenTree

---
 include/flexflow/request_manager.h | 15 +++---
 src/runtime/request_manager.cc     | 77 ++++++++++++++++--------------
 2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 12b9c36ad..1403473df 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -100,12 +100,12 @@ struct CompareSharedTokenTreeNodePtr {
   }
 };
 
-struct TokenTreeLayer {
-  std::list<shared_ptr<TokenTreeNode>> nodes;
-};
-
-struct TokenTree {
-  std::vector<TokenTreeLayer> tree_layers;
+class TokenTree {
+public:
+  std::vector<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  void add_layer() {
+    tree_layers.emplace_back();
+  }
 };
 
 class RequestManager {
@@ -417,7 +417,8 @@ class RequestManager {
   void add_token_to_speculation_tree(RequestGuid guid,
                                      BatchConfig::TokenId token_id,
                                      int parent_pos,
-                                     float joint_prob);
+                                     float joint_prob,
+                                     int depth);
   void prune_last_layer_of_speculation_tree(RequestGuid guid);
 };
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 27a788406..81fc2521e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1673,6 +1673,7 @@ void RequestManager::store_ssm_inference_results(
   if (old_bc.num_tokens <= 0) {
     return;
   }
+
   int depth = old_bc.current_depth;
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int num_old_bc_tokens_processed = 0;
@@ -1694,29 +1695,30 @@ void RequestManager::store_ssm_inference_results(
               << " tokens in the current batch.\n";
   }
 
-  //   while (num_old_bc_tokens_processed < old_bc.num_tokens) {
-  //     // Process the tokens for the current request
-  //     for (int token_idx = 0;
-  //          token_idx < old_bc.tree_requests_info[request_index];
-  //          token_idx++) {
-  //       for (int token_result_idx = 0; token_result_idx < num_branches;
-  //            token_result_idx++) {
-  //         // Find parent joint probability
-  //         float parent_prob = request.token_trees.at(old_bc.model_id)
-  //                                 .tree_layers[depth - 1]
-  //                                 .nodes[result.parent_id[result_index]]
-  //                                 .joint_prob;
-  //         TokenTreeNode token_tree_node(result.token_ids[result_index],
-  //                                       result.probs[result_index] *
-  //                                       parent_prob,
-  //                                       result.parent_id[result_index]);
-  //         // Try to insert this
-  //         result_index++;
-  //       }
-  //     }
-
-  //     // Update request
-  //   }
+  while (num_old_bc_tokens_processed < old_bc.num_tokens) {
+    // Process the tokens for the current request
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    for (auto parent_it = token_tree.tree_layers[depth - 1].nodes.begin();
+         parent_it != token_tree.tree_layers[depth - 1].nodes.end();
+         parent_it++) {
+      if ((*parent_it)->pruned) {
+        continue;
+      } else {
+        for (int child_idx = 0; child_idx < num_branches; child_idx++) {
+          // Find parent joint probability
+          float parent_prob = (*parent_it)->joint_prob;
+          TokenTreeNode token_tree_node(result.token_ids[result_index],
+                                        result.probs[result_index] *
+                                            parent_prob,
+                                        result.parent_id[result_index]);
+          // Try to insert this
+          result_index++;
+        }
+      }
+    }
+
+    // Update request
+  }
 
   for (int i = 0; i <= old_bc.num_tokens; i++) {
     if (i == old_bc.num_tokens ||
@@ -2719,19 +2721,25 @@ void RequestManager::add_token_to_speculation_tree(
     RequestGuid guid,
     BatchConfig::TokenId token_id,
     int parent_pos,
-    float joint_prob) {
+    float joint_prob,
+    int depth // depth starts from 0
+) {
   // This method assumes only one small model is used for speculation
 
+  // First make sure there are enough layers in the speculation tree
+  Request &request = all_requests[guid];
+  TokenTree &speculative_token_tree = request.speculative_token_trees[0];
+  while (speculative_token_tree.tree_layers.size() <= depth) {
+    speculative_token_tree.add_layer();
+  }
+
   // We maintain the size of the token tree node pool to not exceed
   // BatchConfig::MAX_NUM_TOKENS
   if (token_tree_node_pool.size() < BatchConfig::MAX_NUM_TOKENS) {
-    Request &request = all_requests[guid];
-    TokenTreeLayer &last_layer =
-        request.speculative_token_trees[0].tree_layers.back();
     // Add to the last layer of the speculation tree
     auto node_ptr =
         std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
-    last_layer.nodes.push_back(node_ptr);
+    request.speculative_token_trees[0].tree_layers[depth].push_back(node_ptr);
     token_tree_node_pool.push(node_ptr);
     return;
   }
@@ -2751,23 +2759,18 @@ void RequestManager::add_token_to_speculation_tree(
   auto node_ptr =
       std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
   token_tree_node_pool.push(node_ptr);
-  Request &request = all_requests[guid];
-  TokenTreeLayer &last_layer =
-      request.speculative_token_trees[0].tree_layers.back();
-  last_layer.nodes.push_back(node_ptr);
+  request.speculative_token_trees[0].tree_layers[depth].push_back(node_ptr);
   return;
 }
 
 void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
   // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
-  TokenTreeLayer &last_layer =
-      request.speculative_token_trees[0].tree_layers.back();
-  for (auto it = last_layer.nodes.begin(); it != last_layer.nodes.end(); ++it) {
+  auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
+  for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
     if ((*it)->pruned) {
-      last_layer.nodes.erase(it);
+      last_layer.erase(it);
     }
   }
 }
-
 }; // namespace FlexFlow

From 58941cc9502f09697679acadeae322b662383c64 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 21:59:31 -0400
Subject: [PATCH 028/667] 1. Implement store_ssm_inference_results to maintain
 the token tree. 2. Decouple the initialization of the token tree with the
 maintainence of the token tree, resulting in a new function void
 RequestManager::initialize_root_of_spec_token_trees().

---
 include/flexflow/request_manager.h |  13 +--
 src/runtime/request_manager.cc     | 156 +++++++++--------------------
 2 files changed, 53 insertions(+), 116 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1403473df..6f28aa54e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -414,12 +414,13 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
 
-  void add_token_to_speculation_tree(RequestGuid guid,
-                                     BatchConfig::TokenId token_id,
-                                     int parent_pos,
-                                     float joint_prob,
-                                     int depth);
-  void prune_last_layer_of_speculation_tree(RequestGuid guid);
+  void RequestManager::initialize_root_of_spec_token_trees();
+  void add_token_to_spec_token_tree(RequestGuid guid,
+                                    BatchConfig::TokenId token_id,
+                                    int parent_pos,
+                                    float joint_prob,
+                                    int depth);
+  void prune_last_layer_of_spec_token_tree(RequestGuid guid);
 };
 
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 81fc2521e..971d6a152 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1666,28 +1666,41 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_verify(
   return new_bc;
 }
 /* New APIs */
+void RequestManager::initialize_root_of_spec_token_trees() {
+  // This method assumes only one small model is used for speculation
+
+  // TODO: Do we need to iterate over all requests?
+  for (auto &request_pair : all_requests) {
+    Request &request = request_pair.second;
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    token_tree.tree_layers.clear();
+    token_tree.add_layer();
+    token_tree.tree_layers[0].emplace_back(
+        // TODO: Make sure every request has at least one token,
+        // otherwise, we need to handle this case
+        std::make_shared<TokenTreeNode>(request.tokens.back(), 0, 1.0));
+  }
+}
 
 void RequestManager::store_ssm_inference_results(
     TreeSearchBatchConfig const &old_bc, SsmInferenceResult const &result) {
-  // step1 store the outputs
   if (old_bc.num_tokens <= 0) {
     return;
   }
 
+  // Depth starts from 1, because the root is at layer 0
   int depth = old_bc.current_depth;
+  assert(depth > 0);
+
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int num_old_bc_tokens_processed = 0;
 
+  int request_index = old_bc.tokensInfo[0].request_index;
   FlexFlow::RequestManager::RequestGuid guid =
-      old_bc.requestsInfo[old_bc.tokensInfo[0].request_index].request_guid;
+      old_bc.requestsInfo[request_index].request_guid;
   Request &request = all_requests[guid];
-  int request_index = old_bc.tokensInfo[0].request_index;
   int start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
 
-  //   int result_index_begin = 0;
-  //   int result_index_end =
-  //       old_bc.tree_requests_info[request_index].num_tokens_at_depth *
-  //       num_branches; // Number of leaf tokens of the current request
   int result_index = 0;
 
   if (verbose) {
@@ -1697,118 +1710,41 @@ void RequestManager::store_ssm_inference_results(
 
   while (num_old_bc_tokens_processed < old_bc.num_tokens) {
     // Process the tokens for the current request
+    int num_parent_tokens_processed_in_request = 0;
     TokenTree &token_tree = request.speculative_token_trees[0];
-    for (auto parent_it = token_tree.tree_layers[depth - 1].nodes.begin();
-         parent_it != token_tree.tree_layers[depth - 1].nodes.end();
+    std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
+        token_tree.tree_layers[depth - 1];
+    for (auto parent_it = tree_layer.begin(); parent_it != tree_layer.end();
          parent_it++) {
       if ((*parent_it)->pruned) {
-        continue;
+        // Parent token is pruned, we have to skip all its children
+        // Because no token is pruned in the last layer during the small
+        // model inference, the reason why some parents are pruned is that
+        // adding tokens to the new layer of the tree may result in some
+        // node being pruned in internal layers.
+        result_index += num_branches;
       } else {
+        // Parent token is not pruned
         for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-          // Find parent joint probability
           float parent_prob = (*parent_it)->joint_prob;
-          TokenTreeNode token_tree_node(result.token_ids[result_index],
-                                        result.probs[result_index] *
-                                            parent_prob,
-                                        result.parent_id[result_index]);
-          // Try to insert this
+          add_token_to_spec_token_tree(guid,
+                                       result.token_ids[result_index],
+                                       result.probs[result_index] * parent_prob,
+                                       result.parent_id[result_index],
+                                       depth);
           result_index++;
         }
       }
+      num_old_bc_tokens_processed++;
     }
 
-    // Update request
-  }
-
-  for (int i = 0; i <= old_bc.num_tokens; i++) {
-    if (i == old_bc.num_tokens ||
-        old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid !=
-            guid) {
-
-      int request_index = old_bc.tokensInfo[i - 1].request_index;
-      int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-
-      int leaf_node_num =
-          old_bc.tree_requests_info[request_index].num_tokens_at_depth *
-          num_branches;
-      //   int leaf_node_num =
-      //       old_bc.beamRequestsInfo[request_index].sub_request_num *
-      //       num_branches;
-      int depth = old_bc.current_depth;
-
-      // Each token yields (beam_width) results
-      // int beam_width = old_bc.beamRequestsInfo[index].beam_size;
-
-      // Count tokens sent to model in this request to find the final token's
-      // index
-      result_index +=
-          (old_bc.tokensInfo[i - 1].abs_depth_in_request - start_depth) *
-          num_branches;
-
-      if (verbose) {
-        std::cout << "i = " << i << ", result index = " << result_index
-                  << ", value: " << result.token_ids[result_index]
-                  << ", leaf node num: " << leaf_node_num << ", depth" << depth
-                  << ", beam size: " << num_branches << "\n";
-      }
-
-      Request &request =
-          all_requests[old_bc.requestsInfo[request_index].request_guid];
-
-      if (old_bc.requestsInfo[request_index].num_tokens_in_batch == 0) {
-        continue;
-      }
-
-      if (depth == 1) {
-        // store the last input into the tree;
-        if (verbose) {
-          std::cout << "try to store the input" << "\n";
-        }
-
-        request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] =
-            request.tokens.back();
-        request.beam_trees.at(old_bc.model_id).treeLayers[0].probs[0] = 1;
-        request.beam_trees.at(old_bc.model_id).treeLayers[0].parent_ids[0] = -1;
-        request.beam_trees.at(old_bc.model_id)
-            .treeLayers[0]
-            .nodes_num_this_layer = 1;
-
-        if (verbose) {
-          std::cout << "Store the previous last token to the tree root: "
-                    << request.tokens.back() << "\n";
-        }
-      }
-      request.beam_trees.at(old_bc.model_id)
-          .treeLayers[depth]
-          .nodes_num_this_layer = leaf_node_num;
-      for (int beam_id = 0; beam_id < leaf_node_num; beam_id++) {
-
-        request.beam_trees.at(old_bc.model_id)
-            .treeLayers[depth]
-            .tokens[beam_id] = result.token_ids[result_index];
-        request.beam_trees.at(old_bc.model_id)
-            .treeLayers[depth]
-            .probs[beam_id] = result.probs[result_index];
-        request.beam_trees.at(old_bc.model_id)
-            .treeLayers[depth]
-            .parent_ids[beam_id] = result.parent_id[result_index];
-
-        if (verbose) {
-          std::cout << "tree value: " << depth << "token: "
-                    << request.beam_trees.at(old_bc.model_id)
-                           .treeLayers[depth]
-                           .tokens[beam_id]
-                    << "result tokens: " << result.token_ids[result_index];
-        }
-        result_index += 1;
-      }
-      // update the guid and start_depth for current request
-      if (i < old_bc.num_tokens) {
-        int new_req_idx = old_bc.tokensInfo[i].request_index;
-        guid = old_bc.requestsInfo[new_req_idx].request_guid;
-        start_depth = old_bc.tokensInfo[i].abs_depth_in_request;
-      }
-    }
+    // Switch to the next request
+    int request_index =
+        old_bc.tokensInfo[num_old_bc_tokens_processed].request_index;
+    FlexFlow::RequestManager::RequestGuid guid =
+        old_bc.requestsInfo[request_index].request_guid;
+    Request &request = all_requests[guid];
+    int start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
   }
 }
 
@@ -2717,7 +2653,7 @@ RequestManager *RequestManager::get_request_manager() {
   return request_manager_singleton;
 }
 
-void RequestManager::add_token_to_speculation_tree(
+void RequestManager::add_token_to_spec_token_tree(
     RequestGuid guid,
     BatchConfig::TokenId token_id,
     int parent_pos,
@@ -2763,7 +2699,7 @@ void RequestManager::add_token_to_speculation_tree(
   return;
 }
 
-void RequestManager::prune_last_layer_of_speculation_tree(RequestGuid guid) {
+void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
   auto &last_layer = request.speculative_token_trees[0].tree_layers.back();

From fba3c61ec8b43b700e0c9bf1fa0684dd7e556f49 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 22:03:48 -0400
Subject: [PATCH 029/667] Remove serve_spec_infer_v2 because we will conduct
 inplace changes to the old version.

---
 include/flexflow/request_manager.h |  1 -
 src/runtime/request_manager.cc     | 93 ------------------------------
 2 files changed, 94 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6f28aa54e..27e536158 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -156,7 +156,6 @@ class RequestManager {
 
   void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
-  void serve_spec_infer_v2(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
   RequestGuid register_new_request(std::string const &prompt,
                                    int max_sequence_length);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 971d6a152..cec832b66 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2522,99 +2522,6 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   }
 }
 
-/*static*/
-void RequestManager::serve_spec_infer_v2(FFModel *llm) {
-  Context ctx = llm->config.lg_ctx;
-  Runtime *runtime = llm->config.lg_hlr;
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  {
-    // Compile the llm
-    im->compile_model_and_allocate_buffer(llm);
-    assert(im->model_weights_loaders.find(llm) !=
-           im->model_weights_loaders.end());
-    // Load model weights
-    im->model_weights_loaders[llm]->load_weights(llm);
-    // init operators
-    im->init_operators_inference(llm);
-  }
-  for (size_t i = 0; i < get_num_ssms(); i++) {
-    // Compile the i-th ssm
-    FFModel *ssm = get_ssm_model(i);
-    im->compile_model_and_allocate_buffer(ssm);
-    assert(im->model_weights_loaders.find(ssm) !=
-           im->model_weights_loaders.end());
-    // Load model weights
-    im->model_weights_loaders[ssm]->load_weights(ssm);
-    // init operators
-    im->init_operators_inference(ssm);
-  }
-
-  std::queue<std::pair<TreeVerifyBatchConfigFuture, InferenceResultFuture>>
-      batch_pipeline;
-  // Legion futures for inc_decoding and spec_infer
-  TreeVerifyBatchConfigFuture last_tree_bcf;
-  InferenceResultFuture last_tree_irf;
-  {
-    // Initialize futures for spec infer
-    TreeVerifyBatchConfig tree_bc;
-    InferenceResult tree_ir;
-    last_tree_bcf = Future::from_value<TreeVerifyBatchConfig>(tree_bc);
-    last_tree_irf = Future::from_value<InferenceResult>(tree_ir);
-  }
-  batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf));
-
-  while (!is_background_server_terminated()) {
-
-    if (batch_pipeline.size() >= 4) {
-      // Block here to avoid launching too many batches
-      auto const &batch = batch_pipeline.front();
-      batch.second.get_void_result();
-    }
-    // deque finished batches
-    while (batch_pipeline.size() > 1) {
-      auto const &batch = batch_pipeline.front();
-      if (batch.second.is_ready()) {
-        batch_pipeline.pop();
-      } else {
-        break;
-      }
-    }
-    auto const &next_batch = batch_pipeline.back();
-    TreeSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
-        next_batch.first, next_batch.second, 0, ctx, runtime);
-    std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
-    for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {
-      beam_bcf_vec[ssm_id] = beam_bcf;
-    }
-    runtime->begin_trace(ctx, 12345 /*trace_id*/);
-
-    for (size_t i = 0; i < get_num_ssms(); i++) {
-      for (int depth = 0; depth < TreeSearchBatchConfig::MAX_BEAM_DEPTH;
-           depth++) {
-        beam_bcf = beam_bcf_vec[i];
-
-        FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]);
-        assert(fm.get_future_map_domain().get_volume() == 1);
-        SsmInferenceResultFuture beam_irf = fm.get_future(0);
-        beam_bcf_vec[i] =
-            prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime);
-      }
-    }
-    // Token Tree Verification
-    {
-      TreeVerifyBatchConfigFuture tree_bcf =
-          prepare_next_batch_verify(beam_bcf_vec, ctx, runtime);
-      FutureMap fm = im->inference(llm, 0, tree_bcf);
-      assert(fm.get_future_map_domain().get_volume() == 1);
-      InferenceResultFuture tree_irf = fm.get_future(0);
-      batch_pipeline.push(std::make_pair(tree_bcf, tree_irf));
-      last_tree_bcf = tree_bcf;
-      last_tree_irf = tree_irf;
-    }
-    runtime->end_trace(ctx, 12345 /*trace_id*/);
-  }
-}
-
 void RequestManager::trigger_request_completion_future(
     RequestGuid const &guid) {
   std::lock_guard<std::mutex> const lock(request_to_promise_mutex);

From 1491b6a84ea78b0648007353675e37888d1861cf Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 4 Apr 2024 23:49:57 -0400
Subject: [PATCH 030/667] Mark update_beam_metadata as deprecated

---
 src/runtime/request_manager.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cec832b66..1fa621fda 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1042,13 +1042,15 @@ TreeSearchBatchConfig
                 << std::endl;
     }
   }
-  // Step 1: Store small model's inference result to the token tree struct
+
+  // Store small model's inference result to the token tree struct
   store_ssm_inference_results(old_bc, result);
 
-  // Step 2: preparing the next batch for existing requests
+  // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
   new_bc.model_id = old_bc.model_id;
-  // std::cout << "old_bc.model_id: " << old_bc.model_id << "\n";
+  // We only support single small model
+  assert(old_bc.num_tokens > 0);
   int num_generation_tokens = 0;
 
   // Add incremental tokens to the batch
@@ -1749,6 +1751,7 @@ void RequestManager::store_ssm_inference_results(
 }
 
 // for updating the beam search metadata in requests in incremental phase
+[[deprecated("I don't think this function is used anymore")]]
 void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
                                           TreeSearchBatchConfig const &old_bc,
                                           BeamTree &tree,

From 974780d648010c7c5989e48b206de93af3b49724 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 5 Apr 2024 00:05:07 -0400
Subject: [PATCH 031/667] Rename appendBitMask to append_bitmask and
 updateBitMask to update_bitmask to align with other APIs

---
 include/flexflow/request_manager.h | 18 +++++++++---------
 src/runtime/request_manager.cc     | 24 ++++++++++++------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 27e536158..9249ff297 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -142,15 +142,15 @@ class RequestManager {
   void register_output_filepath(std::string const &);
   void initBitMask(BatchConfig::BitMask &bitmask, int initLength);
   void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
-  void appendBitMask(BatchConfig::BitMask &bitmask,
-                     int newNodes,
-                     int preBeamSize,
-                     int old_sub_num,
-                     BeamTree const tree,
-                     int currentDepth);
-  void updateBitMask(BatchConfig::BitMask &bitmask,
-                     int initLength,
-                     int non_tree_size);
+  void append_bitmask(BatchConfig::BitMask &bitmask,
+                      int newNodes,
+                      int preBeamSize,
+                      int old_sub_num,
+                      BeamTree const tree,
+                      int currentDepth);
+  void update_bitmask(BatchConfig::BitMask &bitmask,
+                      int initLength,
+                      int non_tree_size);
 
   FFModel *get_ssm_model(int model_id);
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1fa621fda..e45cfa5d9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -793,9 +793,9 @@ TreeSearchBatchConfig
 
         new_bc.sub_requests[i] = 1;
 
-        updateBitMask(new_bc.causalMask[i],
-                      verified_tokens.size(),
-                      request.tokens.size());
+        update_bitmask(new_bc.causalMask[i],
+                       verified_tokens.size(),
+                       request.tokens.size());
 
         // Token Info
         for (int j = 0; j < verified_tokens.size(); j++) {
@@ -1154,12 +1154,12 @@ TreeSearchBatchConfig
              &old_bc.causalMask[i],
              sizeof(BatchConfig::BitMask));
       BeamTree tree = request.beam_trees[old_bc.model_id];
-      appendBitMask(new_bc.causalMask[i],
-                    new_bc.beamRequestsInfo[i].sub_request_num,
-                    old_bc.beamRequestsInfo[i].beam_size,
-                    old_bc.beamRequestsInfo[i].sub_request_num,
-                    tree,
-                    old_bc.beamRequestsInfo[i].current_depth);
+      append_bitmask(new_bc.causalMask[i],
+                     new_bc.beamRequestsInfo[i].sub_request_num,
+                     old_bc.beamRequestsInfo[i].beam_size,
+                     old_bc.beamRequestsInfo[i].sub_request_num,
+                     tree,
+                     old_bc.beamRequestsInfo[i].current_depth);
       for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
         int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
         for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) {
@@ -1823,9 +1823,9 @@ void RequestManager::initBitMask(BatchConfig::BitMask &bitmask,
 }
 
 // prepare next init
-void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask,
-                                   int initLength,
-                                   int non_tree_size) {
+void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
+                                    int initLength,
+                                    int non_tree_size) {
   // assert(initLength == 1);
   // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
   // 0000000..1000

From 2f30b222c1dae8e93e1184628ad04b476e09797f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 6 Apr 2024 11:26:33 -0400
Subject: [PATCH 032/667] 1. Changed the name of initBitMask to init_bitmask.
 2. Removed some unused parameters in append_bitmask.

---
 include/flexflow/request_manager.h |  4 +--
 src/runtime/request_manager.cc     | 56 ++++++++++++++----------------
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 9249ff297..fa4d5172f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -140,12 +140,10 @@ class RequestManager {
                           int eos_token_id,
                           std::string const &path);
   void register_output_filepath(std::string const &);
-  void initBitMask(BatchConfig::BitMask &bitmask, int initLength);
   void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
+  void init_bitmask(BatchConfig::BitMask &bitmask, int initLength);
   void append_bitmask(BatchConfig::BitMask &bitmask,
                       int newNodes,
-                      int preBeamSize,
-                      int old_sub_num,
                       BeamTree const tree,
                       int currentDepth);
   void update_bitmask(BatchConfig::BitMask &bitmask,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e45cfa5d9..0e72a2943 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -948,8 +948,8 @@ TreeSearchBatchConfig
           new_bc.num_tokens++;
         }
 
-        initBitMask(new_bc.causalMask[i],
-                    new_bc.requestsInfo[i].num_tokens_in_batch);
+        init_bitmask(new_bc.causalMask[i],
+                     new_bc.requestsInfo[i].num_tokens_in_batch);
 
         // if (new_bc.requestsInfo[i].num_tokens_in_batch <
         // new_request.initial_len) {
@@ -1806,8 +1806,8 @@ void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
 // bit mask related function
 
 // prompt phase, init task
-void RequestManager::initBitMask(BatchConfig::BitMask &bitmask,
-                                 int initLength) {
+void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
+                                  int initLength) {
   assert(initLength > 0);
   // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
   // 0000000..1000
@@ -1852,32 +1852,11 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
   //           << "\n";
 }
 
-// prompt phase, init task
-void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
-                                          int initLength) {
-  assert(initLength > 0);
-  // std::cout << "append pending bit mask: " << initLength << "\n";
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
-  // 0000000..1000
-  bitmask.non_tree_cache_size = 0;
-  bitmask.tree_size = 1;
-  bitmask.prompt_size += initLength;
-  bitmask.this_layer_size = initLength;
-
-  // for (int i = 0; i < bitmask.prompt_size; i++) {
-  //   for (int j = i; j < bitmask.prompt_size; j++) {
-  //     bitmask.mask[i] |= (1 << j);
-  //   }
-  // }
-}
-
 // prepare next beam, append layers to the tree
-void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask,
-                                   int newNodes,
-                                   int preBeamSize,
-                                   int old_sub_num,
-                                   BeamTree const tree,
-                                   int currentDepth) {
+void RequestManager::append_bitmask(BatchConfig::BitMask &bitmask,
+                                    int newNodes,
+                                    BeamTree const tree,
+                                    int currentDepth) {
   int pre_tree_size = bitmask.tree_size;
   bitmask.tree_size += newNodes;
   bitmask.this_layer_size = newNodes;
@@ -1943,6 +1922,25 @@ void RequestManager::appendBitMask(BatchConfig::BitMask &bitmask,
   //           << "\n";
 }
 
+// prompt phase, init task
+void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
+                                          int initLength) {
+  assert(initLength > 0);
+  // std::cout << "append pending bit mask: " << initLength << "\n";
+  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
+  // 0000000..1000
+  bitmask.non_tree_cache_size = 0;
+  bitmask.tree_size = 1;
+  bitmask.prompt_size += initLength;
+  bitmask.this_layer_size = initLength;
+
+  // for (int i = 0; i < bitmask.prompt_size; i++) {
+  //   for (int j = i; j < bitmask.prompt_size; j++) {
+  //     bitmask.mask[i] |= (1 << j);
+  //   }
+  // }
+}
+
 bool PreOrder(
     BeamTree const &tree,
     int max_depth,

From 5acafe9e9c1ce5ec75bb4209ec146f587c0bb56d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 6 Apr 2024 21:33:47 -0400
Subject: [PATCH 033/667] 1. Add a BitSet class to replace the bit mask based
 on long integer. 2. Renamed BitMask.mask to BitMask.bit_mask. 3. Renamed
 BitMask.this_layer_size to BitMask.current_layer_size. 4. Add another version
 of the implementation of append_bitmask, which is inplace.

---
 include/flexflow/batch_config.h              |  42 ++++++--
 include/flexflow/request_manager.h           |   1 +
 src/ops/spec_inc_multihead_self_attention.cu |   5 +-
 src/runtime/request_manager.cc               | 104 +++++++++++++------
 4 files changed, 108 insertions(+), 44 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index c6fe18752..1009bd459 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -84,17 +84,43 @@ class BatchConfig {
   };
 
   struct BitMask {
-    unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0};
-
-    // how many tokens before the tree, every sub requests need this part of
-    // cache
+    class Bitset {
+    public:
+      Bitset() : bits{0} {}
+
+      Bitset(Bitset const &other) {
+        // Copy the entire array of bits from 'other' to this object
+        std::memcpy(bits, other.bits, sizeof(bits));
+      }
+
+      void set_bit(size_t pos) {
+        size_t idx = pos / 64; // Find the index in the array
+        size_t bit = pos % 64; // Find the bit position within the uint64_t
+        bits[idx] |= (1ULL << bit);
+      }
+
+      void reset_bit(size_t pos) {
+        size_t idx = pos / 64;
+        size_t bit = pos % 64;
+        bits[idx] &= ~(1ULL << bit);
+      }
+
+      bool test_bit(size_t pos) const {
+        size_t idx = pos / 64;
+        size_t bit = pos % 64;
+        return (bits[idx] & (1ULL << bit)) != 0;
+      }
+
+    private:
+      uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 8]; // Array to hold 256 bits
+    };
+
+    Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM];
+    // the number of tokens before the tree
     int non_tree_cache_size = 0;
-
     // current tree size
     int tree_size = 0;
-
-    int this_layer_size = 0;
-
+    int current_layer_size = 0;
     // input length-> prompt/root
     int prompt_size = 0;
   };
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index fa4d5172f..629801cd0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -146,6 +146,7 @@ class RequestManager {
                       int newNodes,
                       BeamTree const tree,
                       int currentDepth);
+  void append_bitmask(RequestGuid guid, BatchConfig::BitMask &bitmask);
   void update_bitmask(BatchConfig::BitMask &bitmask,
                       int initLength,
                       int non_tree_size);
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 0b6b89e61..d5cddb15e 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -92,7 +92,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 
   int first_token_idx = 0;
   for (int r = 0; r < batch_config_request_id; r++) {
-    first_token_idx += request_completed[r] ? 0 : causalMask[r].this_layer_size;
+    first_token_idx +=
+        request_completed[r] ? 0 : causalMask[r].current_layer_size;
   }
 
   int const tree_branch_num =
@@ -347,7 +348,7 @@ __global__ void spec_inc_store_kv_cache(
     // if tree token:
 
     int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size +
-                          bitmask.tree_size - 1 - bitmask.this_layer_size +
+                          bitmask.tree_size - 1 - bitmask.current_layer_size +
                           token_idx - request_token_offset;
 
     kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0e72a2943..6fc0dcc55 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1088,35 +1088,37 @@ TreeSearchBatchConfig
       int ssm_decoding_steps =
           profiling_requests[request.guid].ssm_decoding_steps;
 
-      new_bc.beamRequestsInfo[i].beam_size =
-          spec_infer_tree_width.size() > ssm_decoding_steps
-              ? spec_infer_tree_width[ssm_decoding_steps]
-              : 1;
-
-      new_bc.beamRequestsInfo[i].max_depth =
-          old_bc.beamRequestsInfo[i].max_depth;
-
-      new_bc.sub_requests[i] =
-          old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size;
-      new_bc.beamRequestsInfo[i].sub_request_num =
-          old_bc.beamRequestsInfo[i].sub_request_num *
-          old_bc.beamRequestsInfo[i].beam_size;
-
-      assert(new_bc.beamRequestsInfo[i].sub_request_num <=
-                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
-             "exceed maximum nodes per layer");
-
-      if (request.status == Request::RUNNING) {
-        new_bc.beamRequestsInfo[i].current_depth =
-            old_bc.beamRequestsInfo[i].current_depth + 1;
-        new_bc.request_running[i] = true;
-        // do the slot exchange to minimize the cache exchange in kernel.
-        update_beam_metadata(
-            new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i);
-
-      } else {
-        assert(false && "Request should not be pending in beam search phase");
-      }
+      // We don't need the following logic
+      //   new_bc.beamRequestsInfo[i].beam_size =
+      //       spec_infer_tree_width.size() > ssm_decoding_steps
+      //           ? spec_infer_tree_width[ssm_decoding_steps]
+      //           : 1;
+
+      //   new_bc.beamRequestsInfo[i].max_depth =
+      //       old_bc.beamRequestsInfo[i].max_depth;
+
+      //   new_bc.sub_requests[i] =
+      //       old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size;
+      //   new_bc.beamRequestsInfo[i].sub_request_num =
+      //       old_bc.beamRequestsInfo[i].sub_request_num *
+      //       old_bc.beamRequestsInfo[i].beam_size;
+
+      //   assert(new_bc.beamRequestsInfo[i].sub_request_num <=
+      //              TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
+      //          "exceed maximum nodes per layer");
+
+      //   if (request.status == Request::RUNNING) {
+      //     new_bc.beamRequestsInfo[i].current_depth =
+      //         old_bc.beamRequestsInfo[i].current_depth + 1;
+      //     new_bc.request_running[i] = true;
+      //     // do the slot exchange to minimize the cache exchange in kernel.
+      //     update_beam_metadata(
+      //         new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i);
+
+      //   } else {
+      //     assert(false && "Request should not be pending in beam search
+      //     phase");
+      //   }
 
       // do the slot exchange to minimize the cache exchange in kernel.
       // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id),
@@ -1253,7 +1255,7 @@ TreeSearchBatchConfig
           request.tokens.size()) {
         // request is done
         new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-        new_bc.causalMask[i].this_layer_size = 0;
+        new_bc.causalMask[i].layer_size = 0;
         new_bc.beamRequestsInfo[i].sub_request_num = 0;
         new_bc.beamRequestsInfo[i].beam_size = 1;
       } else {
@@ -1815,7 +1817,7 @@ void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
   bitmask.tree_size = 1;
 
   bitmask.prompt_size = initLength;
-  bitmask.this_layer_size = initLength;
+  bitmask.layer_size = initLength;
   // std::cout << "see bit mask" << bitmask.prompt_size << "\n";
   // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n";
   // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n";
@@ -1838,7 +1840,7 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
 
   bitmask.non_tree_cache_size = non_tree_size + initLength - 1;
   bitmask.tree_size = 1;
-  bitmask.this_layer_size = initLength;
+  bitmask.layer_size = initLength;
   // std::cout << "non_tree_size: " << non_tree_size << "\n";
   bitmask.prompt_size = 1;
   for (int i = 0; i < bitmask.prompt_size; i++) {
@@ -1859,7 +1861,7 @@ void RequestManager::append_bitmask(BatchConfig::BitMask &bitmask,
                                     int currentDepth) {
   int pre_tree_size = bitmask.tree_size;
   bitmask.tree_size += newNodes;
-  bitmask.this_layer_size = newNodes;
+  bitmask.layer_size = newNodes;
   assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM &&
          "do not support tree size > 64");
   // preBeamSize: replicate num
@@ -1922,6 +1924,40 @@ void RequestManager::append_bitmask(BatchConfig::BitMask &bitmask,
   //           << "\n";
 }
 
+void RequestManager::append_bitmask(RequestGuid guid,
+                                    BatchConfig::BitMask &bitmask) {
+  // This function changes the bitmask in place
+  Request &request = all_requests[guid];
+  std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
+      request.speculative_token_trees[0].tree_layers.back();
+  int new_layer_size = tree_layer.size();
+  int last_layer_size = bitmask.current_layer_size;
+  int previous_tree_size = bitmask.tree_size;
+  bitmask.current_layer_size = new_layer_size;
+  bitmask.tree_size += new_layer_size;
+
+  assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+
+  int parent_offset = previous_tree_size - last_layer_size;
+  int child_offset = previous_tree_size;
+
+  int child_idx = 0;
+  for (auto const &child_ptr : tree_layer) {
+    // Each child copy its parent's mask
+    // Here we assume child_ptr->parent_pos denotes the position of the parent
+    // in its corresponding layer
+    bitmask.bit_mask[child_offset + child_idx] =
+        bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
+    // Each child attend to its parent
+    bitmask.bit_mask[child_offset + child_idx].set_bit(parent_offset +
+                                                       child_ptr->parent_pos);
+    // Each child attend to itself
+    bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
+                                                       child_idx);
+    child_idx++;
+  }
+}
+
 // prompt phase, init task
 void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
                                           int initLength) {
@@ -1932,7 +1968,7 @@ void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
   bitmask.non_tree_cache_size = 0;
   bitmask.tree_size = 1;
   bitmask.prompt_size += initLength;
-  bitmask.this_layer_size = initLength;
+  bitmask.layer_size = initLength;
 
   // for (int i = 0; i < bitmask.prompt_size; i++) {
   //   for (int j = i; j < bitmask.prompt_size; j++) {

From d0f1f1bc8ac713930eefecf0a49b6c2d4dc39dc1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 7 Apr 2024 11:08:24 -0400
Subject: [PATCH 034/667] 1. To make the request manager stateful, a causal
 mask is added to Request and a field current_speculation_step is also added.
 2. To account for the max number of speculated tokens in a request, a field
 tree_size is added to TokenTree. 3. Rewrite the add_token_to_spec_token_tree
 function to account for the maximum speculation tree size.

---
 include/flexflow/request_manager.h | 14 ++++--
 src/runtime/request_manager.cc     | 75 +++++++++++++++++++-----------
 2 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 629801cd0..c21066bb7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -73,9 +73,10 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  // In the current version, we only use one speculator
-  //   TokenTree speculative_token_tree;
+  // TokenTree speculative_token_tree;
   std::vector<TokenTree> speculative_token_trees;
+  // To make request manager stateful, we need to store the causal mask here
+  BatchConfig::BitMask causal_mask;
 };
 
 class TokenTreeNode {
@@ -103,6 +104,7 @@ struct CompareSharedTokenTreeNodePtr {
 class TokenTree {
 public:
   std::vector<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  int tree_size = 0;
   void add_layer() {
     tree_layers.emplace_back();
   }
@@ -381,6 +383,11 @@ class RequestManager {
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
 
+  // Added to make the request manager stateful. During the processing of the
+  // first small model inference results, the step equals to 1. That is, every
+  // time a small model inference task is launched, the step is increased by 1.
+  int current_speculation_step = 0;
+
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
   std::priority_queue<std::shared_ptr<TokenTreeNode>,
@@ -416,8 +423,7 @@ class RequestManager {
   void add_token_to_spec_token_tree(RequestGuid guid,
                                     BatchConfig::TokenId token_id,
                                     int parent_pos,
-                                    float joint_prob,
-                                    int depth);
+                                    float joint_prob);
   void prune_last_layer_of_spec_token_tree(RequestGuid guid);
 };
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6fc0dcc55..ff90da141 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1945,7 +1945,7 @@ void RequestManager::append_bitmask(RequestGuid guid,
   for (auto const &child_ptr : tree_layer) {
     // Each child copy its parent's mask
     // Here we assume child_ptr->parent_pos denotes the position of the parent
-    // in its corresponding layer
+    // in its corresponding layer, check this
     bitmask.bit_mask[child_offset + child_idx] =
         bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
     // Each child attend to its parent
@@ -2597,49 +2597,72 @@ RequestManager *RequestManager::get_request_manager() {
   return request_manager_singleton;
 }
 
-void RequestManager::add_token_to_spec_token_tree(
-    RequestGuid guid,
-    BatchConfig::TokenId token_id,
-    int parent_pos,
-    float joint_prob,
-    int depth // depth starts from 0
-) {
+void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
+                                                  BatchConfig::TokenId token_id,
+                                                  int parent_pos,
+                                                  float joint_prob) {
   // This method assumes only one small model is used for speculation
 
   // First make sure there are enough layers in the speculation tree
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
-  while (speculative_token_tree.tree_layers.size() <= depth) {
+
+  if (speculative_token_tree.tree_layers.size() == current_speculation_step) {
+    // When adding the first token, we need to add a new layer
     speculative_token_tree.add_layer();
+  } else {
+    // To add a token, the tree depth is either the same as the current
+    // speculation step or one more than the current speculation step.
+    assert(speculative_token_tree.tree_layers.size() ==
+               current_speculation_step + 1 &&
+           "The depth of the token tree should be consistent with the depth of "
+           "the token being added");
   }
 
   // We maintain the size of the token tree node pool to not exceed
   // BatchConfig::MAX_NUM_TOKENS
-  if (token_tree_node_pool.size() < BatchConfig::MAX_NUM_TOKENS) {
-    // Add to the last layer of the speculation tree
-    auto node_ptr =
-        std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
-    request.speculative_token_trees[0].tree_layers[depth].push_back(node_ptr);
-    token_tree_node_pool.push(node_ptr);
-    return;
-  }
-
-  // The pool is full, check if the new node has a higher joint probability
-  // than the minimum node in the pool.
-  std::shared_ptr<TokenTreeNode> min_node_in_pool = token_tree_node_pool.top();
-  if (joint_prob < min_node_in_pool->joint_prob) {
+  if (token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS) {
+    // The pool is full, check if the new node has a higher joint probability
+    // than the minimum node in the pool.
+    std::shared_ptr<TokenTreeNode> min_node_in_pool =
+        token_tree_node_pool.top();
+    if (joint_prob < min_node_in_pool->joint_prob) {
+      // Insertion failed
+      return;
+    } else {
+      // Remove the minimum node from the pool, and set its pruned field to true
+      min_node_in_pool->pruned = true;
+      token_tree_node_pool.pop();
+      speculative_token_tree.tree_size--;
+    }
+  } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {
+    assert(false && "The size of the token tree node pool should not exceed "
+                    "BatchConfig::MAX_NUM_TOKENS");
+  }
+  // Do nothing if the pool is not full
+
+  // The request's token tree size should not exceed
+  // BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
+  // The judgement is done here to avoid the case where the tree is full but a
+  // node is pruned.
+  if (speculative_token_tree.tree_size ==
+      BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
     // Insertion failed
     return;
+  } else if (speculative_token_tree.tree_size >
+             BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
+    assert(false && "The size of the token tree should not exceed "
+                    "BatchConfig::MAX_SPEC_TREE_TOKEN_NUM");
   }
 
-  // Remove the minimum node from the pool, and set its pruned field to true
-  min_node_in_pool->pruned = true;
-  token_tree_node_pool.pop();
   // Add the new node to the pool and the last layer of the speculation tree
   auto node_ptr =
       std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
   token_tree_node_pool.push(node_ptr);
-  request.speculative_token_trees[0].tree_layers[depth].push_back(node_ptr);
+  request.speculative_token_trees[0]
+      .tree_layers[current_speculation_step]
+      .push_back(node_ptr);
+  speculative_token_tree.tree_size++;
   return;
 }
 

From 0919a833ae4d9939659d853024a3a6531585c173 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 7 Apr 2024 11:15:40 -0400
Subject: [PATCH 035/667] Add depth check in
 RequestManager::prune_last_layer_of_spec_token_tree because we don't try to
 maintail all the speculaton trees in the same depth. The trees with depth <=
 current_speculation_step will have no tokens added to them later.

---
 src/runtime/request_manager.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ff90da141..8dd70c23b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2669,6 +2669,12 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
 void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
+
+  if (request.speculative_token_trees[0].tree_layers.size() <=
+      current_speculation_step) {
+    // There are no tokens in the last layer
+    return;
+  }
   auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
   for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
     if ((*it)->pruned) {

From fb292aaaf716b6b4e5bc01d1ea9bb7db0d30a454 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 7 Apr 2024 11:59:57 -0400
Subject: [PATCH 036/667] 1. Add guid in token_tree_node_pool to track the
 speculation tree size of different requests when adding tokens to the token
 trees. 2. Fix bug in add_token_to_spec_token_tree.

---
 include/flexflow/request_manager.h | 17 +++++---
 src/runtime/request_manager.cc     | 68 +++++++++++++++++++++---------
 2 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c21066bb7..ca638c628 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -94,10 +94,12 @@ class TokenTreeNode {
 };
 
 // A comparator for shared_ptr<TokenTreeNode>
-struct CompareSharedTokenTreeNodePtr {
-  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
-                  std::shared_ptr<TokenTreeNode> const &rhs) const {
-    return *lhs > *rhs;
+struct CompareSharedTokenTreeNodePtrRequestGuidPair {
+  bool operator()(std::pair<std::shared_ptr<TokenTreeNode>,
+                            BatchConfig::RequestGuid> const &lhs,
+                  std::pair<std::shared_ptr<TokenTreeNode>,
+                            BatchConfig::RequestGuid> const &rhs) const {
+    return lhs.first->joint_prob > rhs.first->joint_prob;
   }
 };
 
@@ -390,9 +392,10 @@ class RequestManager {
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
-  std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                      std::vector<std::shared_ptr<TokenTreeNode>>,
-                      CompareSharedTokenTreeNodePtr>
+  std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
+      CompareSharedTokenTreeNodePtrRequestGuidPair>
       token_tree_node_pool;
 
   // TODO: Move this two vector to request struct
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8dd70c23b..9432527c9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2619,21 +2619,30 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
            "the token being added");
   }
 
+  bool remove_min_node = false;
+  bool add_new_node = true;
+
+  std::shared_ptr<TokenTreeNode> min_node_ptr = nullptr;
+  RequestGuid min_node_guid = -1;
+  if (token_tree_node_pool.size() > 0) {
+    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
+        min_node_pair_in_pool = token_tree_node_pool.top();
+    min_node_ptr = min_node_pair_in_pool.first;
+    min_node_guid = min_node_pair_in_pool.second;
+  }
+
   // We maintain the size of the token tree node pool to not exceed
-  // BatchConfig::MAX_NUM_TOKENS
+  //  BatchConfig::MAX_NUM_TOKENS
   if (token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS) {
     // The pool is full, check if the new node has a higher joint probability
     // than the minimum node in the pool.
-    std::shared_ptr<TokenTreeNode> min_node_in_pool =
-        token_tree_node_pool.top();
-    if (joint_prob < min_node_in_pool->joint_prob) {
+
+    if (joint_prob < min_node_ptr->joint_prob) {
       // Insertion failed
-      return;
+      add_new_node = false;
     } else {
       // Remove the minimum node from the pool, and set its pruned field to true
-      min_node_in_pool->pruned = true;
-      token_tree_node_pool.pop();
-      speculative_token_tree.tree_size--;
+      remove_min_node = true;
     }
   } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {
     assert(false && "The size of the token tree node pool should not exceed "
@@ -2647,23 +2656,44 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   // node is pruned.
   if (speculative_token_tree.tree_size ==
       BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
-    // Insertion failed
-    return;
+    if (remove_min_node && guid == min_node_guid) {
+      // The minimum node in the pool is pruned, and it's in the same request
+      // with the new node. Only in this case we can add the new node.
+      // Because remove_min_node is true means that the new node has a higher
+      // joint probability than the minimum node in the pool.
+      add_new_node = true;
+    } else {
+      // Otherwise, we cannot add the new node, and we don't need to expel the
+      // minimum node from the pool.
+      add_new_node = false;
+      remove_min_node = false;
+    }
   } else if (speculative_token_tree.tree_size >
              BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
     assert(false && "The size of the token tree should not exceed "
                     "BatchConfig::MAX_SPEC_TREE_TOKEN_NUM");
   }
 
-  // Add the new node to the pool and the last layer of the speculation tree
-  auto node_ptr =
-      std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
-  token_tree_node_pool.push(node_ptr);
-  request.speculative_token_trees[0]
-      .tree_layers[current_speculation_step]
-      .push_back(node_ptr);
-  speculative_token_tree.tree_size++;
-  return;
+  assert(!(remove_min_node && !add_new_node) &&
+         "The minimum node should be removed only when the new node is added");
+
+  if (remove_min_node) {
+    // Remove the minimum node from the pool, and set its pruned field to true
+    min_node_ptr->pruned = true;
+    token_tree_node_pool.pop();
+    all_requests[min_node_guid].speculative_token_trees[0].tree_size--;
+  }
+
+  if (add_new_node) {
+    // Add the new node to the pool and the last layer of the speculation tree
+    auto node_ptr =
+        std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
+    token_tree_node_pool.push(std::make_pair(node_ptr, guid));
+    request.speculative_token_trees[0]
+        .tree_layers[current_speculation_step]
+        .push_back(node_ptr);
+    speculative_token_tree.tree_size++;
+  }
 }
 
 void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {

From 73fec2345786f1d857869bec867a78179b0cf5b6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 7 Apr 2024 23:12:53 -0400
Subject: [PATCH 037/667] 1. Remove the parameter old_bc from
 store_ssm_inference_results so that it only operates on data stored in the
 request manager. 2. Reimplemented store_ssm_inference_results. 3. Add a
 mapping from the position of a request in the batch to the request's guid.

---
 include/flexflow/request_manager.h |   5 +-
 src/runtime/request_manager.cc     | 128 ++++++++++++++---------------
 2 files changed, 63 insertions(+), 70 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ca638c628..3719d2886 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -207,8 +207,7 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void store_ssm_inference_results(TreeSearchBatchConfig const &old_bc,
-                                   SsmInferenceResult const &result);
+  bool store_ssm_inference_results(SsmInferenceResult const &result);
   void update_beam_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
@@ -389,6 +388,8 @@ class RequestManager {
   // first small model inference results, the step equals to 1. That is, every
   // time a small model inference task is launched, the step is increased by 1.
   int current_speculation_step = 0;
+  // Maps the index of the request in the batch config to the request guid.
+  int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9432527c9..c94ac6f7b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1686,69 +1686,56 @@ void RequestManager::initialize_root_of_spec_token_trees() {
   }
 }
 
-void RequestManager::store_ssm_inference_results(
-    TreeSearchBatchConfig const &old_bc, SsmInferenceResult const &result) {
-  if (old_bc.num_tokens <= 0) {
-    return;
-  }
-
-  // Depth starts from 1, because the root is at layer 0
-  int depth = old_bc.current_depth;
-  assert(depth > 0);
+bool RequestManager::store_ssm_inference_results(
+    SsmInferenceResult const &result) {
+  // This function returns false if no tokens are added to the token tree,
+  // which indicates that the ssm inference phase is done.
+  assert(current_speculation_step > 0);
 
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int num_old_bc_tokens_processed = 0;
-
-  int request_index = old_bc.tokensInfo[0].request_index;
-  FlexFlow::RequestManager::RequestGuid guid =
-      old_bc.requestsInfo[request_index].request_guid;
-  Request &request = all_requests[guid];
-  int start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
-
   int result_index = 0;
 
-  if (verbose) {
-    std::cout << "Store total of " << old_bc.num_tokens * num_branches
-              << " tokens in the current batch.\n";
-  }
+  // TODO: here we assume that the order of the tokens in the last
+  // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to the
+  // order of the request in the last TreeSearchBatchConfig, check this!
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    FlexFlow::RequestManager::RequestGuid guid =
+        guid_of_requests[request_index];
+    Request &request = all_requests[guid];
 
-  while (num_old_bc_tokens_processed < old_bc.num_tokens) {
-    // Process the tokens for the current request
-    int num_parent_tokens_processed_in_request = 0;
     TokenTree &token_tree = request.speculative_token_trees[0];
-    std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-        token_tree.tree_layers[depth - 1];
-    for (auto parent_it = tree_layer.begin(); parent_it != tree_layer.end();
-         parent_it++) {
-      if ((*parent_it)->pruned) {
-        // Parent token is pruned, we have to skip all its children
-        // Because no token is pruned in the last layer during the small
-        // model inference, the reason why some parents are pruned is that
-        // adding tokens to the new layer of the tree may result in some
-        // node being pruned in internal layers.
-        result_index += num_branches;
-      } else {
-        // Parent token is not pruned
-        for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-          float parent_prob = (*parent_it)->joint_prob;
-          add_token_to_spec_token_tree(guid,
-                                       result.token_ids[result_index],
-                                       result.probs[result_index] * parent_prob,
-                                       result.parent_id[result_index],
-                                       depth);
-          result_index++;
+    if (token_tree.tree_layers.size() < current_speculation_step) {
+      // This means that the parent layer is empty
+      continue;
+    } else {
+      std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
+          token_tree.tree_layers[current_speculation_step - 1];
+      for (auto parent_it = parent_tree_layer.begin();
+           parent_it != parent_tree_layer.end();
+           parent_it++) {
+        if ((*parent_it)->pruned) {
+          // Parent token is pruned, we have to skip all its children
+          // Because no token is pruned in the last layer during the small
+          // model inference, the reason why some parents are pruned is that
+          // adding tokens to the new layer of the tree may result in some
+          // node being pruned in internal layers.
+          result_index += num_branches;
+        } else {
+          // Parent token is not pruned
+          for (int child_idx = 0; child_idx < num_branches; child_idx++) {
+            float parent_prob = (*parent_it)->joint_prob;
+            add_token_to_spec_token_tree(guid,
+                                         result.token_ids[result_index],
+                                         result.probs[result_index] *
+                                             parent_prob,
+                                         result.parent_id[result_index]);
+            result_index++;
+          }
         }
       }
-      num_old_bc_tokens_processed++;
     }
-
-    // Switch to the next request
-    int request_index =
-        old_bc.tokensInfo[num_old_bc_tokens_processed].request_index;
-    FlexFlow::RequestManager::RequestGuid guid =
-        old_bc.requestsInfo[request_index].request_guid;
-    Request &request = all_requests[guid];
-    int start_depth = old_bc.tokensInfo[0].abs_depth_in_request;
   }
 }
 
@@ -1789,7 +1776,8 @@ void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
       new_bc.beamRequestsInfo[request_index].tokens[j] =
           tree.treeLayers[depth].tokens[j];
       // std::cout << "token: " << j << ": "
-      //           << new_bc.beamRequestsInfo[request_index].tokens[j] << "\n";
+      //           << new_bc.beamRequestsInfo[request_index].tokens[j] <<
+      //           "\n";
     }
   }
   if (verbose) {
@@ -1811,8 +1799,8 @@ void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
 void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
                                   int initLength) {
   assert(initLength > 0);
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
-  // 0000000..1000
+  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
+  // t4: 0000000..1000
   bitmask.non_tree_cache_size = 0;
   bitmask.tree_size = 1;
 
@@ -1829,8 +1817,8 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
                                     int initLength,
                                     int non_tree_size) {
   // assert(initLength == 1);
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
-  // 0000000..1000
+  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
+  // t4: 0000000..1000
   assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM &&
          "do not support tree size > 64");
   assert(initLength >= 1 && "verified token num should >= 1");
@@ -1919,8 +1907,9 @@ void RequestManager::append_bitmask(BatchConfig::BitMask &bitmask,
   // }
 
   // std::cout << "see bit mask append" << bitmask.prompt_size << "\n";
-  // std::cout << "see bit mask append" << bitmask.non_tree_cache_size << "\n";
-  // std::cout << "see bit mask append" << std::bitset<64>(bitmask.mask[0])
+  // std::cout << "see bit mask append" << bitmask.non_tree_cache_size <<
+  // "\n"; std::cout << "see bit mask append" <<
+  // std::bitset<64>(bitmask.mask[0])
   //           << "\n";
 }
 
@@ -1963,8 +1952,8 @@ void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
                                           int initLength) {
   assert(initLength > 0);
   // std::cout << "append pending bit mask: " << initLength << "\n";
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4:
-  // 0000000..1000
+  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
+  // t4: 0000000..1000
   bitmask.non_tree_cache_size = 0;
   bitmask.tree_size = 1;
   bitmask.prompt_size += initLength;
@@ -2057,8 +2046,8 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
                     outputSerializedTree.size());
   { // Input tree
     std::ostringstream oss;
-    // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token id,
-    // depth) pairs
+    // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token
+    // id, depth) pairs
     for (auto const &pair : inputSerializedTree) {
       oss << " " << pair.second << ":" << pair.first;
       // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
@@ -2087,8 +2076,9 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
     log_req_mgr.print("Committed tokens:%s", oss.str().c_str());
   }
 
-  // It's safe to have inputSerializedTree.size() > outputSerializedTree.size()
-  // In this case the inputSeriedTree ends with padding 0s
+  // It's safe to have inputSerializedTree.size() >
+  // outputSerializedTree.size() In this case the inputSeriedTree ends with
+  // padding 0s
   assert(inputSerializedTree.size() >= outputSerializedTree.size());
 
   int *treeLayers = new int[inputSerializedTree.size()];
@@ -2171,7 +2161,8 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
         // at this point, you'll not go other branches
         // std::cout << "verify tree push back: " << output.first
         //           << ", tree size is: " << verifiedTree.size()
-        //           << ", ??: " << input.first << ", " << input.second << "\n";
+        //           << ", ??: " << input.first << ", " << input.second <<
+        //           "\n";
       }
 
       assert(committed_tokens.at(guid).at(i).first == input.second);
@@ -2641,7 +2632,8 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
       // Insertion failed
       add_new_node = false;
     } else {
-      // Remove the minimum node from the pool, and set its pruned field to true
+      // Remove the minimum node from the pool, and set its pruned field to
+      // true
       remove_min_node = true;
     }
   } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {

From f1b9e72bf3b2518dd3f61515000f4c82e113d935 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 10 Apr 2024 11:29:15 -0400
Subject: [PATCH 038/667] Fix a bug

---
 src/runtime/request_manager.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c94ac6f7b..e095cd00a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1937,9 +1937,6 @@ void RequestManager::append_bitmask(RequestGuid guid,
     // in its corresponding layer, check this
     bitmask.bit_mask[child_offset + child_idx] =
         bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
-    // Each child attend to its parent
-    bitmask.bit_mask[child_offset + child_idx].set_bit(parent_offset +
-                                                       child_ptr->parent_pos);
     // Each child attend to itself
     bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
                                                        child_idx);

From 0b0e89b7de7f77fa21a702e9cc3b19d4785d84cd Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 11 Apr 2024 15:21:07 -0400
Subject: [PATCH 039/667] Change the name of a parameter of
 store_ssm_inference_results

---
 include/flexflow/request_manager.h |  3 ++-
 src/runtime/request_manager.cc     | 12 ++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 3719d2886..152072e97 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -207,7 +207,8 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  bool store_ssm_inference_results(SsmInferenceResult const &result);
+  bool store_ssm_inference_results(
+      SsmInferenceResult const &ssm_inference_result);
   void update_beam_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e095cd00a..14dae7b7f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1687,7 +1687,7 @@ void RequestManager::initialize_root_of_spec_token_trees() {
 }
 
 bool RequestManager::store_ssm_inference_results(
-    SsmInferenceResult const &result) {
+    SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
   assert(current_speculation_step > 0);
@@ -1726,11 +1726,11 @@ bool RequestManager::store_ssm_inference_results(
           // Parent token is not pruned
           for (int child_idx = 0; child_idx < num_branches; child_idx++) {
             float parent_prob = (*parent_it)->joint_prob;
-            add_token_to_spec_token_tree(guid,
-                                         result.token_ids[result_index],
-                                         result.probs[result_index] *
-                                             parent_prob,
-                                         result.parent_id[result_index]);
+            add_token_to_spec_token_tree(
+                guid,
+                ssm_inference_result.token_ids[result_index],
+                ssm_inference_result.probs[result_index] * parent_prob,
+                ssm_inference_result.parent_id[result_index]);
             result_index++;
           }
         }

From dfb674a9861f0648c9ae5ed1a3ac2fb512a49b0f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 11 Apr 2024 15:23:10 -0400
Subject: [PATCH 040/667] Change the name of a parameter in
 prepare_next_batch_spec

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 152072e97..5f1a0402f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -224,7 +224,7 @@ class RequestManager {
   // Given the last speculation result, prepare the next speculation batch.
   TreeSearchBatchConfig
       prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
-                              SsmInferenceResult const &result);
+                              SsmInferenceResult const &ssm_inference_result);
   // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 14dae7b7f..19b502ce7 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1023,9 +1023,9 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec_task(
 }
 
 // update beam search metadata
-TreeSearchBatchConfig
-    RequestManager::prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
-                                            SsmInferenceResult const &result) {
+TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
+    TreeSearchBatchConfig const &old_bc,
+    SsmInferenceResult const &ssm_inference_result) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_spec ###############\n";
@@ -1033,7 +1033,7 @@ TreeSearchBatchConfig
   if (verbose) {
     std::cout << "print all results" << "\n";
     for (int i = 0; i < 40; i++) {
-      std::cout << result.token_ids[i] << ", ";
+      std::cout << ssm_inference_result.token_ids[i] << ", ";
     }
     std::cout << "Current tree depth: " << old_bc.current_depth << "\n";
     std::cout << "Number of tokens in each requests: " << std::endl;
@@ -1044,7 +1044,7 @@ TreeSearchBatchConfig
   }
 
   // Store small model's inference result to the token tree struct
-  store_ssm_inference_results(old_bc, result);
+  store_ssm_inference_results(old_bc, ssm_inference_result);
 
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;

From b8a9dff1a72f500db04243d7f901e0331a5b450b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 11 Apr 2024 17:27:02 -0400
Subject: [PATCH 041/667] Add deprecated to some fields; Add a copy constructor
 for BitMask

---
 include/flexflow/batch_config.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 1009bd459..00b79a4c8 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -64,6 +64,12 @@ class BatchConfig {
   int num_tokens;
   // number of tokens in prompt phase, start offset of tokens in inc_decoding
   // phase. num_tokens - num_prompt_tokens = num_generation_tokens;
+
+  // TODO: remove this in the kernel.
+  // Previously this field is used to track how many tokens are from the
+  // decoding phase, now since we separate decoding from prefilling in small
+  // model inference, we don't need this field anymore.
+  [[deprecated("Not in use anymore")]]
   int num_generation_tokens;
 
   struct PerRequestInfo {
@@ -72,7 +78,9 @@ class BatchConfig {
     int num_tokens_in_batch;
     int max_sequence_length;
 
+    // TODO: remove this field
     // request id in batch config:
+    [[deprecated("This is now moved to the request manager")]]
     int batch_config_request_id;
     bool prompt_phase = false;
     RequestGuid request_guid;
@@ -83,7 +91,7 @@ class BatchConfig {
     TokenId token_id;
   };
 
-  struct BitMask {
+  class BitMask {
     class Bitset {
     public:
       Bitset() : bits{0} {}
@@ -115,6 +123,7 @@ class BatchConfig {
       uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 8]; // Array to hold 256 bits
     };
 
+  public:
     Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM];
     // the number of tokens before the tree
     int non_tree_cache_size = 0;
@@ -123,6 +132,16 @@ class BatchConfig {
     int current_layer_size = 0;
     // input length-> prompt/root
     int prompt_size = 0;
+    BitMask() = default;
+    BitMask(BitMask const &other) {
+      non_tree_cache_size = other.non_tree_cache_size;
+      tree_size = other.tree_size;
+      current_layer_size = other.current_layer_size;
+      prompt_size = other.prompt_size;
+      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
+        bit_mask[i] = other.bit_mask[i];
+      }
+    }
   };
 
   BitMask causalMask[MAX_NUM_REQUESTS];

From 2119d248fd411a4f2ad08f69dcd76fdb0f167e49 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 11 Apr 2024 17:29:41 -0400
Subject: [PATCH 042/667] Reimplement prepare_next_batch_spec, remove the
 parameter old_bc, so that it only depend on the state of the request manager.
 We may further separate store_ssm_inference_results from it. Some unsured
 parts are marked with TODOs.

---
 include/flexflow/request_manager.h |   3 +-
 src/runtime/request_manager.cc     | 308 +++++------------------------
 2 files changed, 50 insertions(+), 261 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5f1a0402f..02f2f0bb6 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -223,8 +223,7 @@ class RequestManager {
   /* New APIs */
   // Given the last speculation result, prepare the next speculation batch.
   TreeSearchBatchConfig
-      prepare_next_batch_spec(TreeSearchBatchConfig const &old_bc,
-                              SsmInferenceResult const &ssm_inference_result);
+      prepare_next_batch_spec(SsmInferenceResult const &ssm_inference_result);
   // A wrapper function.
   TreeSearchBatchConfigFuture
       prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 19b502ce7..993ece669 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1024,7 +1024,6 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec_task(
 
 // update beam search metadata
 TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
-    TreeSearchBatchConfig const &old_bc,
     SsmInferenceResult const &ssm_inference_result) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
@@ -1035,280 +1034,71 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     for (int i = 0; i < 40; i++) {
       std::cout << ssm_inference_result.token_ids[i] << ", ";
     }
-    std::cout << "Current tree depth: " << old_bc.current_depth << "\n";
+    std::cout << "Current tree depth: " << current_speculation_step << "\n";
     std::cout << "Number of tokens in each requests: " << std::endl;
-    for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
-      std::cout << i << "\t" << old_bc.tree_requests_info[i].num_tokens_at_depth
-                << std::endl;
-    }
   }
 
   // Store small model's inference result to the token tree struct
-  store_ssm_inference_results(old_bc, ssm_inference_result);
+  store_ssm_inference_results(ssm_inference_result);
 
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
-  new_bc.model_id = old_bc.model_id;
-  // We only support single small model
-  assert(old_bc.num_tokens > 0);
-  int num_generation_tokens = 0;
-
-  // Add incremental tokens to the batch
-  int num_active_req = -1;
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_completed[i] || !old_bc.request_running[i]) {
-      continue;
-    }
-    num_active_req++;
-    // Comment out this assertion since num_tokens_in_batch can be
-    // zero when beam search has reached required sequence length
-    // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0);
-    Request &request = all_requests[old_bc.requestsInfo[i].request_guid];
-    int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request +
-                           old_bc.requestsInfo[i].num_tokens_in_batch;
-
-    // assert(processed_tokens < request.tokens.size());
-    log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n";
-    {
-      log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", "
-                          << new_bc.num_tokens;
-      new_bc.request_completed[i] = false;
-      new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens;
-      new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-      new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid;
-      new_bc.requestsInfo[i].max_sequence_length =
-          old_bc.requestsInfo[i].max_sequence_length;
-      profiling_requests[request.guid].ssm_decoding_steps += 1;
-      new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-      // update the beam search metadata
-      // how many sub request in current request
-      // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH
-      // entries?
-      // update the parentid, accumalated_probs, depth, and token_ids
-      int ssm_decoding_steps =
-          profiling_requests[request.guid].ssm_decoding_steps;
+  // We assume that only one small model is in use now
+  new_bc.model_id = 0;
 
-      // We don't need the following logic
-      //   new_bc.beamRequestsInfo[i].beam_size =
-      //       spec_infer_tree_width.size() > ssm_decoding_steps
-      //           ? spec_infer_tree_width[ssm_decoding_steps]
-      //           : 1;
-
-      //   new_bc.beamRequestsInfo[i].max_depth =
-      //       old_bc.beamRequestsInfo[i].max_depth;
-
-      //   new_bc.sub_requests[i] =
-      //       old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size;
-      //   new_bc.beamRequestsInfo[i].sub_request_num =
-      //       old_bc.beamRequestsInfo[i].sub_request_num *
-      //       old_bc.beamRequestsInfo[i].beam_size;
-
-      //   assert(new_bc.beamRequestsInfo[i].sub_request_num <=
-      //              TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
-      //          "exceed maximum nodes per layer");
-
-      //   if (request.status == Request::RUNNING) {
-      //     new_bc.beamRequestsInfo[i].current_depth =
-      //         old_bc.beamRequestsInfo[i].current_depth + 1;
-      //     new_bc.request_running[i] = true;
-      //     // do the slot exchange to minimize the cache exchange in kernel.
-      //     update_beam_metadata(
-      //         new_bc, old_bc, request.beam_trees.at(old_bc.model_id), i);
-
-      //   } else {
-      //     assert(false && "Request should not be pending in beam search
-      //     phase");
-      //   }
-
-      // do the slot exchange to minimize the cache exchange in kernel.
-      // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id),
-      // i);
-      if (new_bc.requestsInfo[i].first_token_depth_in_request >=
-          request.tokens.size()) {
-        // Incremental phase
-        if (request.status == Request::RUNNING) {
-          // todo this is replaced by this_layer_size, but should check it
-          new_bc.requestsInfo[i].num_tokens_in_batch = 1;
-        } else {
-          assert(false && "Request should be done");
-          // new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-        }
-
-        if (verbose) {
-          std::cout << "[ Beam Spec] " << request.guid << std::endl;
-          std::cout << "Incremental phase: " << request.tokens.size()
-                    << ", num_tokens_in_batch: "
-                    << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-        }
-      }
-
-      if (verbose) {
-        std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size
-                  << std::endl;
-        std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size
-                  << std::endl;
-      }
-
-      // register more tokens due to the beam width
-
-      // copy metadata
-      memcpy(&new_bc.causalMask[i],
-             &old_bc.causalMask[i],
-             sizeof(BatchConfig::BitMask));
-      BeamTree tree = request.beam_trees[old_bc.model_id];
-      append_bitmask(new_bc.causalMask[i],
-                     new_bc.beamRequestsInfo[i].sub_request_num,
-                     old_bc.beamRequestsInfo[i].beam_size,
-                     old_bc.beamRequestsInfo[i].sub_request_num,
-                     tree,
-                     old_bc.beamRequestsInfo[i].current_depth);
-      for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-        int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
-        for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) {
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth;
-
-          // get value from requestinfo
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              new_bc.beamRequestsInfo[i].tokens[k];
-
-          new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k;
-          new_bc.num_tokens++;
-
-          num_generation_tokens++;
-        }
-      }
-    }
-  }
-
-  // how many requests is in speculative phase
-  new_bc.speculative_request_num = num_active_req + 1;
+  new_bc.num_tokens = 0;
 
-  // Add prompt tokens to the batch
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_completed[i] || old_bc.request_running[i]) {
-      continue;
-    }
-    num_active_req++;
-    // Comment out this assertion since num_tokens_in_batch can be
-    // zero when beam search has reached required sequence length
-    // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0);
-    Request &request = all_requests[old_bc.requestsInfo[i].request_guid];
+  // TODO: check if we should use BatchConfig::MAX_NUM_REQUESTS or some variable
+  // storing the current active requests
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    // TODO: check this!
+    assert(request.status == Request::RUNNING);
+    new_bc.request_completed[request_index] = false;
+    // TODO
     int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request +
                            old_bc.requestsInfo[i].num_tokens_in_batch;
-
-    // assert(processed_tokens < request.tokens.size());
-    log_req_mgr.debug() << "processed_tokens: " << processed_tokens << "\n";
-
-    {
-      log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", "
-                          << new_bc.num_tokens;
-      new_bc.request_completed[i] = false;
-      new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens;
-      new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-      new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid;
-      new_bc.requestsInfo[i].max_sequence_length =
-          old_bc.requestsInfo[i].max_sequence_length;
-      new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-      // update the beam search metadata
-      // how many sub request in current request
-      // why is sub_requests has max_requests_per_batch() * MAX_BEAM_WIDTH
-      // entries?
-      int ssm_decoding_steps =
-          profiling_requests[request.guid].ssm_decoding_steps;
-
-      new_bc.beamRequestsInfo[i].beam_size = 1;
-      // printf("beam size: %d, %d\n",
-      //        new_bc.beamRequestsInfo[i].beam_size,
-      //        ssm_decoding_steps);
-      new_bc.beamRequestsInfo[i].max_depth =
-          old_bc.beamRequestsInfo[i].max_depth;
-      // new_bc.sub_requests[i] =
-      //     old_bc.sub_requests[i] * new_bc.beamRequestsInfo[i].beam_size;
-      new_bc.sub_requests[i] = 1;
-      new_bc.beamRequestsInfo[i].sub_request_num =
-          old_bc.beamRequestsInfo[i].sub_request_num;
-
-      assert(new_bc.beamRequestsInfo[i].sub_request_num <=
-                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES &&
-             "exceed maximum nodes per layer");
-
-      // update the parentid, accumalated_probs, depth, and token_ids
-
-      if (request.status == Request::PENDING) {
-        // if the request is pending, we need to update the beam search
-        // metadata based on the initial length
-        new_bc.beamRequestsInfo[i].current_depth =
-            old_bc.beamRequestsInfo[i].current_depth;
-        new_bc.request_running[i] = false;
-      } else {
-        assert(false && "Request should be pending");
-      }
-
-      memcpy(&new_bc.causalMask[i],
-             &old_bc.causalMask[i],
-             sizeof(BatchConfig::BitMask));
-
-      new_bc.requestsInfo[i].prompt_phase = true;
-      if (new_bc.requestsInfo[i].first_token_depth_in_request >=
-          request.tokens.size()) {
-        // request is done
-        new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-        new_bc.causalMask[i].layer_size = 0;
-        new_bc.beamRequestsInfo[i].sub_request_num = 0;
-        new_bc.beamRequestsInfo[i].beam_size = 1;
-      } else {
-        // Prompt phase
-        new_bc.requestsInfo[i].num_tokens_in_batch =
-            std::min(get_max_tokens_per_batch() - new_bc.num_tokens -
-                         BatchConfig::max_requests_per_batch() + i,
-                     (int)request.tokens.size() -
-                         new_bc.requestsInfo[i].first_token_depth_in_request);
-        request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch;
-        BeamTree tree = request.beam_trees[old_bc.model_id];
-        appendPendingRequest(new_bc.causalMask[i],
-                             new_bc.requestsInfo[i].num_tokens_in_batch);
-      }
-
-      if (verbose) {
-        std::cout << "[ Beam Spec] " << request.guid << std::endl;
-        std::cout << "Prompt phase: " << request.tokens.size()
-                  << ", num_tokens_in_batch:"
-                  << new_bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-        std::cout << "Update ssm cache size: " << request.ssm_cache_size
-                  << std::endl;
-
-        std::cout << "SSM KV Cache Size beam: " << request.ssm_cache_size
-                  << std::endl;
-        std::cout << "LLM KV Cache Size beam: " << request.llm_cache_size
-                  << std::endl;
-      }
-
-      // register more tokens due to the beam width
-      for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-        int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
-        for (int k = 0; k < new_bc.beamRequestsInfo[i].sub_request_num; k++) {
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth;
-
-          // get value from requestinfo
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              request.tokens[request.tokens.size() -
-                             new_bc.requestsInfo[i].num_tokens_in_batch + j];
-
-          new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = k;
-          new_bc.num_tokens++;
-        }
+    new_bc.requestsInfo[request_index].first_token_depth_in_request =
+        processed_tokens;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].request_guid = guid;
+    profiling_requests[request.guid].ssm_decoding_steps += 1;
+
+    // Fill in the tokens
+    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    if (token_tree.tree_layers.size() <= current_speculation_step) {
+      // This request has no token to decode in this and the following small
+      // model inference steps
+      new_bc.tree_requests_info[request_index].num_tokens_at_depth = 0;
+      continue;
+    } else {
+      std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
+          token_tree.tree_layers.at(current_speculation_step);
+      new_bc.tree_requests_info[request_index].num_tokens_at_depth =
+          current_layer.size();
+      for (auto &node_ptr : current_layer) {
+        new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+        // TODO: check this!
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            request.tokens.size() + current_speculation_step;
+        new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
+        new_bc.num_tokens++;
       }
     }
+
+    // TODO: we should call append_bitmask at some point before this
+    // Copy the causal mask
+    new_bc.causalMask[request_index] = request.causal_mask;
   }
 
-  new_bc.num_generation_tokens = num_generation_tokens;
+  // TODO: how do we know how many reqeusts are in the speculative phase if the
+  // batch is not full? how many requests is in speculative phase
+  new_bc.speculative_request_num = num_active_req + 1;
   if (verbose) {
-    std::cout << "prepare_next_batch_beam OLD vs NEW batchconfigs:"
-              << std::endl;
-    old_bc.print();
+    std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
     new_bc.print();
   }
   return new_bc;

From aa16cb1df6a156b4d5bfa2a5c7bbad586ce00347 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 13:10:24 -0400
Subject: [PATCH 043/667] Change the name of request_complete to
 request_available in BatchConfig

---
 include/flexflow/batch_config.h               |  3 +-
 include/flexflow/config.h                     |  2 +-
 src/ops/inc_multihead_self_attention.cpp      |  2 +-
 src/ops/inc_multihead_self_attention.cu       |  2 +-
 src/ops/spec_inc_multihead_self_attention.cpp |  2 +-
 src/ops/tree_inc_multihead_self_attention.cpp |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu  |  2 +-
 src/runtime/batch_config.cc                   |  8 ++---
 src/runtime/request_manager.cc                | 32 +++++++++----------
 src/runtime/request_manager.cu                | 12 +++----
 src/runtime/tree_search_batch_config.cc       |  4 +--
 src/runtime/tree_verify_batch_config.cc       |  4 +--
 12 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 00b79a4c8..74c361c4d 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -148,8 +148,7 @@ class BatchConfig {
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
 
-  bool request_completed[MAX_NUM_REQUESTS];
-  bool request_running[MAX_NUM_REQUESTS];
+  bool request_available[MAX_NUM_REQUESTS];
 };
 
 class TreeVerifyBatchConfig : public BatchConfig {
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index f9d901323..ed254e584 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -85,7 +85,7 @@ struct FFHandler {
       sizeof(TreeSearchBatchConfig::beamRequestsInfo) +
       sizeof(BatchConfig::causalMask) +
       sizeof(TreeVerifyBatchConfig::committed_tokens) +
-      sizeof(BatchConfig::request_completed);
+      sizeof(BatchConfig::request_available);
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
   DataType quantization_type;
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 55be42fa4..bf35509d1 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -530,7 +530,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (bc->request_available[i]) {
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index d09beddb2..b32003c55 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -929,7 +929,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) {
+    if (bc->request_available[i] || (!bc->requestsInfo[i].prompt_phase)) {
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 676fad935..fbe8e825f 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -223,7 +223,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (bc->request_available[i]) {
       continue;
     }
     for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index 26291fb3b..e8292c858 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -178,7 +178,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (bc->request_available[i]) {
       continue;
     }
     int last_token_idx_of_the_request =
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 50c056c81..1076225a1 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -536,7 +536,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (bc->request_available[i]) {
       continue;
     }
     assert(processed_tokens_in_batch ==
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index e563119f5..66d9e18a0 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -30,7 +30,7 @@ BatchConfig::BatchConfig() : num_tokens(0) {
     requestsInfo[i].first_token_depth_in_request = 0;
     requestsInfo[i].first_token_offset_in_batch = 0;
     requestsInfo[i].num_tokens_in_batch = 0;
-    request_completed[i] = true;
+    request_available[i] = true;
   }
   for (int i = 0; i < MAX_NUM_TOKENS; i++) {
     tokensInfo[i].abs_depth_in_request = 0;
@@ -63,7 +63,7 @@ InferenceMode BatchConfig::get_mode() const {
 int BatchConfig::num_active_requests() const {
   int num_requests = 0;
   for (int i = 0; i < max_requests_per_batch(); i++) {
-    if (!request_completed[i]) {
+    if (!request_available[i]) {
       num_requests++;
     }
   }
@@ -113,7 +113,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   // Per-request info
   os << "Per-request info:\n";
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-    if (!bc.request_completed[i]) {
+    if (!bc.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
          << bc.requestsInfo[i].first_token_depth_in_request << std::endl;
@@ -124,7 +124,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
       os << "    GUID: " << bc.requestsInfo[i].request_guid << std::endl;
       os << "    Max sequence length: "
          << bc.requestsInfo[i].max_sequence_length << std::endl;
-      os << "    Request completed: " << bc.request_completed[i] << std::endl;
+      os << "    Request completed: " << bc.request_available[i] << std::endl;
       os << "    Request running: " << bc.request_running[i] << std::endl;
     }
   }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 993ece669..b6709f20c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -391,7 +391,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
   // Step 2: prepare the next batch for existing requests
   BatchConfig new_bc;
   for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_completed[i]) { // add new requests to the next batch
+    if (old_bc.request_available[i]) { // add new requests to the next batch
       continue;
     } else {
       assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0);
@@ -470,7 +470,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
         }
 
       } else {
-        new_bc.request_completed[i] = false;
+        new_bc.request_available[i] = false;
         new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens;
         new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
         new_bc.requestsInfo[i].request_guid =
@@ -511,7 +511,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
 
   // Step 3: add new requests to the next batch
   for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (new_bc.request_completed[i]) {
+    if (new_bc.request_available[i]) {
       if (!pending_request_queue.empty() &&
           new_bc.num_tokens < get_max_tokens_per_batch()) {
         Request new_request = pending_request_queue.front();
@@ -526,7 +526,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
                      (int)new_request.tokens.size());
         new_bc.requestsInfo[i].max_sequence_length =
             new_request.max_sequence_length;
-        new_bc.request_completed[i] = false;
+        new_bc.request_available[i] = false;
         new_bc.requestsInfo[i].prompt_phase = true;
         num_active_req++;
         new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
@@ -606,7 +606,7 @@ TreeSearchBatchConfig
   int num_active_req = -1;
 
   for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_completed[i]) {
+    if (old_bc.request_available[i]) {
       continue;
     }
     size_t guid = old_bc.requestsInfo[i].request_guid;
@@ -698,7 +698,7 @@ TreeSearchBatchConfig
         trigger_request_completion_future(request.guid);
         log_req_mgr.print("Final output: %s", output.c_str());
 
-        new_bc.request_completed[i] = true;
+        new_bc.request_available[i] = true;
         new_bc.request_running[i] = false;
         num_processed_requests++;
 
@@ -750,7 +750,7 @@ TreeSearchBatchConfig
 
       } else { // Request not finished, pass verified_tokens to next iteration
 
-        new_bc.request_completed[i] = false;
+        new_bc.request_available[i] = false;
         new_bc.request_running[i] = true;
         num_active_req++;
 
@@ -830,7 +830,7 @@ TreeSearchBatchConfig
       }
 
     } else if (request.status == Request::PENDING) {
-      new_bc.request_completed[i] = false;
+      new_bc.request_available[i] = false;
       new_bc.request_running[i] = false;
       num_active_req++;
 
@@ -883,7 +883,7 @@ TreeSearchBatchConfig
 
   // Step 2: Initialize new request
   for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
-    if (new_bc.request_completed[i]) {
+    if (new_bc.request_available[i]) {
       if (!pending_request_queue.empty() &&
           new_bc.num_tokens < get_max_tokens_per_batch()) {
         Request new_request = pending_request_queue.front();
@@ -925,7 +925,7 @@ TreeSearchBatchConfig
           new_bc.beamRequestsInfo[i].probs[j] = 1;
         }
 
-        new_bc.request_completed[i] = false;
+        new_bc.request_available[i] = false;
         new_bc.requestsInfo[i].prompt_phase = true;
 
         new_bc.beamRequestsInfo[i].sub_request_num = 1;
@@ -1056,7 +1056,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     Request &request = all_requests[guid];
     // TODO: check this!
     assert(request.status == Request::RUNNING);
-    new_bc.request_completed[request_index] = false;
+    new_bc.request_available[request_index] = false;
     // TODO
     int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request +
                            old_bc.requestsInfo[i].num_tokens_in_batch;
@@ -1165,7 +1165,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
 
   int max_prompt_load_size = get_max_verify_tokens_per_batch();
   for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) {
-    if (old_batches.at(0).request_completed[i]) {
+    if (old_batches.at(0).request_available[i]) {
       continue;
     } else if (old_batches.at(0).request_running[i]) {
       max_prompt_load_size -= (TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1);
@@ -1175,7 +1175,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
   }
   int num_active_req = -1;
   for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) {
-    if (old_batches.at(0).request_completed[i]) {
+    if (old_batches.at(0).request_available[i]) {
       continue;
     }
     num_active_req++;
@@ -1225,7 +1225,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
              sizeof(BatchConfig::BitMask));
       // TODO: Check this
       new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-      new_bc.request_completed[i] = false;
+      new_bc.request_available[i] = false;
 
       // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", "
       //           << new_bc.causalMask[i].tree_size << ", "
@@ -1361,7 +1361,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
           old_batches.at(0).requestsInfo[i].max_sequence_length;
       new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
 
-      new_bc.request_completed[i] = false;
+      new_bc.request_available[i] = false;
 
       new_bc.requestsInfo[i].num_tokens_in_batch =
           std::min(max_prompt_load_size,
@@ -1537,7 +1537,7 @@ void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
                                           int request_index) {
 
   // do the exchange
-  if (new_bc.request_completed[request_index]) {
+  if (new_bc.request_available[request_index]) {
     assert(false);
   }
   int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 54e389a6b..d460cd933 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -128,12 +128,12 @@ void RequestManager::load_batch_config_task(
 
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->request_completed),
-        sizeof(BatchConfig::request_completed),
+        &(batch_config->request_available),
+        sizeof(BatchConfig::request_available),
         cudaMemcpyHostToDevice,
         stream));
 
-    total_copy_size += sizeof(BatchConfig::request_completed);
+    total_copy_size += sizeof(BatchConfig::request_available);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     TreeVerifyBatchConfig const *tree_batch_config =
         static_cast<TreeVerifyBatchConfig const *>(batch_config);
@@ -155,12 +155,12 @@ void RequestManager::load_batch_config_task(
 
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->request_completed),
-        sizeof(BatchConfig::request_completed),
+        &(batch_config->request_available),
+        sizeof(BatchConfig::request_available),
         cudaMemcpyHostToDevice,
         stream));
 
-    total_copy_size += sizeof(BatchConfig::request_completed);
+    total_copy_size += sizeof(BatchConfig::request_available);
   }
 
   // add a size check
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index 0011060a4..c24ffad84 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -78,7 +78,7 @@ std::ostream &
 
   os << "Per-request info:\n";
   for (int i = 0; i < tree_search_batch_config.max_requests_per_batch(); i++) {
-    if (!tree_search_batch_config.request_completed[i]) {
+    if (!tree_search_batch_config.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
          << tree_search_batch_config.requestsInfo[i]
@@ -96,7 +96,7 @@ std::ostream &
          << tree_search_batch_config.requestsInfo[i].max_sequence_length
          << std::endl;
       os << "    Request completed: "
-         << tree_search_batch_config.request_completed[i] << std::endl;
+         << tree_search_batch_config.request_available[i] << std::endl;
       os << "    Request running: "
          << tree_search_batch_config.request_running[i] << std::endl;
       os << "    Tree Search Specific: " << std::endl;
diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc
index 841c735f5..eeb015a6c 100644
--- a/src/runtime/tree_verify_batch_config.cc
+++ b/src/runtime/tree_verify_batch_config.cc
@@ -45,7 +45,7 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
 
   os << "Per-request info:\n";
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-    if (!bc.request_completed[i]) {
+    if (!bc.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
          << bc.requestsInfo[i].first_token_depth_in_request << std::endl;
@@ -56,7 +56,7 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
       os << "    GUID: " << bc.requestsInfo[i].request_guid << std::endl;
       os << "    Max sequence length: "
          << bc.requestsInfo[i].max_sequence_length << std::endl;
-      os << "    Request completed: " << bc.request_completed[i] << std::endl;
+      os << "    Request completed: " << bc.request_available[i] << std::endl;
       os << "    Request running: " << bc.request_running[i] << std::endl;
     }
   }

From c3a17cd684fb2b81c7e99f902c81adb12e6b4bd0 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 13:16:28 -0400
Subject: [PATCH 044/667] 1. Remove BatchConfig.num_generation_tokens 2. Remove
 BatchConfig::PerRequestInfo.max_sequence_length 3. Remove
 BatchConfig::PerRequestInfo.batch_config_request_id 4. Remove
 BatchConfig::PerRequestInfo.prompt_phase 5. Remove
 BatchConfig::PerRequestInfo.request_guid 6. Remove
 TreeSearchBatchConfig.tree_requests_info 7. Move
 TreeSearchBatchConfig.speculative_request_num to
 BatchConfig.num_available_requests

---
 include/flexflow/batch_config.h | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 74c361c4d..d20f6c714 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -62,28 +62,12 @@ class BatchConfig {
 
   //  Set by update
   int num_tokens;
-  // number of tokens in prompt phase, start offset of tokens in inc_decoding
-  // phase. num_tokens - num_prompt_tokens = num_generation_tokens;
-
-  // TODO: remove this in the kernel.
-  // Previously this field is used to track how many tokens are from the
-  // decoding phase, now since we separate decoding from prefilling in small
-  // model inference, we don't need this field anymore.
-  [[deprecated("Not in use anymore")]]
-  int num_generation_tokens;
+  int num_available_requests;
 
   struct PerRequestInfo {
     int first_token_depth_in_request;
     int first_token_offset_in_batch;
     int num_tokens_in_batch;
-    int max_sequence_length;
-
-    // TODO: remove this field
-    // request id in batch config:
-    [[deprecated("This is now moved to the request manager")]]
-    int batch_config_request_id;
-    bool prompt_phase = false;
-    RequestGuid request_guid;
   };
   struct PerTokenInfo {
     int abs_depth_in_request;
@@ -195,15 +179,8 @@ class TreeSearchBatchConfig : public BatchConfig {
   inline static int const MAX_TREE_DEPTH = 16;
 
   // how many requests is in speculative phase
-  int speculative_request_num = 0;
   int current_depth = 0;
   int model_id;
-
-  struct TreeSearchPerRequestInfo {
-    int num_tokens_at_depth = 0;
-  };
-
-  TreeSearchPerRequestInfo tree_requests_info[MAX_NUM_REQUESTS];
 };
 
 struct SsmInferenceResult {

From 861b3fb2e49a555455d2f7af45390d8b3bdd40be Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 18:15:36 -0400
Subject: [PATCH 045/667] Add field request_available in RequestManager to
 indicate whether a request in a certain slot is available.

---
 include/flexflow/request_manager.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 02f2f0bb6..042601288 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -390,6 +390,7 @@ class RequestManager {
   int current_speculation_step = 0;
   // Maps the index of the request in the batch config to the request guid.
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
+  bool request_available[BatchConfig::MAX_NUM_REQUESTS];
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.

From aed45769b5bed81451d3837c9cc9e0fb03526c40 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 22:03:00 -0400
Subject: [PATCH 046/667] Add a check to see if a slot in the BatchConfig is
 empty in store_ssm_inference_results

---
 src/runtime/request_manager.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b6709f20c..7d5f45998 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1483,7 +1483,6 @@ bool RequestManager::store_ssm_inference_results(
   assert(current_speculation_step > 0);
 
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-  int num_old_bc_tokens_processed = 0;
   int result_index = 0;
 
   // TODO: here we assume that the order of the tokens in the last
@@ -1491,6 +1490,10 @@ bool RequestManager::store_ssm_inference_results(
   // order of the request in the last TreeSearchBatchConfig, check this!
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
     FlexFlow::RequestManager::RequestGuid guid =
         guid_of_requests[request_index];
     Request &request = all_requests[guid];

From d3905eba17c6f766565060db12ec0b244cf4bce1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 22:14:04 -0400
Subject: [PATCH 047/667] 1. Rename
 BatchConfig::PerRequestInfo.first_token_depth_in_request to
 BatchConfig::PerRequestInfo.first_token_index_in_request to better reflect
 its meaning. 2. Rename BatchConfig::PerTokenInfo.abs_depth_in_request to
 BatchConfig::PerTokenInfo.abs_index_in_request to better reflect its meaning.
 3. Fix some TODOs in prepare_next_batch_spec.

---
 include/flexflow/batch_config.h               |  4 +-
 src/ops/inc_multihead_self_attention.cpp      |  8 +-
 src/ops/inc_multihead_self_attention.cu       | 10 +--
 src/ops/spec_inc_multihead_self_attention.cpp |  4 +-
 src/ops/tree_inc_multihead_self_attention.cpp |  8 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 14 ++--
 src/runtime/batch_config.cc                   |  8 +-
 src/runtime/request_manager.cc                | 82 ++++++++++---------
 src/runtime/request_manager.cpp               |  2 +-
 src/runtime/request_manager.cu                |  2 +-
 src/runtime/tree_search_batch_config.cc       |  4 +-
 src/runtime/tree_verify_batch_config.cc       |  4 +-
 12 files changed, 76 insertions(+), 74 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index d20f6c714..f67eaa404 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -65,12 +65,12 @@ class BatchConfig {
   int num_available_requests;
 
   struct PerRequestInfo {
-    int first_token_depth_in_request;
+    int first_token_index_in_request;
     int first_token_offset_in_batch;
     int num_tokens_in_batch;
   };
   struct PerTokenInfo {
-    int abs_depth_in_request;
+    int abs_index_in_request;
     int request_index;
     TokenId token_id;
   };
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index bf35509d1..e1e299d42 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -157,7 +157,7 @@ __global__ void
 
     int token_idx =
         (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+    size_t pos = tokenInfos[token_idx].abs_index_in_request;
     int pos_i = real_i % (proj_size / 2);
     float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
     hipFloatComplex complex_pos = {cos(freq), sin(freq)};
@@ -203,7 +203,7 @@ __global__ void
     // get position of token
 
     // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+    size_t pos = tokenInfos[token_idx].abs_index_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
@@ -232,7 +232,7 @@ __global__ void store_kv_cache(DT const *devQKVProjArray,
     DT vVal = devQKVProjArray[val_idx + hidden_size];
 
     int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
 
     // key cache
     kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
@@ -534,7 +534,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m,
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int total_tokens = bc->requestsInfo[i].first_token_depth_in_request +
+    int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
                        bc->requestsInfo[i].num_tokens_in_batch;
     // bc->token_last_available_idx[i] + 1;
     // Compute (QK^T/sqrt(d_k))
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index b32003c55..e44b4c97f 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -86,7 +86,7 @@ __global__ void compute_attention_kernel_generation_kernel(
   int const first_step = 0;
 
   int const tlength =
-      request_infos[batch_config_request_id].first_token_depth_in_request +
+      request_infos[batch_config_request_id].first_token_index_in_request +
       request_infos[batch_config_request_id].num_tokens_in_batch;
 
   // shared memory objects
@@ -420,7 +420,7 @@ __global__ void
 
     int token_idx =
         (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+    size_t pos = tokenInfos[token_idx].abs_index_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     // complex_input[i].y;
@@ -470,7 +470,7 @@ __global__ void
     // get position of token
 
     // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+    size_t pos = tokenInfos[token_idx].abs_index_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
@@ -866,7 +866,7 @@ __global__ void store_kv_cache(DT const *devQKVProjArray,
     DT kVal = devQKVProjArray[val_idx];
     DT vVal = devQKVProjArray[val_idx + hidden_size];
     int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
 
     // key cache
     kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
@@ -933,7 +933,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int total_tokens = bc->requestsInfo[i].first_token_depth_in_request +
+    int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
                        bc->requestsInfo[i].num_tokens_in_batch;
     // Step 1: compute query-key product QK.T/sqrt(d_k)
     {
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index fbe8e825f..187ada4d5 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -67,7 +67,7 @@ __global__ void spec_store_kv_cache(
     // int const beam_width = id_map[token_idx].beam_width;
 
     int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
     int const sub_req_id = beamTokenInfos[token_idx].sub_request_index;
     int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id];
     int const beam_depth = beamRequestInfos[req_id].current_depth;
@@ -232,7 +232,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
       // int total_tokens = bc->token_last_available_idx[i] + 1;
 
       int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-      int total_tokens = bc->requestsInfo[i].first_token_depth_in_request +
+      int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
                          bc->requestsInfo[i].num_tokens_in_batch;
       // Compute (QK^T/sqrt(d_k))
       int m_ = num_new_tokens;
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index e8292c858..9e491127d 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -119,7 +119,7 @@ __global__ void update_tree_branch_kv_cache(
     DT vVal = devQKVProjArray[val_idx + hidden_size];
 
     int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
     kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
                offset] = kVal;
     vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
@@ -187,13 +187,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
       int num_new_tokens = 1;
       int j = processed_tokens_in_batch;
       while ((j + 1 <= last_token_idx_of_the_request) &&
-             (bc->tokensInfo[j].abs_depth_in_request + 1 ==
-              bc->tokensInfo[j + 1].abs_depth_in_request)) {
+             (bc->tokensInfo[j].abs_index_in_request + 1 ==
+              bc->tokensInfo[j + 1].abs_index_in_request)) {
         j++;
         num_new_tokens++;
       }
 
-      int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1;
+      int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1;
       assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
       {
         // update K-V cache
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1076225a1..2d76fcf07 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -83,7 +83,7 @@ __global__ void compute_attention_kernel_fused_kernel(
   int const first_step = 0;
 
   int const tlength =
-      request_infos[batch_config_request_id].first_token_depth_in_request +
+      request_infos[batch_config_request_id].first_token_index_in_request +
       request_infos[batch_config_request_id].num_tokens_in_batch;
   int const qlength =
       request_infos[batch_config_request_id].num_tokens_in_batch;
@@ -98,7 +98,7 @@ __global__ void compute_attention_kernel_fused_kernel(
 
   bool prompt_phase = request_infos[batch_config_request_id].prompt_phase;
   int q_start =
-      request_infos[batch_config_request_id].first_token_depth_in_request;
+      request_infos[batch_config_request_id].first_token_index_in_request;
 
   // shared memory objects
   extern __shared__ char smem_[];
@@ -424,7 +424,7 @@ __global__ void update_tree_branch_kv_cache(
     DT vVal = devQKVProjArray[val_idx + hidden_size];
 
     int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
     kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
                offset] = kVal;
     vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
@@ -461,7 +461,7 @@ __global__ void update_tree_branch_kv_cache_fused(
     int const request_token_offset =
         request_infos[req_id].first_token_offset_in_batch;
     int const first_token_depth =
-        request_infos[req_id].first_token_depth_in_request;
+        request_infos[req_id].first_token_index_in_request;
 
     // if(i % hidden_size == 0){
     //   printf("update token request id: %d, %d, %d  real id %d, value%.10f\n",
@@ -547,13 +547,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
       int num_new_tokens = 1;
       int j = processed_tokens_in_batch;
       while ((j + 1 <= last_token_idx_of_the_request) &&
-             (bc->tokensInfo[j].abs_depth_in_request + 1 ==
-              bc->tokensInfo[j + 1].abs_depth_in_request)) {
+             (bc->tokensInfo[j].abs_index_in_request + 1 ==
+              bc->tokensInfo[j + 1].abs_index_in_request)) {
         j++;
         num_new_tokens++;
       }
 
-      int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1;
+      int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1;
       assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
       {
         // update K-V cache
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 66d9e18a0..6fba6eff5 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -27,13 +27,13 @@ using Legion::Memory;
 
 BatchConfig::BatchConfig() : num_tokens(0) {
   for (int i = 0; i < MAX_NUM_REQUESTS; i++) {
-    requestsInfo[i].first_token_depth_in_request = 0;
+    requestsInfo[i].first_token_index_in_request = 0;
     requestsInfo[i].first_token_offset_in_batch = 0;
     requestsInfo[i].num_tokens_in_batch = 0;
     request_available[i] = true;
   }
   for (int i = 0; i < MAX_NUM_TOKENS; i++) {
-    tokensInfo[i].abs_depth_in_request = 0;
+    tokensInfo[i].abs_index_in_request = 0;
     tokensInfo[i].request_index = 0;
     tokensInfo[i].token_id = 0;
   }
@@ -116,7 +116,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
     if (!bc.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
-         << bc.requestsInfo[i].first_token_depth_in_request << std::endl;
+         << bc.requestsInfo[i].first_token_index_in_request << std::endl;
       os << "    First token offset in batch: "
          << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
       os << "    Number of tokens in batch: "
@@ -134,7 +134,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   for (int i = 0; i < bc.num_tokens; i++) {
     os << "  Token " << i << ":\n";
     os << "    Absolute depth in request: "
-       << bc.tokensInfo[i].abs_depth_in_request << std::endl;
+       << bc.tokensInfo[i].abs_index_in_request << std::endl;
     os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
     os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
   }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7d5f45998..f2311769c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -372,11 +372,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
     size_t guid =
         old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid;
     Request &request = all_requests[guid];
-    if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) {
+    if (old_bc.tokensInfo[i].abs_index_in_request + 1 < request.tokens.size()) {
       // This is a prompt token
       continue;
     } else {
-      assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 ==
+      assert(old_bc.tokensInfo[i].abs_index_in_request + 1 ==
              request.tokens.size());
       // This is a decoding token
       log_req_mgr.print("Output token is: %d", result.token_ids[i]);
@@ -397,7 +397,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
       assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0);
       Request &request = all_requests[old_bc.requestsInfo[i].request_guid];
       int processed_tokens =
-          old_bc.requestsInfo[i].first_token_depth_in_request +
+          old_bc.requestsInfo[i].first_token_index_in_request +
           old_bc.requestsInfo[i].num_tokens_in_batch;
       assert(processed_tokens < request.tokens.size());
       bool request_completed = false;
@@ -471,7 +471,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
 
       } else {
         new_bc.request_available[i] = false;
-        new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens;
+        new_bc.requestsInfo[i].first_token_index_in_request = processed_tokens;
         new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
         new_bc.requestsInfo[i].request_guid =
             old_bc.requestsInfo[i].request_guid;
@@ -479,7 +479,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
             old_bc.requestsInfo[i].max_sequence_length;
         num_active_req++;
         new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-        if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 ==
+        if (new_bc.requestsInfo[i].first_token_index_in_request + 1 ==
             request.tokens.size()) {
           // Incremental phase
           new_bc.requestsInfo[i].num_tokens_in_batch = 1;
@@ -490,13 +490,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
           new_bc.requestsInfo[i].num_tokens_in_batch =
               std::min(get_max_tokens_per_batch() - new_bc.num_tokens,
                        (int)request.tokens.size() -
-                           new_bc.requestsInfo[i].first_token_depth_in_request);
+                           new_bc.requestsInfo[i].first_token_index_in_request);
           new_bc.requestsInfo[i].prompt_phase = true;
         }
         for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
+          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
           assert(depth < request.tokens.size());
           new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth];
           new_bc.num_tokens++;
@@ -518,7 +518,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
         pending_request_queue.pop();
         // all_requests[new_request.guid] = new_request;
 
-        new_bc.requestsInfo[i].first_token_depth_in_request = 0;
+        new_bc.requestsInfo[i].first_token_index_in_request = 0;
         new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
         new_bc.requestsInfo[i].request_guid = new_request.guid;
         new_bc.requestsInfo[i].num_tokens_in_batch =
@@ -536,9 +536,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
         profile_info.start_time = Realm::Clock::current_time_in_microseconds();
         profiling_requests[new_request.guid] = profile_info;
         for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
+          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
           assert(depth < new_request.tokens.size());
           new_bc.tokensInfo[new_bc.num_tokens].token_id =
               new_request.tokens[depth];
@@ -632,7 +632,7 @@ TreeSearchBatchConfig
 
     while (result_index < old_bc.num_tokens &&
            old_bc.tokensInfo[result_index].request_index == i) {
-      int abs_depth = old_bc.tokensInfo[result_index].abs_depth_in_request;
+      int abs_depth = old_bc.tokensInfo[result_index].abs_index_in_request;
       int token_id = result.token_ids[result_index];
 
       if (request.status == Request::PENDING) {
@@ -755,7 +755,7 @@ TreeSearchBatchConfig
         num_active_req++;
 
         // Normal Request Info
-        new_bc.requestsInfo[i].first_token_depth_in_request =
+        new_bc.requestsInfo[i].first_token_index_in_request =
             verified_tokens.front().second;
         new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
         new_bc.requestsInfo[i].request_guid =
@@ -768,7 +768,7 @@ TreeSearchBatchConfig
         // TODO: Beam Request Info, missing from VerifyTreeBatchConfig
         int new_max_depth =
             new_bc.requestsInfo[i].max_sequence_length -
-            new_bc.requestsInfo[i].first_token_depth_in_request -
+            new_bc.requestsInfo[i].first_token_index_in_request -
             verified_tokens.size();
         new_bc.beamRequestsInfo[i].current_depth = 1;
 
@@ -804,7 +804,7 @@ TreeSearchBatchConfig
           // Normal Token Info
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
           new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               token.second;
 
           // Beam Token Info
@@ -839,7 +839,7 @@ TreeSearchBatchConfig
       assert(request.ssm_cache_size == request.initial_len);
 
       // Normal Request Info
-      new_bc.requestsInfo[i].first_token_depth_in_request =
+      new_bc.requestsInfo[i].first_token_index_in_request =
           request.ssm_cache_size;
       new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
       new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid;
@@ -890,7 +890,7 @@ TreeSearchBatchConfig
         pending_request_queue.pop();
         // all_requests[new_request.guid] = new_request;
         num_active_req++;
-        new_bc.requestsInfo[i].first_token_depth_in_request = 0;
+        new_bc.requestsInfo[i].first_token_index_in_request = 0;
         new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
         new_bc.requestsInfo[i].request_guid = new_request.guid;
         new_bc.requestsInfo[i].num_tokens_in_batch =
@@ -935,9 +935,9 @@ TreeSearchBatchConfig
         new_bc.sub_requests[i] = 1;
 
         for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j;
+          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
           assert(depth < new_request.tokens.size());
           new_bc.tokensInfo[new_bc.num_tokens].token_id =
               new_request.tokens[depth];
@@ -1045,26 +1045,28 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
   TreeSearchBatchConfig new_bc;
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
-
   new_bc.num_tokens = 0;
+  new_bc.num_available_requests = 0;
 
-  // TODO: check if we should use BatchConfig::MAX_NUM_REQUESTS or some variable
-  // storing the current active requests
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
+    if (!request_available[request_index]) {
+      new_bc.request_available[request_index] = false;
+      continue;
+    }
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
-    // TODO: check this!
     assert(request.status == Request::RUNNING);
-    new_bc.request_available[request_index] = false;
+    new_bc.request_available[request_index] = true;
+    new_bc.num_available_requests++;
     // TODO
     int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request +
                            old_bc.requestsInfo[i].num_tokens_in_batch;
-    new_bc.requestsInfo[request_index].first_token_depth_in_request =
+    new_bc.requestsInfo[request_index].first_token_index_in_request =
         processed_tokens;
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
-    new_bc.requestsInfo[request_index].request_guid = guid;
+    // TODO: check profiling
     profiling_requests[request.guid].ssm_decoding_steps += 1;
 
     // Fill in the tokens
@@ -1072,17 +1074,17 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     if (token_tree.tree_layers.size() <= current_speculation_step) {
       // This request has no token to decode in this and the following small
       // model inference steps
-      new_bc.tree_requests_info[request_index].num_tokens_at_depth = 0;
+      new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
           token_tree.tree_layers.at(current_speculation_step);
-      new_bc.tree_requests_info[request_index].num_tokens_at_depth =
+      new_bc.requestsInfo[request_index].num_tokens_in_batch =
           current_layer.size();
       for (auto &node_ptr : current_layer) {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
         // TODO: check this!
-        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
             request.tokens.size() + current_speculation_step;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
         new_bc.num_tokens++;
@@ -1096,7 +1098,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
 
   // TODO: how do we know how many reqeusts are in the speculative phase if the
   // batch is not full? how many requests is in speculative phase
-  new_bc.speculative_request_num = num_active_req + 1;
+  new_bc.num_available_requests = num_active_req + 1;
   if (verbose) {
     std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
     new_bc.print();
@@ -1210,7 +1212,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
       }
 
       // Normal Request Info
-      new_bc.requestsInfo[i].first_token_depth_in_request =
+      new_bc.requestsInfo[i].first_token_index_in_request =
           dfs_tree_inputs.front().second;
       new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
       new_bc.requestsInfo[i].request_guid =
@@ -1265,7 +1267,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
       // Incremental phase: only add the last committed token
       new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
       new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back();
-      new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
           request.tokens.size() - 1;
 
       new_bc.num_tokens++;
@@ -1277,7 +1279,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
         break;
       }
 
-      new_bc.requestsInfo[i].first_token_depth_in_request =
+      new_bc.requestsInfo[i].first_token_index_in_request =
           request.tokens.size() - 1;
 
       bool cutLayer = false;
@@ -1291,7 +1293,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
         // Normal Token Info
         new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first;
-        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
             token.second;
 
         new_bc.num_tokens++;
@@ -1311,8 +1313,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
           new_bc.num_tokens--;
           new_bc.requestsInfo[i].num_tokens_in_batch--;
           // std::cout << "cut: " << j << "\n";
-          if (new_bc.tokensInfo[j].abs_depth_in_request !=
-              new_bc.tokensInfo[j - 1].abs_depth_in_request) {
+          if (new_bc.tokensInfo[j].abs_index_in_request !=
+              new_bc.tokensInfo[j - 1].abs_index_in_request) {
             break;
           }
         }
@@ -1352,7 +1354,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
              sizeof(BatchConfig::BitMask));
 
       // Normal Request Info
-      new_bc.requestsInfo[i].first_token_depth_in_request =
+      new_bc.requestsInfo[i].first_token_index_in_request =
           request.llm_cache_size;
       new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
       new_bc.requestsInfo[i].request_guid =
@@ -1366,7 +1368,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
       new_bc.requestsInfo[i].num_tokens_in_batch =
           std::min(max_prompt_load_size,
                    (int)request.initial_len -
-                       new_bc.requestsInfo[i].first_token_depth_in_request);
+                       new_bc.requestsInfo[i].first_token_index_in_request);
       max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch;
 
       std::cout << "max_prompt_load_size: " << max_prompt_load_size
@@ -1381,7 +1383,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
           new_bc.tokensInfo[new_bc.num_tokens].token_id =
               request.tokens[request.llm_cache_size + j];
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.llm_cache_size + j;
           new_bc.num_tokens++;
         }
@@ -1419,7 +1421,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
 
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
           new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back();
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.tokens.size() - 1;
 
           new_bc.num_tokens++;
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index d4c9f89a3..adced990e 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -160,7 +160,7 @@ void RequestManager::load_positions_task(
   int dram_copy[BatchConfig::MAX_NUM_TOKENS];
 
   for (int i = 0; i < batch_config->num_tokens; i++) {
-    dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset;
+    dram_copy[i] = batch_config->tokensInfo[i].abs_index_in_request + offset;
   }
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index d460cd933..c5fd6b3a7 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -186,7 +186,7 @@ void RequestManager::load_positions_task(
   int dram_copy[BatchConfig::MAX_NUM_TOKENS];
 
   for (int i = 0; i < batch_config->num_tokens; i++) {
-    dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset;
+    dram_copy[i] = batch_config->tokensInfo[i].abs_index_in_request + offset;
   }
 
   cudaStream_t stream;
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index c24ffad84..ff3308da3 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -82,7 +82,7 @@ std::ostream &
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
          << tree_search_batch_config.requestsInfo[i]
-                .first_token_depth_in_request
+                .first_token_index_in_request
          << std::endl;
       os << "    First token offset in batch: "
          << tree_search_batch_config.requestsInfo[i].first_token_offset_in_batch
@@ -108,7 +108,7 @@ std::ostream &
   for (int i = 0; i < tree_search_batch_config.num_tokens; i++) {
     os << "  Token " << i << ":\n";
     os << "    Absolute depth in request: "
-       << tree_search_batch_config.tokensInfo[i].abs_depth_in_request
+       << tree_search_batch_config.tokensInfo[i].abs_index_in_request
        << std::endl;
     os << "    Request index: "
        << tree_search_batch_config.tokensInfo[i].request_index << std::endl;
diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc
index eeb015a6c..7df2d3307 100644
--- a/src/runtime/tree_verify_batch_config.cc
+++ b/src/runtime/tree_verify_batch_config.cc
@@ -48,7 +48,7 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
     if (!bc.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
-         << bc.requestsInfo[i].first_token_depth_in_request << std::endl;
+         << bc.requestsInfo[i].first_token_index_in_request << std::endl;
       os << "    First token offset in batch: "
          << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
       os << "    Number of tokens in batch: "
@@ -65,7 +65,7 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
   for (int i = 0; i < bc.num_tokens; i++) {
     os << "  Token " << i << ":\n";
     os << "    Absolute depth in request: "
-       << bc.tokensInfo[i].abs_depth_in_request << std::endl;
+       << bc.tokensInfo[i].abs_index_in_request << std::endl;
     os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
     os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
   }

From 4a3c0ebd1ae166d1d4b802212a58102af6af0a39 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 12 Apr 2024 22:33:00 -0400
Subject: [PATCH 048/667] Fix some TODOs in prepare_next_batch_spec

---
 include/flexflow/batch_config.h    |  1 -
 include/flexflow/request_manager.h |  4 ++++
 src/runtime/request_manager.cc     | 30 ++++++++++++++++++------------
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index f67eaa404..c3f72e6c4 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -60,7 +60,6 @@ class BatchConfig {
   static int const MAX_NUM_TOKENS = 1024;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
 
-  //  Set by update
   int num_tokens;
   int num_available_requests;
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 042601288..6e0976449 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -106,7 +106,10 @@ struct CompareSharedTokenTreeNodePtrRequestGuidPair {
 class TokenTree {
 public:
   std::vector<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  // The numebr of tokens in the tree that are not pruned
   int tree_size = 0;
+  // The numebr of tokens in the tree including the pruned ones
+  int tree_node_size = 0;
   void add_layer() {
     tree_layers.emplace_back();
   }
@@ -391,6 +394,7 @@ class RequestManager {
   // Maps the index of the request in the batch config to the request guid.
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
+  int num_available_requests = 0;
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f2311769c..8db91f8c1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1038,6 +1038,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     std::cout << "Number of tokens in each requests: " << std::endl;
   }
 
+  // TODO: separate this
   // Store small model's inference result to the token tree struct
   store_ssm_inference_results(ssm_inference_result);
 
@@ -1059,14 +1060,9 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
     new_bc.num_available_requests++;
-    // TODO
-    int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request +
-                           old_bc.requestsInfo[i].num_tokens_in_batch;
-    new_bc.requestsInfo[request_index].first_token_index_in_request =
-        processed_tokens;
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
-    // TODO: check profiling
+    // TODO: check this profiling
     profiling_requests[request.guid].ssm_decoding_steps += 1;
 
     // Fill in the tokens
@@ -1075,19 +1071,30 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
       // This request has no token to decode in this and the following small
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.tokens.size() + token_tree.tree_node_size;
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
           token_tree.tree_layers.at(current_speculation_step);
+      // Exclude the current layer from the token tree, because we want the
+      // start index
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.tokens.size() + token_tree.tree_node_size -
+          current_layer.size();
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
           current_layer.size();
-      for (auto &node_ptr : current_layer) {
+
+      int child_index = 0;
+      for (auto const &node_ptr : current_layer) {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-        // TODO: check this!
         new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-            request.tokens.size() + current_speculation_step;
+            new_bc.requestsInfo[request_index].first_token_index_in_request +
+            child_index;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
+
         new_bc.num_tokens++;
+        child_index++;
       }
     }
 
@@ -1096,9 +1103,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
     new_bc.causalMask[request_index] = request.causal_mask;
   }
 
-  // TODO: how do we know how many reqeusts are in the speculative phase if the
-  // batch is not full? how many requests is in speculative phase
-  new_bc.num_available_requests = num_active_req + 1;
+  new_bc.num_available_requests = num_available_requests;
   if (verbose) {
     std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
     new_bc.print();
@@ -2477,6 +2482,7 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
         .tree_layers[current_speculation_step]
         .push_back(node_ptr);
     speculative_token_tree.tree_size++;
+    speculative_token_tree.tree_node_size++;
   }
 }
 

From 66d25eecfc0a8499ec3104ec67158e31b85ed09f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 11:26:46 -0400
Subject: [PATCH 049/667] Define several request manager states

---
 include/flexflow/request_manager.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a38a3b267..2facef4dc 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -97,14 +97,15 @@ struct BeamTree {
 class RequestManager {
 public:
   enum Status {
-    INITIALIZED = 1001,
-    SERVING = 1002,
-    TERMINATED = 1003,
+    PREFILLING = 1001,
+    DECODING = 1002,
+    SSM_SPEC = 1003,
+    LLM_VERIFY = 1004,
   };
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
 
-  static const RequestGuid INVALID_GUID = 0;
+  static RequestGuid const INVALID_GUID = 0;
   RequestManager();
   static RequestManager *get_request_manager();
   size_t get_num_processed_requests();

From 9b7162cda0c7f44e4df85a89e2dae12cdd5e9ef4 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 11:46:54 -0400
Subject: [PATCH 050/667] Add back back ground server status to avoid the
 invalidation of some related functions

---
 include/flexflow/request_manager.h |  6 ++++
 src/runtime/request_manager.cc     | 45 ++++++++++++++----------------
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 2facef4dc..8f018a6ce 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -102,6 +102,11 @@ class RequestManager {
     SSM_SPEC = 1003,
     LLM_VERIFY = 1004,
   };
+  enum BackgroundServerStatus {
+    INITIALIZED = 2001,
+    SERVING = 2002,
+    TERMINATED = 2003,
+  };
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
 
@@ -265,6 +270,7 @@ class RequestManager {
   int max_spec_tree_token_num;
   int max_sequence_length;
   Status request_manager_status;
+  BackgroundServerStatus background_server_status;
 
   // tree width in each speculative step, if not specified 1
   std::vector<int> spec_infer_tree_width;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 16513e918..340fa5177 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -44,7 +44,7 @@ std::string LoadBytesFromFile(std::string const &path) {
 }
 
 RequestManager::RequestManager()
-    : request_manager_status(INITIALIZED), verbose(false),
+    : background_server_status(INITIALIZED), verbose(false),
       next_available_guid(1000000), num_processed_requests(0),
       total_request_run_time(0.0f) {
   // The following config parameters are set
@@ -184,7 +184,7 @@ size_t RequestManager::get_num_ssms() {
 RequestManager::RequestGuid
     RequestManager::register_new_request(std::vector<TokenId> const &prompt,
                                          int max_sequence_length) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   // Add a new request
   Request request;
@@ -219,7 +219,7 @@ RequestManager::RequestGuid
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
   }
 
@@ -244,7 +244,7 @@ RequestManager::RequestGuid
 RequestManager::RequestGuid
     RequestManager::register_new_request(std::string const &prompt,
                                          int max_sequence_length) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   // Add a new request
   Request request;
   request.status = Request::PENDING;
@@ -283,7 +283,7 @@ RequestManager::RequestGuid
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
   }
 
@@ -307,7 +307,7 @@ RequestManager::RequestGuid
 }
 
 bool RequestManager::is_request_completed(RequestGuid const &guid) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   assert(all_requests.find(guid) != all_requests.end());
   Request const &request = all_requests[guid];
   // return request.tokens.size() >= request.max_sequence_length;
@@ -319,7 +319,7 @@ GenerationResult
   // First get the future of the request
   std::future<void> future;
   {
-    const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     assert(request_to_promise.find(guid) != request_to_promise.end());
     future = request_to_promise[guid]->get_future();
   }
@@ -327,7 +327,7 @@ GenerationResult
   future.get();
   // Get the generation result
   {
-    const std::lock_guard<std::mutex> lock(request_queue_mutex);
+    std::lock_guard<std::mutex> const lock(request_queue_mutex);
     assert(request_generation_results.find(guid) !=
            request_generation_results.end());
     return request_generation_results[guid];
@@ -365,7 +365,7 @@ BatchConfig RequestManager::prepare_next_batch_task(
 
 BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
                                                InferenceResult const &result) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   // Step 1: append result from previous iteration to request's tokens
   for (int i = 0; i < old_bc.num_tokens; i++) {
@@ -591,7 +591,7 @@ BeamSearchBatchConfig
     RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
                                             InferenceResult const &result,
                                             int model_id) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
   }
@@ -1026,13 +1026,12 @@ BeamSearchBatchConfig RequestManager::prepare_next_batch_beam_task(
 BeamSearchBatchConfig
     RequestManager::prepare_next_batch_beam(BeamSearchBatchConfig const &old_bc,
                                             BeamInferenceResult const &result) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_beam ###############\n";
   }
   if (verbose) {
-    std::cout << "print all results"
-              << "\n";
+    std::cout << "print all results" << "\n";
     for (int i = 0; i < 40; i++) {
       std::cout << result.token_ids[i] << ", ";
     }
@@ -1340,7 +1339,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task(
 
 TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
     std::vector<BeamSearchBatchConfig> const &old_batches) {
-  const std::lock_guard<std::mutex> lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   if (verbose) {
     std::cout
@@ -1692,8 +1691,7 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc,
       if (depth == 1) {
         // store the last input into the tree;
         if (verbose) {
-          std::cout << "try to store the input"
-                    << "\n";
+          std::cout << "try to store the input" << "\n";
         }
 
         request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] =
@@ -1950,8 +1948,7 @@ bool PreOrder(
     if (verbose) {
       std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id]
                 << "\n";
-      std::cout << "return true"
-                << "\n";
+      std::cout << "return true" << "\n";
     }
     return true;
   }
@@ -2315,8 +2312,8 @@ std::vector<GenerationResult>
 }
 
 void RequestManager::start_background_server(FFModel *model) {
-  assert(request_manager_status == INITIALIZED);
-  request_manager_status = SERVING;
+  assert(background_server_status == INITIALIZED);
+  background_server_status = SERVING;
   // Start background task
   Runtime *runtime = Runtime::get_runtime();
   Context ctx = Runtime::get_context();
@@ -2520,7 +2517,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
 
 void RequestManager::trigger_request_completion_future(
     RequestGuid const &guid) {
-  const std::lock_guard<std::mutex> lock(request_to_promise_mutex);
+  std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
   assert(request_to_promise.find(guid) != request_to_promise.end());
   // Set the completion promise in case other threads are waiting
   request_to_promise[guid]->set_value();
@@ -2533,8 +2530,8 @@ void RequestManager::terminate_background_server_at_exit() {
 }
 
 void RequestManager::terminate_background_server() {
-  if (request_manager_status == SERVING) {
-    request_manager_status = TERMINATED;
+  if (background_server_status == SERVING) {
+    background_server_status = TERMINATED;
     // Wait for the background server to terminate
     Runtime *runtime = Runtime::get_runtime();
     Context ctx = Runtime::get_context();
@@ -2543,7 +2540,7 @@ void RequestManager::terminate_background_server() {
 }
 
 bool RequestManager::is_background_server_terminated() {
-  return request_manager_status == TERMINATED;
+  return background_server_status == TERMINATED;
 }
 
 RequestManager *request_manager_singleton = nullptr;

From e54514b68f2a6c6042c102e56b77f79594b150d0 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 13:54:18 -0400
Subject: [PATCH 051/667] Add the two APIs

---
 include/flexflow/request_manager.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8f018a6ce..e20612e87 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -263,6 +263,9 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
+  void update_inference_results(std::vector<InferenceResult> const &results);
+  BatchConfig get_next_batch_config();
+
 private:
   // configuration parameters
   int max_requests_per_batch;

From 660924f506a859dc02436ab64f17ad631a1c97ce Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 16:24:46 -0400
Subject: [PATCH 052/667] Adjust the API update_inference_results

---
 include/flexflow/request_manager.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e20612e87..4cb909288 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -263,7 +263,7 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void update_inference_results(std::vector<InferenceResult> const &results);
+  void update_inference_results(InferenceResult const &results);
   BatchConfig get_next_batch_config();
 
 private:

From e38b0e4847e1fd904cb30b2d9431d88323e1d2a3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 16:29:08 -0400
Subject: [PATCH 053/667] Add an API update_inference_results for storing small
 model inference results specifically

---
 include/flexflow/request_manager.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index edbb4581d..ff48bc20e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -368,6 +368,7 @@ class RequestManager {
   /* New APIs */
 
   void update_inference_results(InferenceResult const &results);
+  void update_inference_results(SsmInferenceResult const &results);
   BatchConfig get_next_batch_config();
 
 private:

From ee9f89b925befdb6fbfdd62605c8e91ce2972a4d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 16:35:50 -0400
Subject: [PATCH 054/667] Rename store_ssm_inference_results to
 update_inference_results.

---
 include/flexflow/request_manager.h | 4 +---
 src/runtime/request_manager.cc     | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ff48bc20e..c0997f333 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -216,8 +216,6 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  bool store_ssm_inference_results(
-      SsmInferenceResult const &ssm_inference_result);
   void update_beam_metadata(TreeSearchBatchConfig &new_bc,
                             TreeSearchBatchConfig const &old_bc,
                             BeamTree &tree,
@@ -368,7 +366,7 @@ class RequestManager {
   /* New APIs */
 
   void update_inference_results(InferenceResult const &results);
-  void update_inference_results(SsmInferenceResult const &results);
+  bool update_inference_results(SsmInferenceResult const &ssm_inference_result);
   BatchConfig get_next_batch_config();
 
 private:
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 40dcfc51f..efaecdc67 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1040,7 +1040,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
 
   // TODO: separate this
   // Store small model's inference result to the token tree struct
-  store_ssm_inference_results(ssm_inference_result);
+  update_inference_results(ssm_inference_result);
 
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
@@ -1483,7 +1483,7 @@ void RequestManager::initialize_root_of_spec_token_trees() {
   }
 }
 
-bool RequestManager::store_ssm_inference_results(
+bool RequestManager::update_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.

From 8c8c286583eb5adc7b291c42e49e598c31b8cc7d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 17:08:18 -0400
Subject: [PATCH 055/667] Remove old API update_beam_metadata

---
 include/flexflow/request_manager.h |  5 ---
 src/runtime/request_manager.cc     | 60 ++----------------------------
 2 files changed, 3 insertions(+), 62 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c0997f333..d638de8dc 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -216,11 +216,6 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void update_beam_metadata(TreeSearchBatchConfig &new_bc,
-                            TreeSearchBatchConfig const &old_bc,
-                            BeamTree &tree,
-                            int request_index);
-
   std::vector<std::pair<BatchConfig::TokenId, int>>
       traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index efaecdc67..2cf8efa6b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1492,9 +1492,9 @@ bool RequestManager::update_inference_results(
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
 
-  // TODO: here we assume that the order of the tokens in the last
+  // Here we assume that the order of the tokens in the last
   // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to the
-  // order of the request in the last TreeSearchBatchConfig, check this!
+  // order of the request in the last TreeSearchBatchConfig
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1539,61 +1539,7 @@ bool RequestManager::update_inference_results(
   }
 }
 
-// for updating the beam search metadata in requests in incremental phase
-[[deprecated("I don't think this function is used anymore")]]
-void RequestManager::update_beam_metadata(TreeSearchBatchConfig &new_bc,
-                                          TreeSearchBatchConfig const &old_bc,
-                                          BeamTree &tree,
-                                          int request_index) {
-
-  // do the exchange
-  if (new_bc.request_available[request_index]) {
-    assert(false);
-  }
-  int depth = new_bc.beamRequestsInfo[request_index].current_depth - 1;
-  int beam_size = new_bc.beamRequestsInfo[request_index].beam_size;
-
-  // int leaf_node_num = old_bc.sub_requests[request_index];
-  int leaf_node_num = new_bc.beamRequestsInfo[request_index].sub_request_num;
-
-  if (new_bc.beamRequestsInfo[request_index].current_depth ==
-      1) { // TODO: check if this is correct
-    // for (int j = 0; j < beam_size; j++) {
-    //   new_bc.beamRequestsInfo[request_index].parent_id[j] = j;
-    //   new_bc.beamRequestsInfo[request_index].probs[j] =
-    //       tree.treeLayers[depth].probs[j]; // ?
-    //   new_bc.beamRequestsInfo[request_index].tokens[j] =
-    //       tree.treeLayers[depth].tokens[j]; // ?
-    // }
-    // Do nothing
-    // assert(false);
-  } else {
-    for (int j = 0; j < leaf_node_num; j++) {
-      new_bc.beamRequestsInfo[request_index].parent_id[j] =
-          tree.treeLayers[depth].parent_ids[j];
-      new_bc.beamRequestsInfo[request_index].probs[j] =
-          tree.treeLayers[depth].probs[j];
-      new_bc.beamRequestsInfo[request_index].tokens[j] =
-          tree.treeLayers[depth].tokens[j];
-      // std::cout << "token: " << j << ": "
-      //           << new_bc.beamRequestsInfo[request_index].tokens[j] <<
-      //           "\n";
-    }
-  }
-  if (verbose) {
-    std::cout << "-----------after parent id exchange-----------" << std::endl;
-    for (int j = 0; j < beam_size; j++) {
-      std::cout << "after request id: " << request_index << "beam id = " << j
-                << "parent: "
-                << new_bc.beamRequestsInfo[request_index].parent_id[j]
-                << "token: " << new_bc.beamRequestsInfo[request_index].tokens[j]
-                << "probs: " << new_bc.beamRequestsInfo[request_index].probs[j]
-                << std::endl;
-    }
-  }
-}
-
-// bit mask related function
+// bitmask related functions
 
 // prompt phase, init task
 void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,

From 7688f3761fc226f9fdf6507aad37c2daf9d2f86a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 21:11:15 -0400
Subject: [PATCH 056/667] Modified the name of the APIs from
 prepare_batch_config_xxx to get_xxx_batch_config

---
 include/flexflow/request_manager.h | 122 +++++++++++++----------------
 src/runtime/model.cc               |  18 ++---
 src/runtime/request_manager.cc     |  26 +++---
 3 files changed, 77 insertions(+), 89 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d638de8dc..f3e18851f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -199,15 +199,15 @@ class RequestManager {
                               Legion::Context ctx,
                               Legion::Runtime *runtime);
   TreeSearchBatchConfig
-      prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
-                              InferenceResult const &result,
-                              int model_id);
+      get_first_spec_batch_config(TreeVerifyBatchConfig const &old_bc,
+                                  InferenceResult const &result,
+                                  int model_id);
   TreeSearchBatchConfigFuture
-      prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
-                              InferenceResultFuture const &result,
-                              int model_id,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
+      get_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
+                                  InferenceResultFuture const &result,
+                                  int model_id,
+                                  Legion::Context ctx,
+                                  Legion::Runtime *runtime);
 
   TreeVerifyBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
@@ -222,48 +222,61 @@ class RequestManager {
                          int first_token_depth_in_request);
   /* Old APIs for reference */
 
-  /* New APIs */
-  // Given the last speculation result, prepare the next speculation batch.
-  TreeSearchBatchConfig
-      prepare_next_batch_spec(SsmInferenceResult const &ssm_inference_result);
+  /*********** New APIs ***********/
+  // Prepare the next speculation batch config. This function is called before
+  // the second step of the speculation.
+  TreeSearchBatchConfig get_next_spec_batch_config();
+
   // A wrapper function.
   TreeSearchBatchConfigFuture
-      prepare_next_batch_spec(TreeSearchBatchConfigFuture const &old_bc,
-                              SsmInferenceResultFuture const &result,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
-  // Given the verification result, prepare the first speculation batch.
-  TreeSearchBatchConfig
-      prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
-                              InferenceResult const &result,
-                              int model_id);
+      get_next_spec_batch_config(TreeSearchBatchConfigFuture const &old_bc,
+                                 SsmInferenceResultFuture const &result,
+                                 Legion::Context ctx,
+                                 Legion::Runtime *runtime);
+
+  // A wrapper function.
+  static TreeSearchBatchConfig get_next_spec_batch_config_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+
+  // Prepare the first speculation batch config. This function is called before
+  // the first step of the speculation. The difference with
+  // prepare_next_batch_config_spec is that we put the info of the committed
+  // tokens into the batch config in the first speculation step to commit the KV
+  // cache of the small model.
+  TreeSearchBatchConfig get_first_spec_batch_config();
+
   // A wrapper function.
   TreeSearchBatchConfigFuture
-      prepare_next_batch_init(TreeVerifyBatchConfigFuture const &old_bc,
-                              InferenceResultFuture const &result,
-                              int model_id,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
-  // Given the speculation result, prepare the verification batch.
-  TreeSearchBatchConfig prepare_next_batch_verify(
-      std::vector<TreeSearchBatchConfig> const &old_batches);
+      get_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
+                                  InferenceResultFuture const &result,
+                                  int model_id,
+                                  Legion::Context ctx,
+                                  Legion::Runtime *runtime);
+
+  // A wrapper function.
+  static TreeSearchBatchConfig get_first_spec_batch_config_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+
+  TreeVerifyBatchConfig prepare_verify_batch_config();
+
   // A wrapper function.
-  TreeSearchBatchConfigFuture prepare_next_batch_verify(
+  TreeVerifyBatchConfigFuture prepare_verify_batch_config(
       std::vector<TreeSearchBatchConfigFuture> const &old_batches,
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  // This function takes the small model inference results and the last
-  // speculation batch config and use the information to update the token tree
-  // stored in RequestManager::all_requests.
-  void store_spec_metadata(TreeSearchBatchConfig const &old_bc,
-                           SsmInferenceResult const &result);
-  // Put the last layer of the token tree stored in RequestManager::all_requests
-  // into new_bc::beamRequestsInfo .
-  void update_spec_metadata(TreeSearchBatchConfig &new_bc,
-                            TreeSearchBatchConfig const &old_bc,
-                            Token &tree,
-                            int request_index);
+  static TreeVerifyBatchConfig get_verify_batch_config_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  /*********** New APIs ***********/
 
   // This function takes the tree stored in the token trees in
   // RequestManager::all_requests, and convert them into serialized version.
@@ -272,7 +285,6 @@ class RequestManager {
       traverse_spec_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
-  /* New APIs */
 
   // remove guid after put the cached tree in request
   std::vector<std::pair<BatchConfig::TokenId, int>> merge_dfs_trees(
@@ -323,28 +335,12 @@ class RequestManager {
       Legion::Runtime *runtime);
 
   // A wrapper function.
-  static TreeSearchBatchConfig prepare_next_batch_init_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-  /* Old APIs for reference */
-
-  /* New APIs */
-  static TreeSearchBatchConfig prepare_next_batch_spec_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  static TreeSearchBatchConfig prepare_next_batch_init_task(
+  static TreeSearchBatchConfig get_first_spec_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
-  /* New APIs */
 
-  /* Old APIs for reference */
   static TreeVerifyBatchConfig prepare_next_batch_verify_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
@@ -352,14 +348,6 @@ class RequestManager {
       Legion::Runtime *runtime);
   /* Old APIs for reference */
 
-  /* New APIs */
-  static TreeSearchBatchConfig prepare_next_batch_verify_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-  /* New APIs */
-
   void update_inference_results(InferenceResult const &results);
   bool update_inference_results(SsmInferenceResult const &ssm_inference_result);
   BatchConfig get_next_batch_config();
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index f782f0886..6550f890e 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4503,16 +4503,15 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     if (pre_register) {
       Runtime::preregister_task_variant<
           TreeSearchBatchConfig,
-          RequestManager::prepare_next_batch_init_task>(
+          RequestManager::get_first_spec_batch_config_task>(
           registrar, "RequestManager Prepare Next Batch (Init Beam) Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime
-          ->register_task_variant<TreeSearchBatchConfig,
-                                  RequestManager::prepare_next_batch_init_task>(
-              registrar);
+      runtime->register_task_variant<
+          TreeSearchBatchConfig,
+          RequestManager::get_first_spec_batch_config_task>(registrar);
     }
   }
   // RequestManager prepare_next_batch_verify
@@ -4525,15 +4524,16 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     if (pre_register) {
       Runtime::preregister_task_variant<
           TreeVerifyBatchConfig,
-          RequestManager::prepare_next_batch_verify_task>(
+          RequestManager::get_verify_batch_config_task>(
           registrar, "RequestManager Prepare Next Batch (Verify) Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<
-          TreeVerifyBatchConfig,
-          RequestManager::prepare_next_batch_verify_task>(registrar);
+      runtime
+          ->register_task_variant<TreeVerifyBatchConfig,
+                                  RequestManager::get_verify_batch_config_task>(
+              registrar);
     }
   }
   // RequestManager background serving task
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2cf8efa6b..a2e9ad0c3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -557,7 +557,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
-TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_init(
+TreeSearchBatchConfigFuture RequestManager::get_first_spec_batch_config(
     TreeVerifyBatchConfigFuture const &old_bc,
     InferenceResultFuture const &result,
     int model_id,
@@ -573,7 +573,7 @@ TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_init(
   return runtime->execute_task(ctx, launcher);
 }
 
-TreeSearchBatchConfig RequestManager::prepare_next_batch_init_task(
+TreeSearchBatchConfig RequestManager::get_first_spec_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -584,13 +584,13 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_init_task(
   InferenceResult const &result =
       Future(task->futures[1]).get_result<InferenceResult>();
   int model_id = Future(task->futures[2]).get_result<int>();
-  return rm->prepare_next_batch_init(bc, result, model_id);
+  return rm->get_first_spec_batch_config(bc, result, model_id);
 }
 
-TreeSearchBatchConfig
-    RequestManager::prepare_next_batch_init(TreeVerifyBatchConfig const &old_bc,
-                                            InferenceResult const &result,
-                                            int model_id) {
+TreeSearchBatchConfig RequestManager::get_first_spec_batch_config(
+    TreeVerifyBatchConfig const &old_bc,
+    InferenceResult const &result,
+    int model_id) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
@@ -995,7 +995,7 @@ TreeSearchBatchConfig
 }
 
 /***** Beam Search Phase *****/
-TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_spec(
+TreeSearchBatchConfigFuture RequestManager::get_next_spec_batch_config(
     TreeSearchBatchConfigFuture const &old_bc,
     SsmInferenceResultFuture const &result,
     Context ctx,
@@ -1009,7 +1009,7 @@ TreeSearchBatchConfigFuture RequestManager::prepare_next_batch_spec(
   return runtime->execute_task(ctx, launcher);
 }
 
-TreeSearchBatchConfig RequestManager::prepare_next_batch_spec_task(
+TreeSearchBatchConfig RequestManager::get_next_spec_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -1023,7 +1023,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_batch_spec_task(
 }
 
 // update beam search metadata
-TreeSearchBatchConfig RequestManager::prepare_next_batch_spec(
+TreeSearchBatchConfig RequestManager::get_next_spec_batch_config(
     SsmInferenceResult const &ssm_inference_result) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
@@ -1141,7 +1141,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task(
 }
 
 /* New APIs */
-TreeSearchBatchConfig RequestManager::prepare_next_batch_verify_task(
+TreeSearchBatchConfig RequestManager::get_verify_batch_config_task(
     Legion::Task const *task,
     std::vector<Legion::PhysicalRegion> const &regions,
     Legion::Context ctx,
@@ -1451,7 +1451,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
 }
 
 /* New APIs */
-TreeSearchBatchConfig RequestManager::prepare_next_batch_verify(
+TreeSearchBatchConfig RequestManager::prepare_verify_batch_config(
     std::vector<TreeSearchBatchConfig> const &old_batches) {
   if (verbose) {
     std::cout
@@ -2258,7 +2258,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
       }
     }
     auto const &next_batch = batch_pipeline.back();
-    TreeSearchBatchConfigFuture beam_bcf = prepare_next_batch_init(
+    TreeSearchBatchConfigFuture beam_bcf = get_first_spec_batch_config(
         next_batch.first, next_batch.second, 0, ctx, runtime);
     std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
     for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {

From c3550e9e0c8cb0465bbeeac3aeb3fafe6d1bef6b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 16 Apr 2024 21:27:53 -0400
Subject: [PATCH 057/667] 1. Removed the the old API append_bitmask. 2.
 Modified the implementation of get_next_spec_batch_config, removed the part
 overlap with update_inference_results. 3. Add code to maintain the bit mask
 in update_inference_results. 4. Remove the argument BatchConfig::BitMask
 &bitmask from append_bitmask.

---
 include/flexflow/request_manager.h |  6 +-
 src/runtime/request_manager.cc     | 94 ++----------------------------
 2 files changed, 6 insertions(+), 94 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f3e18851f..a72c0550e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -155,11 +155,7 @@ class RequestManager {
   void register_output_filepath(std::string const &);
   void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
   void init_bitmask(BatchConfig::BitMask &bitmask, int initLength);
-  void append_bitmask(BatchConfig::BitMask &bitmask,
-                      int newNodes,
-                      BeamTree const tree,
-                      int currentDepth);
-  void append_bitmask(RequestGuid guid, BatchConfig::BitMask &bitmask);
+  void append_bitmask(RequestGuid guid);
   void update_bitmask(BatchConfig::BitMask &bitmask,
                       int initLength,
                       int non_tree_size);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a2e9ad0c3..1c61c10df 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -994,7 +994,7 @@ TreeSearchBatchConfig RequestManager::get_first_spec_batch_config(
   return new_bc;
 }
 
-/***** Beam Search Phase *****/
+/***** Speculative Decoding Phase *****/
 TreeSearchBatchConfigFuture RequestManager::get_next_spec_batch_config(
     TreeSearchBatchConfigFuture const &old_bc,
     SsmInferenceResultFuture const &result,
@@ -1022,26 +1022,12 @@ TreeSearchBatchConfig RequestManager::get_next_spec_batch_config_task(
   return rm->prepare_next_batch_beam(bc, result);
 }
 
-// update beam search metadata
-TreeSearchBatchConfig RequestManager::get_next_spec_batch_config(
-    SsmInferenceResult const &ssm_inference_result) {
+TreeSearchBatchConfig RequestManager::get_next_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_spec ###############\n";
-  }
-  if (verbose) {
-    std::cout << "print all results" << "\n";
-    for (int i = 0; i < 40; i++) {
-      std::cout << ssm_inference_result.token_ids[i] << ", ";
-    }
     std::cout << "Current tree depth: " << current_speculation_step << "\n";
-    std::cout << "Number of tokens in each requests: " << std::endl;
   }
-
-  // TODO: separate this
-  // Store small model's inference result to the token tree struct
-  update_inference_results(ssm_inference_result);
-
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
   // We assume that only one small model is in use now
@@ -1536,6 +1522,7 @@ bool RequestManager::update_inference_results(
         }
       }
     }
+    append_bitmask(guid);
   }
 }
 
@@ -1588,81 +1575,10 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
   //           << "\n";
 }
 
-// prepare next beam, append layers to the tree
-void RequestManager::append_bitmask(BatchConfig::BitMask &bitmask,
-                                    int newNodes,
-                                    BeamTree const tree,
-                                    int currentDepth) {
-  int pre_tree_size = bitmask.tree_size;
-  bitmask.tree_size += newNodes;
-  bitmask.layer_size = newNodes;
-  assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM &&
-         "do not support tree size > 64");
-  // preBeamSize: replicate num
-
-  // add relationship with input/prompt
-  for (int i = 0; i < bitmask.prompt_size; i++) {
-    for (int j = pre_tree_size; j < bitmask.tree_size; j++) {
-      bitmask.mask[i] |= (1 << j);
-      // std::cout << "see bit mask append: " << i << ", to" << j
-      //           << std::bitset<64>(bitmask.mask[i]) << "\n";
-    }
-  }
-
-  // std::cout << "bitmask.tree_size: " << bitmask.tree_size << ", "
-  //           << pre_tree_size << ", " << bitmask.prompt_size << ", "
-  //           << preBeamSize << "\n";
-
-  // int num_groups = newNodes / preBeamSize;
-  // int group_size = newNodes / num_groups;
-  // add relations to branch
-  // requests in same groups share same relations, except the last token.
-
-  // set middle layers
-  //  skip the root prompt/tokens
-  int token_idx = bitmask.prompt_size;
-  int new_nodes_start_idx = pre_tree_size;
-  // std::cout << "new nodes start " << new_nodes_start_idx << "\n";
-  for (int i = 1; i < currentDepth; i++) {
-    new_nodes_start_idx = pre_tree_size;
-    int nodes_this_layer = tree.treeLayers[i].nodes_num_this_layer;
-    // std::cout << "tree layer: " << i << " nodes:" << nodes_this_layer
-    //           << "group size: " << newNodes / nodes_this_layer << "\n";
-    for (int j = 0; j < nodes_this_layer; j++) {
-      int group_size = newNodes / nodes_this_layer;
-      for (int k = 0; k < group_size; k++) {
-        bitmask.mask[token_idx] |= (1 << new_nodes_start_idx);
-        new_nodes_start_idx += 1;
-      }
-      token_idx += 1;
-    }
-  }
-
-  assert(token_idx == pre_tree_size);
-  assert(currentDepth <= 1 || new_nodes_start_idx == bitmask.tree_size);
-
-  // assert(currentDepth <= 2);
-  // set last layer, all tokens are only relevant to it self;
-  for (int i = token_idx; i < bitmask.tree_size; i++) {
-    bitmask.mask[i] |= (1 << i);
-    // std::cout << "set rel: " << i << "to: " << i << "\n";
-  }
-
-  // if(bitmask.non_tree_cache_size == 19 && bitmask.tree_size > 2){
-  //   assert(false);
-  // }
-
-  // std::cout << "see bit mask append" << bitmask.prompt_size << "\n";
-  // std::cout << "see bit mask append" << bitmask.non_tree_cache_size <<
-  // "\n"; std::cout << "see bit mask append" <<
-  // std::bitset<64>(bitmask.mask[0])
-  //           << "\n";
-}
-
-void RequestManager::append_bitmask(RequestGuid guid,
-                                    BatchConfig::BitMask &bitmask) {
+void RequestManager::append_bitmask(RequestGuid guid) {
   // This function changes the bitmask in place
   Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
   std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
       request.speculative_token_trees[0].tree_layers.back();
   int new_layer_size = tree_layer.size();

From 2c4d4dadb9895bba39ba68ab8b036b071b2e6ce8 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 17 Apr 2024 00:24:12 -0400
Subject: [PATCH 058/667] create legion task for new API

---
 include/flexflow/model.h           |  1 +
 include/flexflow/request_manager.h | 21 +++++++--
 src/runtime/model.cc               | 20 ++++++++
 src/runtime/request_manager.cc     | 76 ++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 95be9ab58..5a2d51094 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -244,6 +244,7 @@ enum TaskIDs {
   RM_LOAD_TOKENS_TASK_ID,
   RM_LOAD_POSITION_TASK_ID,
   RM_LOAD_BATCH_CONFIG_TASK_ID,
+  RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
   RM_PREPARE_NEXT_BATCH_TASK_ID,
   RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
   RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e20612e87..2bc6ed6eb 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -96,7 +96,7 @@ struct BeamTree {
 
 class RequestManager {
 public:
-  enum Status {
+  enum State {
     PREFILLING = 1001,
     DECODING = 1002,
     SSM_SPEC = 1003,
@@ -263,8 +263,19 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  void update_inference_results(std::vector<InferenceResult> const &results);
-  BatchConfig get_next_batch_config();
+
+  // API for rm state machine
+  BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
+                                          Context ctx,
+                                          Runtime *runtime);
+  static BatchConfig get_next_batch_config_task(      
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  BatchConfig get_next_batch_config(InferenceResult const &result);
+  void update_inference_results(InferenceResult const &result);
+  BatchConfig prepare_next_batch();
 
 private:
   // configuration parameters
@@ -272,7 +283,7 @@ class RequestManager {
   int max_tokens_per_batch;
   int max_spec_tree_token_num;
   int max_sequence_length;
-  Status request_manager_status;
+  State request_manager_status;
   BackgroundServerStatus background_server_status;
 
   // tree width in each speculative step, if not specified 1
@@ -293,6 +304,8 @@ class RequestManager {
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
 
+  
+
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,
                      std::vector<std::pair<BatchConfig::TokenId, int>>>
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 1fa281777..4a8aa975c 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4452,6 +4452,26 @@ void register_flexflow_internal_tasks(Runtime *runtime,
           registrar);
     }
   }
+  // RequestMang get_next_batch_config
+  {
+    TaskVariantRegistrar registrar(RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
+                                   "RequestManager Get Next Batch Config");
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+    registrar.set_leaf();
+    if (pre_register) {
+      Runtime::preregister_task_variant<
+          BatchConfig,
+          RequestManager::get_next_batch_config>(
+          registrar, "RequestManager Get Next Batch Config Task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<BatchConfig, 
+                                     RequestManager::get_next_batch_config>(
+          registrar);
+    }
+  }
   // RequestManager prepare_next_batch
   {
     TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 340fa5177..c57d8942a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -338,6 +338,82 @@ size_t RequestManager::get_num_processed_requests() {
   return num_processed_requests;
 }
 
+BatchConfigFuture RequestManager::get_next_batch_config(
+  InferenceResultFuture const &result,
+  Context ctx,
+  Runtime *runtime) {
+  RequestManager *rm = this;
+  TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
+                        TaskArgument(&rm, sizeof(RequestManager *)));
+  launcher.add_future(result);
+  return runtime->execute_task(ctx, launcher);
+}
+
+BatchConfig RequestManager::get_next_batch_config_task(
+      Task const *task,
+      std::vector<PhysicalRegion> const &regions,
+      Context ctx,
+      Runtime *runtime) {
+  RequestManager *rm = *((RequestManager **)task->args);
+  InferenceResult const &result =
+      Future(task->futures[0]).get_result<InferenceResult>();
+  return rm->get_next_batch_config(result);
+}
+
+BatchConfig RequestManager::get_next_batch_config(InferenceResult const &result) {
+  update_inference_results(result);
+  return prepare_next_batch();
+}
+
+void RequestManager::update_inference_results(InferenceResult const &result) {
+  // Update the inference results
+  for (int i = 0; i < result.num_tokens; i++) {
+    size_t guid = result.request_guids[i];
+    Request &request = all_requests[guid];
+    if (request.tokens.size() < request.max_sequence_length) {
+      request.tokens.push_back(result.token_ids[i]);
+    }
+  }
+}
+
+BatchConfig RequestManager::prepare_next_batch() {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  BatchConfig bc;
+  bc.num_tokens = 0;
+  int num_generation_tokens = 0;
+  int num_active_req = -1;
+  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
+    if (pending_request_queue.empty()) {
+      break;
+    }
+    Request new_request = pending_request_queue.front();
+    pending_request_queue.pop();
+    all_requests[new_request.guid] = new_request;
+    bc.requestsInfo[i].first_token_depth_in_request = 0;
+    bc.requestsInfo[i].first_token_offset_in_batch = bc.num_tokens;
+    bc.requestsInfo[i].request_guid = new_request.guid;
+    bc.requestsInfo[i].num_tokens_in_batch =
+        std::min(get_max_tokens_per_batch(), (int)new_request.tokens.size());
+    bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length;
+    bc.request_completed[i] = false;
+    bc.requestsInfo[i].prompt_phase = true;
+    num_active_req++;
+    bc.requestsInfo[num_active_req].batch_config_request_id = i;
+    for (int j = 0; j < bc.requestsInfo[i].num_tokens_in_batch; j++) {
+      int depth = bc.requestsInfo[i].first_token_depth_in_request + j;
+      bc.tokensInfo[bc.num_tokens].request_index = i;
+      bc.tokensInfo[bc.num_tokens].abs_depth_in_request = depth;
+      assert(depth < new_request.tokens.size());
+      bc.tokensInfo[bc.num_tokens].token_id = new_request.tokens[depth];
+      bc.num_tokens++;
+    }
+  }
+  bc.num_generation_tokens = num_generation_tokens;
+  return bc;
+}
+
+
+
 BatchConfigFuture
     RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc,
                                        InferenceResultFuture const &result,

From 0492728a5f013e5a946e479f9384eda05de65f1b Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyuwang@andrew.cmu.edu>
Date: Wed, 17 Apr 2024 00:40:13 -0400
Subject: [PATCH 059/667] fix format

---
 deps/legion                        |  2 +-
 include/flexflow/request_manager.h |  5 +----
 src/runtime/model.cc               |  7 +++----
 src/runtime/request_manager.cc     | 26 +++++++++++++-------------
 4 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/deps/legion b/deps/legion
index 24e8c4523..626b55689 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835
+Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 2bc6ed6eb..1c284ce97 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -263,12 +263,11 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-
   // API for rm state machine
   BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
                                           Context ctx,
                                           Runtime *runtime);
-  static BatchConfig get_next_batch_config_task(      
+  static BatchConfig get_next_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -304,8 +303,6 @@ class RequestManager {
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
 
-  
-
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,
                      std::vector<std::pair<BatchConfig::TokenId, int>>>
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 4a8aa975c..1468d9c85 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4459,15 +4459,14 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<
-          BatchConfig,
-          RequestManager::get_next_batch_config>(
+      Runtime::preregister_task_variant<BatchConfig,
+                                        RequestManager::get_next_batch_config>(
           registrar, "RequestManager Get Next Batch Config Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<BatchConfig, 
+      runtime->register_task_variant<BatchConfig,
                                      RequestManager::get_next_batch_config>(
           registrar);
     }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c57d8942a..fc860a1b6 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -339,9 +339,7 @@ size_t RequestManager::get_num_processed_requests() {
 }
 
 BatchConfigFuture RequestManager::get_next_batch_config(
-  InferenceResultFuture const &result,
-  Context ctx,
-  Runtime *runtime) {
+    InferenceResultFuture const &result, Context ctx, Runtime *runtime) {
   RequestManager *rm = this;
   TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
                         TaskArgument(&rm, sizeof(RequestManager *)));
@@ -350,17 +348,18 @@ BatchConfigFuture RequestManager::get_next_batch_config(
 }
 
 BatchConfig RequestManager::get_next_batch_config_task(
-      Task const *task,
-      std::vector<PhysicalRegion> const &regions,
-      Context ctx,
-      Runtime *runtime) {
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
   RequestManager *rm = *((RequestManager **)task->args);
   InferenceResult const &result =
       Future(task->futures[0]).get_result<InferenceResult>();
   return rm->get_next_batch_config(result);
 }
 
-BatchConfig RequestManager::get_next_batch_config(InferenceResult const &result) {
+BatchConfig
+    RequestManager::get_next_batch_config(InferenceResult const &result) {
   update_inference_results(result);
   return prepare_next_batch();
 }
@@ -412,8 +411,6 @@ BatchConfig RequestManager::prepare_next_batch() {
   return bc;
 }
 
-
-
 BatchConfigFuture
     RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc,
                                        InferenceResultFuture const &result,
@@ -1107,7 +1104,8 @@ BeamSearchBatchConfig
     std::cout << "\n############### prepare_next_batch_beam ###############\n";
   }
   if (verbose) {
-    std::cout << "print all results" << "\n";
+    std::cout << "print all results"
+              << "\n";
     for (int i = 0; i < 40; i++) {
       std::cout << result.token_ids[i] << ", ";
     }
@@ -1767,7 +1765,8 @@ void RequestManager::store_beam_metadata(BeamSearchBatchConfig const &old_bc,
       if (depth == 1) {
         // store the last input into the tree;
         if (verbose) {
-          std::cout << "try to store the input" << "\n";
+          std::cout << "try to store the input"
+                    << "\n";
         }
 
         request.beam_trees.at(old_bc.model_id).treeLayers[0].tokens[0] =
@@ -2024,7 +2023,8 @@ bool PreOrder(
     if (verbose) {
       std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id]
                 << "\n";
-      std::cout << "return true" << "\n";
+      std::cout << "return true"
+                << "\n";
     }
     return true;
   }

From 12457ae1e0a5cde061e8eb78732c18f3f85978b1 Mon Sep 17 00:00:00 2001
From: zwang86 <46699021+zwang86@users.noreply.github.com>
Date: Wed, 17 Apr 2024 10:12:31 -0400
Subject: [PATCH 060/667] fix small issue

---
 include/flexflow/request_manager.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1c284ce97..4cb718abd 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -265,8 +265,8 @@ class RequestManager {
 
   // API for rm state machine
   BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
-                                          Context ctx,
-                                          Runtime *runtime);
+                                          Legion::Context ctx,
+                                          Legion::Runtime *runtime);
   static BatchConfig get_next_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,

From a4da38a5690475372a280bfafffc91822eb621e7 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 10:13:54 -0400
Subject: [PATCH 061/667] 1. Add a API void RequestManager::init_token_trees(),
 without implementation. 2. Modified the generation of parent_pos in
 update_inference_results. 3. Update the implementation of append_bitmask
 regarding the depth of the speculation tree.

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 43 +++++++++++++++++++++++++-----
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a72c0550e..d2fab566d 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -416,7 +416,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
 
-  void RequestManager::initialize_root_of_spec_token_trees();
+  void RequestManager::init_token_trees();
   void add_token_to_spec_token_tree(RequestGuid guid,
                                     BatchConfig::TokenId token_id,
                                     int parent_pos,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1c61c10df..8eeafbd07 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1473,7 +1473,8 @@ bool RequestManager::update_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
-  assert(current_speculation_step > 0);
+  assert(current_speculation_step >= 1 &&
+         "The current speculation step should be no less than 1");
 
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
@@ -1498,6 +1499,7 @@ bool RequestManager::update_inference_results(
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
           token_tree.tree_layers[current_speculation_step - 1];
+      int parent_pos = 0;
       for (auto parent_it = parent_tree_layer.begin();
            parent_it != parent_tree_layer.end();
            parent_it++) {
@@ -1516,10 +1518,11 @@ bool RequestManager::update_inference_results(
                 guid,
                 ssm_inference_result.token_ids[result_index],
                 ssm_inference_result.probs[result_index] * parent_prob,
-                ssm_inference_result.parent_id[result_index]);
+                parent_pos);
             result_index++;
           }
         }
+        parent_pos++;
       }
     }
     append_bitmask(guid);
@@ -1576,11 +1579,22 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
 }
 
 void RequestManager::append_bitmask(RequestGuid guid) {
-  // This function changes the bitmask in place
+  // This method changes the bitmask in place
+  // This method is called after the first small model decoding step
+  assert(current_speculation_step >= 1 &&
+         "The current speculation step should be no less than 1");
+
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
+  TokenTree &token_tree = request.speculative_token_trees[0];
+
+  if (token_tree.tree_layers.size() <= current_speculation_step) {
+    // This request has no token added in this and the following small model
+    // inference steps, skip it
+    return;
+  }
   std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-      request.speculative_token_trees[0].tree_layers.back();
+      request.speculative_token_trees[0].tree_layers[current_speculation_step];
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
   int previous_tree_size = bitmask.tree_size;
@@ -1597,8 +1611,12 @@ void RequestManager::append_bitmask(RequestGuid guid) {
     // Each child copy its parent's mask
     // Here we assume child_ptr->parent_pos denotes the position of the parent
     // in its corresponding layer, check this
-    bitmask.bit_mask[child_offset + child_idx] =
-        bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
+    if (current_speculation_step > 1) {
+      // Root is not in the bitmask, when current_speculation_step == 1, the
+      // tokens don't have a parent to attend to
+      bitmask.bit_mask[child_offset + child_idx] =
+          bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
+    }
     // Each child attend to itself
     bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
                                                        child_idx);
@@ -2247,12 +2265,23 @@ RequestManager *RequestManager::get_request_manager() {
   return request_manager_singleton;
 }
 
+/********** Request Token Tree Related Functions **********/
+void RequestManager::init_token_trees() {
+  // TODO: implement this function
+  // Add a layer that only contains the root token of the token tree. The root
+  // token's info should be
+}
+
 void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
                                                   BatchConfig::TokenId token_id,
                                                   int parent_pos,
                                                   float joint_prob) {
   // This method assumes only one small model is used for speculation
 
+  // This is called after the first small model inference
+  assert(current_speculation_step >= 1 &&
+         "The current speculation step should be no less than 1");
+
   // First make sure there are enough layers in the speculation tree
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
@@ -2364,4 +2393,6 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
     }
   }
 }
+/********** Request Token Tree Related Functions **********/
+
 }; // namespace FlexFlow

From fa709dba80c3e3296e11b7295668f059338817ff Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 11:02:00 -0400
Subject: [PATCH 062/667] Remove the root of the token tree. Now the first
 layer of the tree is the first ssm inference results, instead of the root.

---
 src/runtime/request_manager.cc | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8eeafbd07..9239e3d4d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1053,7 +1053,7 @@ TreeSearchBatchConfig RequestManager::get_next_spec_batch_config() {
 
     // Fill in the tokens
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
-    if (token_tree.tree_layers.size() <= current_speculation_step) {
+    if (token_tree.tree_layers.size() < current_speculation_step) {
       // This request has no token to decode in this and the following small
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
@@ -1062,7 +1062,7 @@ TreeSearchBatchConfig RequestManager::get_next_spec_batch_config() {
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
-          token_tree.tree_layers.at(current_speculation_step);
+          token_tree.tree_layers.at(current_speculation_step - 1);
       // Exclude the current layer from the token tree, because we want the
       // start index
       new_bc.requestsInfo[request_index].first_token_index_in_request =
@@ -1493,12 +1493,22 @@ bool RequestManager::update_inference_results(
     Request &request = all_requests[guid];
 
     TokenTree &token_tree = request.speculative_token_trees[0];
-    if (token_tree.tree_layers.size() < current_speculation_step) {
+    if (token_tree.tree_layers.size() == 0 && current_speculation_step == 1) {
+      // This is the first layer of the tree
+      for (int child_idx = 0; child_idx < num_branches; child_idx++) {
+        add_token_to_spec_token_tree(
+            guid,
+            ssm_inference_result.token_ids[result_index],
+            ssm_inference_result.probs[result_index],
+            0);
+        result_index++;
+      }
+    } else if (token_tree.tree_layers.size() < current_speculation_step - 1) {
       // This means that the parent layer is empty
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-          token_tree.tree_layers[current_speculation_step - 1];
+          token_tree.tree_layers[current_speculation_step - 2];
       int parent_pos = 0;
       for (auto parent_it = parent_tree_layer.begin();
            parent_it != parent_tree_layer.end();
@@ -1588,13 +1598,14 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   BatchConfig::BitMask &bitmask = request.causal_mask;
   TokenTree &token_tree = request.speculative_token_trees[0];
 
-  if (token_tree.tree_layers.size() <= current_speculation_step) {
+  if (token_tree.tree_layers.size() < current_speculation_step) {
     // This request has no token added in this and the following small model
     // inference steps, skip it
     return;
   }
   std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-      request.speculative_token_trees[0].tree_layers[current_speculation_step];
+      request.speculative_token_trees[0]
+          .tree_layers[current_speculation_step - 1];
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
   int previous_tree_size = bitmask.tree_size;
@@ -1612,7 +1623,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
     // Here we assume child_ptr->parent_pos denotes the position of the parent
     // in its corresponding layer, check this
     if (current_speculation_step > 1) {
-      // Root is not in the bitmask, when current_speculation_step == 1, the
+      // When current_speculation_step == 1, the
       // tokens don't have a parent to attend to
       bitmask.bit_mask[child_offset + child_idx] =
           bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
@@ -2286,14 +2297,15 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
 
-  if (speculative_token_tree.tree_layers.size() == current_speculation_step) {
+  if (speculative_token_tree.tree_layers.size() ==
+      current_speculation_step - 1) {
     // When adding the first token, we need to add a new layer
     speculative_token_tree.add_layer();
   } else {
     // To add a token, the tree depth is either the same as the current
     // speculation step or one more than the current speculation step.
     assert(speculative_token_tree.tree_layers.size() ==
-               current_speculation_step + 1 &&
+               current_speculation_step &&
            "The depth of the token tree should be consistent with the depth of "
            "the token being added");
   }
@@ -2370,7 +2382,7 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
         std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
     request.speculative_token_trees[0]
-        .tree_layers[current_speculation_step]
+        .tree_layers[current_speculation_step - 1]
         .push_back(node_ptr);
     speculative_token_tree.tree_size++;
     speculative_token_tree.tree_node_size++;
@@ -2381,7 +2393,7 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
 
-  if (request.speculative_token_trees[0].tree_layers.size() <=
+  if (request.speculative_token_trees[0].tree_layers.size() <
       current_speculation_step) {
     // There are no tokens in the last layer
     return;

From 463646404bce6580e95c95db5504cf77179d0134 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 11:07:01 -0400
Subject: [PATCH 063/667] Change the name of the APIs

---
 include/flexflow/request_manager.h | 44 ++++++++++++++----------------
 src/runtime/model.cc               | 13 ++++-----
 src/runtime/request_manager.cc     | 12 ++++----
 3 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d2fab566d..96ff8526c 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -199,11 +199,11 @@ class RequestManager {
                                   InferenceResult const &result,
                                   int model_id);
   TreeSearchBatchConfigFuture
-      get_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
-                                  InferenceResultFuture const &result,
-                                  int model_id,
-                                  Legion::Context ctx,
-                                  Legion::Runtime *runtime);
+      prepare_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
+                                      InferenceResultFuture const &result,
+                                      int model_id,
+                                      Legion::Context ctx,
+                                      Legion::Runtime *runtime);
 
   TreeVerifyBatchConfig prepare_next_batch_verify(
       std::vector<TreeSearchBatchConfig> const &old_batches);
@@ -221,17 +221,17 @@ class RequestManager {
   /*********** New APIs ***********/
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
-  TreeSearchBatchConfig get_next_spec_batch_config();
+  TreeSearchBatchConfig prepare_next_spec_batch_config();
 
   // A wrapper function.
   TreeSearchBatchConfigFuture
-      get_next_spec_batch_config(TreeSearchBatchConfigFuture const &old_bc,
-                                 SsmInferenceResultFuture const &result,
-                                 Legion::Context ctx,
-                                 Legion::Runtime *runtime);
+      prepare_next_spec_batch_config(TreeSearchBatchConfigFuture const &old_bc,
+                                     SsmInferenceResultFuture const &result,
+                                     Legion::Context ctx,
+                                     Legion::Runtime *runtime);
 
   // A wrapper function.
-  static TreeSearchBatchConfig get_next_spec_batch_config_task(
+  static TreeSearchBatchConfig prepare_next_spec_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -242,18 +242,18 @@ class RequestManager {
   // prepare_next_batch_config_spec is that we put the info of the committed
   // tokens into the batch config in the first speculation step to commit the KV
   // cache of the small model.
-  TreeSearchBatchConfig get_first_spec_batch_config();
+  TreeSearchBatchConfig prepare_first_spec_batch_config();
 
   // A wrapper function.
   TreeSearchBatchConfigFuture
-      get_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
-                                  InferenceResultFuture const &result,
-                                  int model_id,
-                                  Legion::Context ctx,
-                                  Legion::Runtime *runtime);
+      prepare_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
+                                      InferenceResultFuture const &result,
+                                      int model_id,
+                                      Legion::Context ctx,
+                                      Legion::Runtime *runtime);
 
   // A wrapper function.
-  static TreeSearchBatchConfig get_first_spec_batch_config_task(
+  static TreeSearchBatchConfig prepare_first_spec_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -267,7 +267,7 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  static TreeVerifyBatchConfig get_verify_batch_config_task(
+  static TreeVerifyBatchConfig prepare_verify_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -331,7 +331,7 @@ class RequestManager {
       Legion::Runtime *runtime);
 
   // A wrapper function.
-  static TreeSearchBatchConfig get_first_spec_batch_config_task(
+  static TreeSearchBatchConfig prepare_first_spec_batch_config_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -357,10 +357,6 @@ class RequestManager {
   Status request_manager_status;
   BackgroundServerStatus background_server_status;
 
-  // tree width in each speculative step, if not specified 1
-  [[deprecated("This field will be removed")]]
-  std::vector<int> spec_infer_tree_width; // Old version, delete after refactor
-
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
   ModelType model_type;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 6550f890e..ff2461239 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4503,7 +4503,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     if (pre_register) {
       Runtime::preregister_task_variant<
           TreeSearchBatchConfig,
-          RequestManager::get_first_spec_batch_config_task>(
+          RequestManager::prepare_first_spec_batch_config_task>(
           registrar, "RequestManager Prepare Next Batch (Init Beam) Task");
     } else {
       if (enable_control_replication) {
@@ -4511,7 +4511,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
       }
       runtime->register_task_variant<
           TreeSearchBatchConfig,
-          RequestManager::get_first_spec_batch_config_task>(registrar);
+          RequestManager::prepare_first_spec_batch_config_task>(registrar);
     }
   }
   // RequestManager prepare_next_batch_verify
@@ -4524,16 +4524,15 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     if (pre_register) {
       Runtime::preregister_task_variant<
           TreeVerifyBatchConfig,
-          RequestManager::get_verify_batch_config_task>(
+          RequestManager::prepare_verify_batch_config_task>(
           registrar, "RequestManager Prepare Next Batch (Verify) Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime
-          ->register_task_variant<TreeVerifyBatchConfig,
-                                  RequestManager::get_verify_batch_config_task>(
-              registrar);
+      runtime->register_task_variant<
+          TreeVerifyBatchConfig,
+          RequestManager::prepare_verify_batch_config_task>(registrar);
     }
   }
   // RequestManager background serving task
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9239e3d4d..19cbf3820 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -557,7 +557,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
-TreeSearchBatchConfigFuture RequestManager::get_first_spec_batch_config(
+TreeSearchBatchConfigFuture RequestManager::prepare_first_spec_batch_config(
     TreeVerifyBatchConfigFuture const &old_bc,
     InferenceResultFuture const &result,
     int model_id,
@@ -573,7 +573,7 @@ TreeSearchBatchConfigFuture RequestManager::get_first_spec_batch_config(
   return runtime->execute_task(ctx, launcher);
 }
 
-TreeSearchBatchConfig RequestManager::get_first_spec_batch_config_task(
+TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -995,7 +995,7 @@ TreeSearchBatchConfig RequestManager::get_first_spec_batch_config(
 }
 
 /***** Speculative Decoding Phase *****/
-TreeSearchBatchConfigFuture RequestManager::get_next_spec_batch_config(
+TreeSearchBatchConfigFuture RequestManager::prepare_next_spec_batch_config(
     TreeSearchBatchConfigFuture const &old_bc,
     SsmInferenceResultFuture const &result,
     Context ctx,
@@ -1009,7 +1009,7 @@ TreeSearchBatchConfigFuture RequestManager::get_next_spec_batch_config(
   return runtime->execute_task(ctx, launcher);
 }
 
-TreeSearchBatchConfig RequestManager::get_next_spec_batch_config_task(
+TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
@@ -1022,7 +1022,7 @@ TreeSearchBatchConfig RequestManager::get_next_spec_batch_config_task(
   return rm->prepare_next_batch_beam(bc, result);
 }
 
-TreeSearchBatchConfig RequestManager::get_next_spec_batch_config() {
+TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_spec ###############\n";
@@ -2203,7 +2203,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
       }
     }
     auto const &next_batch = batch_pipeline.back();
-    TreeSearchBatchConfigFuture beam_bcf = get_first_spec_batch_config(
+    TreeSearchBatchConfigFuture beam_bcf = prepare_first_spec_batch_config(
         next_batch.first, next_batch.second, 0, ctx, runtime);
     std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
     for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {

From 335332f43d566cba47e7f9518d20b96ebbd929f6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 11:13:54 -0400
Subject: [PATCH 064/667] Made SsmInferenceResult a child class of
 InferenceResult

---
 include/flexflow/batch_config.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index c3f72e6c4..6a59bb21f 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -182,8 +182,7 @@ class TreeSearchBatchConfig : public BatchConfig {
   int model_id;
 };
 
-struct SsmInferenceResult {
-  static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
+class SsmInferenceResult : public InferenceResult {
   BatchConfig::TokenId
       token_ids[MAX_NUM_TOKENS *
                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];

From 06609c5c63d88de28580f8cdf900368af08bf3ff Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 11:39:22 -0400
Subject: [PATCH 065/667] Add bool RequestManager::update_ssm_inference_results
 as a helper function.

---
 include/flexflow/request_manager.h | 8 ++++++--
 src/runtime/request_manager.cc     | 9 ++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5b5f3a1c0..3711f36b3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -382,7 +382,8 @@ class RequestManager {
 
   // Added to make the request manager stateful. During the processing of the
   // first small model inference results, the step equals to 1. That is, every
-  // time a small model inference task is launched, the step is increased by 1.
+  // time a small model inference task is launched, the step is increased
+  // by 1.
   int current_speculation_step = 0;
   // Maps the index of the request in the batch config to the request guid.
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
@@ -421,7 +422,10 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
 
-  void RequestManager::init_token_trees();
+  bool RequestManager::update_ssm_inference_results(
+      SsmInferenceResult const &ssm_inference_result);
+  // Helper functions related to token trees
+  void RequestManager::init_token_trees(RequestGuid guid);
   void add_token_to_spec_token_tree(RequestGuid guid,
                                     BatchConfig::TokenId token_id,
                                     int parent_pos,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 19076dfe9..63e7ed24c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1542,7 +1542,7 @@ void RequestManager::initialize_root_of_spec_token_trees() {
   }
 }
 
-bool RequestManager::update_inference_results(
+bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
@@ -2350,10 +2350,9 @@ RequestManager *RequestManager::get_request_manager() {
 }
 
 /********** Request Token Tree Related Functions **********/
-void RequestManager::init_token_trees() {
-  // TODO: implement this function
-  // Add a layer that only contains the root token of the token tree. The root
-  // token's info should be
+void RequestManager::init_token_trees(RequestGuid guid) {
+  Request &request = all_requests[guid];
+  request.speculative_token_trees.clear();
 }
 
 void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,

From 7490a5a2cb7f843534ae4079bf93cebc97a0108f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 11:48:49 -0400
Subject: [PATCH 066/667] Remove unused API

---
 src/runtime/request_manager.cc | 217 ---------------------------------
 1 file changed, 217 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 63e7ed24c..54e1e259c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -410,223 +410,6 @@ BatchConfig RequestManager::prepare_next_batch() {
   bc.num_generation_tokens = num_generation_tokens;
   return bc;
 }
-
-BatchConfigFuture
-    RequestManager::prepare_next_batch(BatchConfigFuture const &old_bc,
-                                       InferenceResultFuture const &result,
-                                       Context ctx,
-                                       Runtime *runtime) {
-  RequestManager *rm = this;
-  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_TASK_ID,
-                        TaskArgument(&rm, sizeof(RequestManager *)));
-  launcher.add_future(old_bc);
-  launcher.add_future(result);
-  return runtime->execute_task(ctx, launcher);
-}
-
-BatchConfig RequestManager::prepare_next_batch_task(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime) {
-  RequestManager *rm = *((RequestManager **)task->args);
-  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
-  InferenceResult const &result =
-      Future(task->futures[1]).get_result<InferenceResult>();
-  return rm->prepare_next_batch(*bc, result);
-}
-
-BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc,
-                                               InferenceResult const &result) {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-
-  // Step 1: append result from previous iteration to request's tokens
-  for (int i = 0; i < old_bc.num_tokens; i++) {
-    size_t guid =
-        old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid;
-    Request &request = all_requests[guid];
-    if (old_bc.tokensInfo[i].abs_index_in_request + 1 < request.tokens.size()) {
-      // This is a prompt token
-      continue;
-    } else {
-      assert(old_bc.tokensInfo[i].abs_index_in_request + 1 ==
-             request.tokens.size());
-      // This is a decoding token
-      log_req_mgr.print("Output token is: %d", result.token_ids[i]);
-      request.tokens.push_back(result.token_ids[i]);
-      // std::string output = this->tokenizer_->Decode(request.tokens);
-      // log_req_mgr.print("Output: %s", output.c_str());
-    }
-  }
-  int num_generation_tokens = 0;
-  int num_active_req = -1;
-
-  // Step 2: prepare the next batch for existing requests
-  BatchConfig new_bc;
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_available[i]) { // add new requests to the next batch
-      continue;
-    } else {
-      assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0);
-      Request &request = all_requests[old_bc.requestsInfo[i].request_guid];
-      int processed_tokens =
-          old_bc.requestsInfo[i].first_token_index_in_request +
-          old_bc.requestsInfo[i].num_tokens_in_batch;
-      assert(processed_tokens < request.tokens.size());
-      bool request_completed = false;
-      // printf("model_type = %d\n", this->model_type);
-      if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) {
-        request_completed = true;
-      } else if (request.tokens.back() == eos_token_id) {
-        // Encounter EOS token id
-        request_completed = true;
-      }
-      if (request_completed) {
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        // Unlike Huggingface, the sentencepiece C++ library automatically
-        // removes the BOS token
-        if (model_type == ModelType::LLAMA &&
-            request.tokens.at(0) == bos_token_id) {
-          output = "<s> " + output;
-        }
-        {
-          // update generation result
-          GenerationResult &gr = request_generation_results[request.guid];
-          assert(gr.guid == request.guid);
-          gr.output_tokens = request.tokens;
-          gr.output_text = output;
-        }
-        request.status = Request::COMPLETED;
-        trigger_request_completion_future(request.guid);
-        log_req_mgr.print("[Done] guid(%zu) final_length(%zu)",
-                          old_bc.requestsInfo[i].request_guid,
-                          request.tokens.size());
-        log_req_mgr.print("Final output: %s", output.c_str());
-        num_processed_requests++;
-        ProfileInfo profile_info = profiling_requests[request.guid];
-        profile_info.finish_time = Realm::Clock::current_time_in_microseconds();
-        total_request_run_time +=
-            profile_info.finish_time - profile_info.start_time;
-        profiling_requests[request.guid] = profile_info;
-        log_req_mgr.print(
-            "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) "
-            "finish(%.1lf) latency(%.1lf)",
-            request.guid,
-            profile_info.llm_decoding_steps,
-            profile_info.start_time,
-            profile_info.finish_time,
-            profile_info.finish_time - profile_info.start_time);
-        // Write output to file if needed:
-        if (!output_filepath.empty()) {
-          std::ofstream outputFile(output_filepath, std::ios::app);
-          if (outputFile.is_open()) {
-            outputFile << "end-to-end latency: " << std::fixed
-                       << std::setprecision(3) << total_request_run_time
-                       << std::endl;
-            outputFile << "num decoding steps: "
-                       << profile_info.llm_decoding_steps << std::endl;
-            outputFile << "token IDs: ";
-            for (int i = 0; i < request.tokens.size(); i++) {
-              outputFile << request.tokens[i];
-              if (i < request.tokens.size() - 1) {
-                outputFile << ",";
-              }
-            }
-            outputFile << std::endl;
-            outputFile << output;
-            outputFile.close();
-          } else {
-            std::cout << "Unable to open the output file: " << output_filepath
-                      << std::endl;
-            assert(false);
-          }
-        }
-
-      } else {
-        new_bc.request_available[i] = false;
-        new_bc.requestsInfo[i].first_token_index_in_request = processed_tokens;
-        new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-        new_bc.requestsInfo[i].request_guid =
-            old_bc.requestsInfo[i].request_guid;
-        new_bc.requestsInfo[i].max_sequence_length =
-            old_bc.requestsInfo[i].max_sequence_length;
-        num_active_req++;
-        new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-        if (new_bc.requestsInfo[i].first_token_index_in_request + 1 ==
-            request.tokens.size()) {
-          // Incremental phase
-          new_bc.requestsInfo[i].num_tokens_in_batch = 1;
-          num_generation_tokens++;
-          new_bc.requestsInfo[i].prompt_phase = false;
-        } else {
-          // Prompt phase
-          new_bc.requestsInfo[i].num_tokens_in_batch =
-              std::min(get_max_tokens_per_batch() - new_bc.num_tokens,
-                       (int)request.tokens.size() -
-                           new_bc.requestsInfo[i].first_token_index_in_request);
-          new_bc.requestsInfo[i].prompt_phase = true;
-        }
-        for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
-          assert(depth < request.tokens.size());
-          new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth];
-          new_bc.num_tokens++;
-        }
-        // Update profiling
-        profiling_requests[new_bc.requestsInfo[i].request_guid]
-            .llm_decoding_steps++;
-      }
-    }
-  }
-  new_bc.num_generation_tokens = num_generation_tokens;
-
-  // Step 3: add new requests to the next batch
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (new_bc.request_available[i]) {
-      if (!pending_request_queue.empty() &&
-          new_bc.num_tokens < get_max_tokens_per_batch()) {
-        Request new_request = pending_request_queue.front();
-        pending_request_queue.pop();
-        // all_requests[new_request.guid] = new_request;
-
-        new_bc.requestsInfo[i].first_token_index_in_request = 0;
-        new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-        new_bc.requestsInfo[i].request_guid = new_request.guid;
-        new_bc.requestsInfo[i].num_tokens_in_batch =
-            std::min(get_max_tokens_per_batch() - new_bc.num_tokens,
-                     (int)new_request.tokens.size());
-        new_bc.requestsInfo[i].max_sequence_length =
-            new_request.max_sequence_length;
-        new_bc.request_available[i] = false;
-        new_bc.requestsInfo[i].prompt_phase = true;
-        num_active_req++;
-        new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-        // add profile_info for the new request
-        ProfileInfo profile_info;
-        profile_info.llm_decoding_steps = 1;
-        profile_info.start_time = Realm::Clock::current_time_in_microseconds();
-        profiling_requests[new_request.guid] = profile_info;
-        for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
-          assert(depth < new_request.tokens.size());
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              new_request.tokens[depth];
-          new_bc.num_tokens++;
-        }
-        if (new_bc.num_tokens == get_max_tokens_per_batch()) {
-          break;
-        }
-      }
-    }
-  }
-
-  return new_bc;
-}
-
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/

From 469bcc583ad40bc9da437caa29bf1f933f8394c1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 12:06:59 -0400
Subject: [PATCH 067/667] Cleaned unused APIs and removed unused wrapper
 functions.

---
 include/flexflow/request_manager.h | 129 +++----------------------
 src/runtime/request_manager.cc     | 145 +----------------------------
 2 files changed, 19 insertions(+), 255 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 3711f36b3..8441bc1d0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -178,102 +178,10 @@ class RequestManager {
   bool is_request_completed(RequestGuid const &guid);
   void trigger_request_completion_future(RequestGuid const &guid);
 
-  // Methods for preparing next batches
-  BatchConfig prepare_next_batch(BatchConfig const &bc,
-                                 InferenceResult const &result);
-  BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc,
-                                       InferenceResultFuture const &result,
-                                       Legion::Context ctx,
-                                       Legion::Runtime *runtime);
-  /* Old APIs for reference */
-  TreeSearchBatchConfig
-      prepare_next_batch_beam(TreeSearchBatchConfig const &old_bc,
-                              SsmInferenceResult const &result);
-  TreeSearchBatchConfigFuture
-      prepare_next_batch_beam(TreeSearchBatchConfigFuture const &old_bc,
-                              SsmInferenceResultFuture const &result,
-                              Legion::Context ctx,
-                              Legion::Runtime *runtime);
-  TreeSearchBatchConfig
-      get_first_spec_batch_config(TreeVerifyBatchConfig const &old_bc,
-                                  InferenceResult const &result,
-                                  int model_id);
-  TreeSearchBatchConfigFuture
-      prepare_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
-                                      InferenceResultFuture const &result,
-                                      int model_id,
-                                      Legion::Context ctx,
-                                      Legion::Runtime *runtime);
-
-  TreeVerifyBatchConfig prepare_next_batch_verify(
-      std::vector<TreeSearchBatchConfig> const &old_batches);
-  TreeVerifyBatchConfigFuture prepare_next_batch_verify(
-      std::vector<TreeSearchBatchConfigFuture> const &old_batches,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
   std::vector<std::pair<BatchConfig::TokenId, int>>
       traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
                          int request_index,
                          int first_token_depth_in_request);
-  /* Old APIs for reference */
-
-  /*********** New APIs ***********/
-  // Prepare the next speculation batch config. This function is called before
-  // the second step of the speculation.
-  TreeSearchBatchConfig prepare_next_spec_batch_config();
-
-  // A wrapper function.
-  TreeSearchBatchConfigFuture
-      prepare_next_spec_batch_config(TreeSearchBatchConfigFuture const &old_bc,
-                                     SsmInferenceResultFuture const &result,
-                                     Legion::Context ctx,
-                                     Legion::Runtime *runtime);
-
-  // A wrapper function.
-  static TreeSearchBatchConfig prepare_next_spec_batch_config_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  // Prepare the first speculation batch config. This function is called before
-  // the first step of the speculation. The difference with
-  // prepare_next_batch_config_spec is that we put the info of the committed
-  // tokens into the batch config in the first speculation step to commit the KV
-  // cache of the small model.
-  TreeSearchBatchConfig prepare_first_spec_batch_config();
-
-  // A wrapper function.
-  TreeSearchBatchConfigFuture
-      prepare_first_spec_batch_config(TreeVerifyBatchConfigFuture const &old_bc,
-                                      InferenceResultFuture const &result,
-                                      int model_id,
-                                      Legion::Context ctx,
-                                      Legion::Runtime *runtime);
-
-  // A wrapper function.
-  static TreeSearchBatchConfig prepare_first_spec_batch_config_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  TreeVerifyBatchConfig prepare_verify_batch_config();
-
-  // A wrapper function.
-  TreeVerifyBatchConfigFuture prepare_verify_batch_config(
-      std::vector<TreeSearchBatchConfigFuture> const &old_batches,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  static TreeVerifyBatchConfig prepare_verify_batch_config_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-  /*********** New APIs ***********/
-
   // This function takes the tree stored in the token trees in
   // RequestManager::all_requests, and convert them into serialized version.
   // Called by prepare_next_batch_verify().
@@ -322,28 +230,6 @@ class RequestManager {
       Legion::Context ctx,
       Legion::Runtime *runtime);
 
-  /* Old APIs for reference */
-  // A wrapper function.
-  static TreeSearchBatchConfig prepare_next_batch_beam_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  // A wrapper function.
-  static TreeSearchBatchConfig prepare_first_spec_batch_config_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-
-  static TreeVerifyBatchConfig prepare_next_batch_verify_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-  /* Old APIs for reference */
-
   // API for rm state machine
   BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
                                           Legion::Context ctx,
@@ -422,8 +308,23 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
 
+  /* ---------- New Helper Functions ---------- */
+  // Prepare the next speculation batch config. This function is called before
+  // the second step of the speculation.
+  TreeSearchBatchConfig prepare_next_spec_batch_config();
+  // Prepare the first speculation batch config. This function is called before
+  // the first step of the speculation. The difference with
+  // prepare_next_batch_config_spec is that we put the info of the committed
+  // tokens into the batch config in the first speculation step to commit the KV
+  // cache of the small model.
+  TreeSearchBatchConfig prepare_first_spec_batch_config();
+  TreeVerifyBatchConfig prepare_verify_batch_config();
+  bool RequestManager::update_llm_verify_results(
+      InferenceResult const &llm_verify_result);
   bool RequestManager::update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
+  /* ---------- New Helper Functions ---------- */
+
   // Helper functions related to token trees
   void RequestManager::init_token_trees(RequestGuid guid);
   void add_token_to_spec_token_tree(RequestGuid guid,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54e1e259c..ea53932ba 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -413,40 +413,7 @@ BatchConfig RequestManager::prepare_next_batch() {
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
-TreeSearchBatchConfigFuture RequestManager::prepare_first_spec_batch_config(
-    TreeVerifyBatchConfigFuture const &old_bc,
-    InferenceResultFuture const &result,
-    int model_id,
-    Context ctx,
-    Runtime *runtime) {
-
-  RequestManager *rm = this;
-  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
-                        TaskArgument(&rm, sizeof(RequestManager *)));
-  launcher.add_future(old_bc);
-  launcher.add_future(result);
-  launcher.add_future(Future::from_value<int>(model_id));
-  return runtime->execute_task(ctx, launcher);
-}
-
-TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config_task(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime) {
-  RequestManager *rm = *((RequestManager **)task->args);
-  TreeVerifyBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
-  InferenceResult const &result =
-      Future(task->futures[1]).get_result<InferenceResult>();
-  int model_id = Future(task->futures[2]).get_result<int>();
-  return rm->get_first_spec_batch_config(bc, result, model_id);
-}
-
-TreeSearchBatchConfig RequestManager::get_first_spec_batch_config(
-    TreeVerifyBatchConfig const &old_bc,
-    InferenceResult const &result,
-    int model_id) {
+TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
@@ -851,33 +818,6 @@ TreeSearchBatchConfig RequestManager::get_first_spec_batch_config(
 }
 
 /***** Speculative Decoding Phase *****/
-TreeSearchBatchConfigFuture RequestManager::prepare_next_spec_batch_config(
-    TreeSearchBatchConfigFuture const &old_bc,
-    SsmInferenceResultFuture const &result,
-    Context ctx,
-    Runtime *runtime) {
-
-  RequestManager *rm = this;
-  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
-                        TaskArgument(&rm, sizeof(RequestManager *)));
-  launcher.add_future(old_bc);
-  launcher.add_future(result);
-  return runtime->execute_task(ctx, launcher);
-}
-
-TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config_task(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime) {
-  RequestManager *rm = *((RequestManager **)task->args);
-  TreeSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
-  SsmInferenceResult const &result =
-      Future(task->futures[1]).get_result<SsmInferenceResult>();
-  return rm->prepare_next_batch_beam(bc, result);
-}
-
 TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
@@ -954,51 +894,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
 }
 
 /***** Verify Phase *****/
-
-TreeVerifyBatchConfigFuture RequestManager::prepare_next_batch_verify(
-    std::vector<TreeSearchBatchConfigFuture> const &old_batches,
-    Context ctx,
-    Runtime *runtime) {
-
-  RequestManager *rm = this;
-  TaskLauncher launcher(RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
-                        TaskArgument(&rm, sizeof(RequestManager *)));
-  for (auto const &bcf : old_batches) {
-    launcher.add_future(bcf);
-  }
-  return runtime->execute_task(ctx, launcher);
-}
-
-TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify_task(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime) {
-  RequestManager *rm = *((RequestManager **)task->args);
-  std::vector<TreeSearchBatchConfig> old_batches;
-  for (auto const &bcf : task->futures) {
-    old_batches.push_back(Future(bcf).get_result<TreeSearchBatchConfig>());
-  }
-  return rm->prepare_next_batch_verify(old_batches);
-}
-
-/* New APIs */
-TreeSearchBatchConfig RequestManager::get_verify_batch_config_task(
-    Legion::Task const *task,
-    std::vector<Legion::PhysicalRegion> const &regions,
-    Legion::Context ctx,
-    Legion::Runtime *runtime) {
-  RequestManager *rm = *((RequestManager **)task->args);
-  std::vector<TreeSearchBatchConfig> old_batches;
-  for (auto const &bcf : task->futures) {
-    old_batches.push_back(Future(bcf).get_result<TreeSearchBatchConfig>());
-  }
-  return rm->prepare_next_batch_verify(old_batches);
-}
-/* New APIs */
-
-TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
-    std::vector<TreeSearchBatchConfig> const &old_batches) {
+TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   if (verbose) {
@@ -1292,39 +1188,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify(
   return new_bc;
 }
 
-/* New APIs */
-TreeSearchBatchConfig RequestManager::prepare_verify_batch_config(
-    std::vector<TreeSearchBatchConfig> const &old_batches) {
-  if (verbose) {
-    std::cout
-        << "\n############### prepare_next_batch_verify ###############\n";
-  }
-
-  assert(old_batches.size() > 0);
-
-  TreeVerifyBatchConfig new_bc;
-  new_bc.num_tokens_to_commit = 0;
-  new_bc.num_tokens = 0;
-
-  return new_bc;
-}
-/* New APIs */
-void RequestManager::initialize_root_of_spec_token_trees() {
-  // This method assumes only one small model is used for speculation
-
-  // TODO: Do we need to iterate over all requests?
-  for (auto &request_pair : all_requests) {
-    Request &request = request_pair.second;
-    TokenTree &token_tree = request.speculative_token_trees[0];
-    token_tree.tree_layers.clear();
-    token_tree.add_layer();
-    token_tree.tree_layers[0].emplace_back(
-        // TODO: Make sure every request has at least one token,
-        // otherwise, we need to handle this case
-        std::make_shared<TokenTreeNode>(request.tokens.back(), 0, 1.0));
-  }
-}
-
 bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
@@ -2132,7 +1995,7 @@ RequestManager *RequestManager::get_request_manager() {
   return request_manager_singleton;
 }
 
-/********** Request Token Tree Related Functions **********/
+/* --------- Request Token Tree Related Functions --------- */
 void RequestManager::init_token_trees(RequestGuid guid) {
   Request &request = all_requests[guid];
   request.speculative_token_trees.clear();
@@ -2260,6 +2123,6 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
     }
   }
 }
-/********** Request Token Tree Related Functions **********/
+/* --------- Request Token Tree Related Functions --------- */
 
 }; // namespace FlexFlow

From 7daaed079db5b83bf4a3bd350df3df209944e31e Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 17 Apr 2024 13:52:20 -0400
Subject: [PATCH 068/667] add missing task option to mapper

---
 src/mapper/mapper.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index c293aecb1..3bd876764 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -287,7 +287,8 @@ void FFMapper::select_task_options(const MapperContext ctx,
       (task.task_id == RM_PREPARE_NEXT_BATCH_INIT_TASK_ID) ||
       (task.task_id == RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID) ||
       (task.task_id == RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID) ||
-      (task.task_id == RM_BACKGROUND_SERVING_TASK_ID)) {
+      (task.task_id == RM_BACKGROUND_SERVING_TASK_ID) ||
+      (task.task_id == RM_GET_NEXT_BATCH_CONFIG_TASK_ID)) {
     output.initial_proc = all_cpus[0];
     return;
   }

From 80c99318c0f5d939d85ecdbbe158dd68436c8796 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 14:38:41 -0400
Subject: [PATCH 069/667] Add a BatchConfig::PerRequestInfo.last_batch_offset
 for small model kv cache commit.

---
 include/flexflow/batch_config.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 6a59bb21f..0d493f3ae 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -69,9 +69,11 @@ class BatchConfig {
     int num_tokens_in_batch;
   };
   struct PerTokenInfo {
+    TokenId token_id;
     int abs_index_in_request;
     int request_index;
-    TokenId token_id;
+    // This offset is only used for small model KV cache commit
+    int last_batch_offset = -1;
   };
 
   class BitMask {

From 165b6f93a000a4b27da88bae995ffbe84f714055 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 14:39:28 -0400
Subject: [PATCH 070/667] Add some task descriptions.

---
 include/flexflow/request_manager.h |  5 ++---
 src/runtime/request_manager.cc     | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8441bc1d0..d115bfb35 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -319,9 +319,8 @@ class RequestManager {
   // cache of the small model.
   TreeSearchBatchConfig prepare_first_spec_batch_config();
   TreeVerifyBatchConfig prepare_verify_batch_config();
-  bool RequestManager::update_llm_verify_results(
-      InferenceResult const &llm_verify_result);
-  bool RequestManager::update_ssm_inference_results(
+  void update_llm_verify_results(InferenceResult const &llm_verify_result);
+  bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
   /* ---------- New Helper Functions ---------- */
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ea53932ba..cdcdb39ee 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -419,6 +419,15 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
   }
 
+  // TODO: Clean up the code, this method does the following:
+  // 1. Commit the verified tokens to through the batch config. We can do this
+  // request by request. Put the information of the committed tokens into
+  // BatchConfig::TokensInfo. TODO: where to store those tokens?
+  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
+  // TreeSearchBatchConfig.
+  // Please refer to the implementation of prepare_next_spec_batch_config() for
+  // more details.
+
   // Step 1: use result to update requests
   TreeSearchBatchConfig new_bc;
   new_bc.num_tokens = 0;
@@ -902,6 +911,16 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
         << "\n############### prepare_next_batch_verify ###############\n";
   }
 
+  // TODO: Clean up the code, this method does the following:
+  // 1. Commit the verified tokens through a TreeVerifyBatchConfig . We can do
+  // this request by request. Put the information of the committed tokens into
+  // TreeVerifyBatchConfig::committed_tokens. TODO: where to store those tokens?
+  // 2. Load the tokens on the token tree to TreeVerifyBatchConfig::tokensInfo.
+  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
+  // TreeSearchBatchConfig.
+  // Please refer to the implementation of prepare_next_spec_batch_config() for
+  // more details.
+
   assert(old_batches.size() > 0);
 
   TreeVerifyBatchConfig new_bc;
@@ -1188,6 +1207,11 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   return new_bc;
 }
 
+void RequestManager::update_llm_verify_results(
+    InferenceResult const &llm_verify_result) {
+  // TODO: Implement this function
+}
+
 bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,

From 08942f2e8b0fc7e6dd4b0cd1a4bcba8c9478c8eb Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 15:56:43 -0400
Subject: [PATCH 071/667] Made fields in SsmInferenceResult public.

---
 include/flexflow/batch_config.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 0d493f3ae..e7df7adea 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -185,6 +185,7 @@ class TreeSearchBatchConfig : public BatchConfig {
 };
 
 class SsmInferenceResult : public InferenceResult {
+public:
   BatchConfig::TokenId
       token_ids[MAX_NUM_TOKENS *
                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];

From 3b380edc1d92503d07a3bd77d76d0abf68164035 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 15:57:38 -0400
Subject: [PATCH 072/667] 1. Moved committed_tokens from ReqestManager to
 Request. Two different committed_tokens are added to Request, for LLM and
 SSM, respectively. This is necessary because due to pruning, the token tree
 seen by the SSM and the LLM are different. So we have to maintain the
 committed tokens for both of them. 2. Remove the merge_dfs_trees API as we no
 longer need it. 3. Add some task descriptions.

---
 include/flexflow/request_manager.h | 19 ++++----
 src/runtime/request_manager.cc     | 76 +++++-------------------------
 2 files changed, 22 insertions(+), 73 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d115bfb35..2a1142ba4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -77,6 +77,16 @@ struct Request {
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
+  // Committed tokens
+  struct CommittedToken {
+    int absolute_index;
+    int request_offset; // Equivalent to the order of the token in the request
+                        // speculative token tree
+  };
+  // Here we have to maintain two versions of the committed tokens because the
+  // tree seen by the LLM and the SSM is different due to the pruning
+  std::vector<CommittedToken> llm_committed_tokens;
+  std::vector<CommittedToken> ssm_committed_tokens;
 };
 
 class TokenTreeNode {
@@ -190,13 +200,6 @@ class RequestManager {
                          int request_index,
                          int first_token_depth_in_request);
 
-  // remove guid after put the cached tree in request
-  std::vector<std::pair<BatchConfig::TokenId, int>> merge_dfs_trees(
-      std::vector<std::vector<std::pair<BatchConfig::TokenId, int>>>
-          input_trees,
-      int root_depth,
-      RequestGuid guid);
-
   std::vector<std::pair<BatchConfig::TokenId, int>> traverse_verify_tree(
       size_t guid,
       std::vector<std::pair<BatchConfig::TokenId, int>> const
@@ -288,8 +291,6 @@ class RequestManager {
   std::unordered_map<RequestGuid,
                      std::vector<std::pair<BatchConfig::TokenId, int>>>
       dfs_tree_inputs;
-  std::unordered_map<RequestGuid, std::vector<std::pair<int, int>>>
-      committed_tokens;
 
   // Multi-model support
   std::vector<FFModel *> ssm_models;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cdcdb39ee..fc21b2574 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -915,7 +915,9 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   // 1. Commit the verified tokens through a TreeVerifyBatchConfig . We can do
   // this request by request. Put the information of the committed tokens into
   // TreeVerifyBatchConfig::committed_tokens. TODO: where to store those tokens?
-  // 2. Load the tokens on the token tree to TreeVerifyBatchConfig::tokensInfo.
+  // 2. Load the tokens on the token tree that are not yet pruned to
+  // TreeVerifyBatchConfig::tokensInfo.
+  // 3. Update the causal mask for the large model.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
@@ -1210,10 +1212,19 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 void RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
   // TODO: Implement this function
+  // We may have two types of InferenceResults, one is the results from sampling
+  // the large model, the other is the top-p / top-k logits of the large model,
+  // we can first implement the former one
+  // For the latter one, we have to add a CPU based verify function
+  // 1. Compare the verified results with the request speculative token tree,
+  // and store all the committed tokens into committed_tokens of each request.
+  // 2. Store the verified tokens to Request.tokens.
+  // 3. Change the state of the request manager to SSM_SPEC.
 }
 
 bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
+  // TODO: change the request manager state
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
   assert(current_speculation_step >= 1 &&
@@ -1696,69 +1707,6 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
   // }
 }
 
-std::vector<std::pair<BatchConfig::TokenId, int>>
-    RequestManager::merge_dfs_trees(
-        std::vector<std::vector<std::pair<BatchConfig::TokenId, int>>>
-            input_trees,
-        int root_depth,
-        RequestGuid guid) {
-  assert(input_trees.size() == 1 && "currently using one ssm");
-  dfs_tree_inputs[guid] = input_trees.at(0);
-  return input_trees.at(0);
-
-  std::vector<std::pair<BatchConfig::TokenId, int>> merged_tree;
-
-  std::unordered_map<int, std::set<int>> childrens;
-  std::unordered_map<int, int> curr_path;
-
-  // convert <token_id, depth> pair to an integer
-  auto root = input_trees.at(0).at(0);
-  int root_id = root.first * 10000 + root.second;
-
-  for (int i = 0; i < input_trees.size(); i++) {
-    auto tree = input_trees.at(i);
-    // all trees should have the same root
-    assert(tree.at(0) == root);
-
-    for (auto const &pair : tree) {
-      int id = pair.first * 10000 + pair.second; // current node
-      curr_path[pair.second] = id;               // log node in current search
-
-      if (childrens.find(id) == childrens.end()) {
-        // init empty set
-        childrens[id] = std::set<int>();
-      }
-
-      if (pair.second > root_depth) {
-        int parent_id = curr_path[pair.second - 1];
-        childrens[parent_id].insert(id);
-      }
-    }
-  }
-
-  std::stack<int> q;
-  q.push(root_id);
-
-  while (!q.empty()) {
-    int curr = q.top();
-    q.pop();
-    merged_tree.push_back(std::make_pair(curr / 10000, curr % 10000));
-    for (int child : childrens[curr]) {
-      q.push(child);
-    }
-  }
-
-  if (verbose) {
-    for (auto &pair : merged_tree) {
-      std::cout << pair.first << ", depth: " << pair.second << std::endl;
-    }
-  }
-
-  dfs_tree_inputs[guid] = merged_tree;
-
-  return merged_tree;
-}
-
 std::vector<GenerationResult>
     FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
   RequestManager *rm = RequestManager::get_request_manager();

From c95f8366ab8a215beac29d29e52ce30a91ebd1f4 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 17 Apr 2024 16:00:33 -0400
Subject: [PATCH 073/667] bug fixes

---
 include/flexflow/request_manager.h | 4 ++++
 src/runtime/model.cc               | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4cb718abd..d1a1cadef 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -303,6 +303,10 @@ class RequestManager {
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
 
+  // rm state 
+  std::mutex rm_state_mutex;
+  std::vector<Request> activated_requests;
+
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,
                      std::vector<std::pair<BatchConfig::TokenId, int>>>
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 1468d9c85..b3f499684 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4460,14 +4460,14 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant<BatchConfig,
-                                        RequestManager::get_next_batch_config>(
+                                        RequestManager::get_next_batch_config_task>(
           registrar, "RequestManager Get Next Batch Config Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
       runtime->register_task_variant<BatchConfig,
-                                     RequestManager::get_next_batch_config>(
+                                     RequestManager::get_next_batch_config_task>(
           registrar);
     }
   }

From f56cff24ea0869be96582a2a2c0b8b078dafed95 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 16:54:02 -0400
Subject: [PATCH 074/667] Add more descriptions

---
 src/runtime/request_manager.cc | 40 +++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index fc21b2574..91564db3c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -418,13 +418,14 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   if (verbose) {
     std::cout << "\n############### prepare_next_batch_init ###############\n";
   }
-
   // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens to through the batch config. We can do this
-  // request by request. Put the information of the committed tokens into
-  // BatchConfig::TokensInfo. TODO: where to store those tokens?
+  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do this
+  // request by request. The infomation of the committed tokens are stored in
+  // Request.ssm_committed_tokens. Put the information of the committed tokens
+  // into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
+  // 3. Init causal mask.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
   // more details.
 
@@ -905,20 +906,22 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
 /***** Verify Phase *****/
 TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
-
   if (verbose) {
     std::cout
         << "\n############### prepare_next_batch_verify ###############\n";
   }
-
   // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens through a TreeVerifyBatchConfig . We can do
-  // this request by request. Put the information of the committed tokens into
-  // TreeVerifyBatchConfig::committed_tokens. TODO: where to store those tokens?
+  // 1. Commit the verified tokens in the last iteration through the
+  // TreeVerifyBatchConfig . We can do this request by request. The information
+  // of the committed tokens is stored in Request.llm_committed_tokens. Put the
+  // information of the committed tokens into
+  // TreeVerifyBatchConfig::committed_tokens.
   // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig::tokensInfo.
-  // 3. Update the causal mask for the large model.
-  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
+  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc. (skip
+  // the pruned tokens).
+  // 3. Create the causal mask for the large model based on the small model
+  // causal mask.
+  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
   // more details.
@@ -1216,15 +1219,18 @@ void RequestManager::update_llm_verify_results(
   // the large model, the other is the top-p / top-k logits of the large model,
   // we can first implement the former one
   // For the latter one, we have to add a CPU based verify function
-  // 1. Compare the verified results with the request speculative token tree,
-  // and store all the committed tokens into committed_tokens of each request.
-  // 2. Store the verified tokens to Request.tokens.
-  // 3. Change the state of the request manager to SSM_SPEC.
+  // 1. Compare the results returned from the LLM and compare them with the
+  // SSM's speculative token tree. For the greedy construction of the
+  // speculative token tree, we can simply compare LLM's sample result at each
+  // token, while for the sampling construction of the speculative token tree,
+  // we need to implement a CPU based verify function.
+  // 2. Store the committed tokens to Request.llm_committed_tokens and
+  // Request.ssm_committed_tokens.
+  // 3. Store the verified tokens to Request.tokens.
 }
 
 bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
-  // TODO: change the request manager state
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
   assert(current_speculation_step >= 1 &&

From 1579184f5dd5f24ee13ca405f9a89bdd1775acc3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 20:32:30 -0400
Subject: [PATCH 075/667] Modified bitmask APIs

---
 include/flexflow/request_manager.h |  7 +++---
 src/runtime/request_manager.cc     | 35 ++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e9e304a86..c12d4ab20 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -164,11 +164,10 @@ class RequestManager {
                           std::string const &path);
   void register_output_filepath(std::string const &);
   void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
-  void init_bitmask(BatchConfig::BitMask &bitmask, int initLength);
+  void init_bitmask(RequestGuid guid, int prompt_length);
   void append_bitmask(RequestGuid guid);
-  void update_bitmask(BatchConfig::BitMask &bitmask,
-                      int initLength,
-                      int non_tree_size);
+  void update_bitmask(RequestGuid guid, int num_committed_tokens);
+  BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
   FFModel *get_ssm_model(int model_id);
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 91564db3c..d99485bac 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -416,7 +416,8 @@ BatchConfig RequestManager::prepare_next_batch() {
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
-    std::cout << "\n############### prepare_next_batch_init ###############\n";
+    std::cout
+        << "\n############### prepare_first_spec_batch_config ##############\n";
   }
   // TODO: Clean up the code, this method does the following:
   // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do this
@@ -425,7 +426,6 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   // into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // 3. Init causal mask.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
   // more details.
 
@@ -1227,6 +1227,7 @@ void RequestManager::update_llm_verify_results(
   // 2. Store the committed tokens to Request.llm_committed_tokens and
   // Request.ssm_committed_tokens.
   // 3. Store the verified tokens to Request.tokens.
+  // 4. For requests not completed, update their causal mask.
 }
 
 bool RequestManager::update_ssm_inference_results(
@@ -1301,6 +1302,7 @@ bool RequestManager::update_ssm_inference_results(
 
 // bitmask related functions
 
+// TO BE REMOVED: START
 // prompt phase, init task
 void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
                                   int initLength) {
@@ -1347,6 +1349,26 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
   // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0])
   //           << "\n";
 }
+// TO BE REMOVED: END
+
+void RequestManager::init_bitmask(RequestGuid guid, int prompt_length) {
+  // This method modifies the bitmask in place
+  // This method is called by update_llm_verify_results
+  // TODO: implement this function
+  // 1. Clear the causal mask because our current speculative token tree is
+  // empty.
+  // 2. Maintain all other fields.
+}
+
+void RequestManager::update_bitmask(RequestGuid guid,
+                                    int num_committed_tokens) {
+  // This method modifies the bitmask in place
+  // This method is called by update_llm_verify_results
+  // TODO: implement this function
+  // 1. Clear the causal mask because our current speculative token tree is
+  // empty.
+  // 2. Maintain all other fields.
+}
 
 void RequestManager::append_bitmask(RequestGuid guid) {
   // This method changes the bitmask in place
@@ -1395,6 +1417,15 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   }
 }
 
+BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
+  // This method creates a new bitmask for LLM verification model's bitmask, it
+  // does not modify the small model's bitmask
+  // This method is called by prepare_verify_batch_config
+  // TODO: implement this function
+  // 1. Create the bitmask based on the pruned request token tree
+  // 2. Maintain all other fields
+}
+
 // prompt phase, init task
 void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
                                           int initLength) {

From abb6f32ed06eea7c4250844bc69344e51da89363 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 21:14:38 -0400
Subject: [PATCH 076/667] Remove unused APIs.

---
 include/flexflow/request_manager.h |  13 ---
 src/runtime/request_manager.cc     | 144 +----------------------------
 2 files changed, 4 insertions(+), 153 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c12d4ab20..b1360725f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -186,19 +186,6 @@ class RequestManager {
   // Methods to check and mark request completion
   bool is_request_completed(RequestGuid const &guid);
   void trigger_request_completion_future(RequestGuid const &guid);
-
-  std::vector<std::pair<BatchConfig::TokenId, int>>
-      traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
-                         int request_index,
-                         int first_token_depth_in_request);
-  // This function takes the tree stored in the token trees in
-  // RequestManager::all_requests, and convert them into serialized version.
-  // Called by prepare_next_batch_verify().
-  std::vector<std::pair<BatchConfig::TokenId, int>>
-      traverse_spec_tree(TreeSearchBatchConfig const &old_bc,
-                         int request_index,
-                         int first_token_depth_in_request);
-
   std::vector<std::pair<BatchConfig::TokenId, int>> traverse_verify_tree(
       size_t guid,
       std::vector<std::pair<BatchConfig::TokenId, int>> const
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d99485bac..8fa05472b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1217,8 +1217,8 @@ void RequestManager::update_llm_verify_results(
   // TODO: Implement this function
   // We may have two types of InferenceResults, one is the results from sampling
   // the large model, the other is the top-p / top-k logits of the large model,
-  // we can first implement the former one
-  // For the latter one, we have to add a CPU based verify function
+  // we can first implement the former one. For the latter one, we have to add a
+  // CPU based verify function.
   // 1. Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
@@ -1300,7 +1300,7 @@ bool RequestManager::update_ssm_inference_results(
   }
 }
 
-// bitmask related functions
+/* --------- Bitmask Related Functions --------- */
 
 // TO BE REMOVED: START
 // prompt phase, init task
@@ -1425,6 +1425,7 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // 1. Create the bitmask based on the pruned request token tree
   // 2. Maintain all other fields
 }
+/* --------- Bitmask Related Functions --------- */
 
 // prompt phase, init task
 void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
@@ -1445,69 +1446,6 @@ void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
   // }
 }
 
-bool PreOrder(
-    BeamTree const &tree,
-    int max_depth,
-    int current_depth,
-    int beam_width,
-    int id,
-    std::vector<std::pair<TreeSearchBatchConfig::TokenId, int>> &serializedTree,
-    bool verbose) {
-  // terminate
-  if (current_depth >= max_depth) {
-    serializedTree.push_back(std::make_pair(
-        tree.treeLayers[current_depth].tokens[id], current_depth));
-    if (verbose) {
-      std::cout << "last tokens: " << tree.treeLayers[current_depth].tokens[id]
-                << "\n";
-      std::cout << "return true" << "\n";
-    }
-    return true;
-  }
-
-  // add to tree;
-  // std::cout<<"node: " << current_depth << ", id: " <<
-  serializedTree.push_back(
-      std::make_pair(tree.treeLayers[current_depth].tokens[id], current_depth));
-  if (verbose) {
-    std::cout << "push something: " << tree.treeLayers[current_depth].tokens[id]
-              << ", " << current_depth << std::endl;
-  }
-  int index = serializedTree.size() - 1;
-  int next_layers = current_depth + 1;
-
-  bool flag = false;
-  // recursion
-  for (int i = 0; i < beam_width; i++) {
-    int child_id = i;
-    int child_parent = tree.treeLayers[next_layers].parent_ids[i];
-
-    // for all childs, do preOrder
-    if (child_parent == id) {
-      if (verbose) {
-        std::cout << "current depth: " << current_depth << ", child_parent, "
-                  << child_parent << ", child_id, " << child_id << "\n";
-      }
-      bool res = PreOrder(tree,
-                          max_depth,
-                          current_depth + 1,
-                          beam_width,
-                          child_id,
-                          serializedTree,
-                          verbose);
-      flag = flag || res;
-    }
-  }
-  // if (!flag) {
-  //   // no child for this token, delete it
-  //   std::cout << "delete a node: " <<
-  //   tree.treeLayers[current_depth].tokens[id]
-  //             << ", " << current_depth << std::endl;
-  //   serializedTree.erase(serializedTree.begin() + index);
-  // }
-  return flag;
-}
-
 std::vector<std::pair<BatchConfig::TokenId, int>>
     RequestManager::traverse_verify_tree(
         size_t guid,
@@ -1670,80 +1608,6 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
   return verifiedTree;
 }
 
-std::vector<std::pair<BatchConfig::TokenId, int>>
-    RequestManager::traverse_beam_tree(TreeSearchBatchConfig const &old_bc,
-                                       int request_index,
-                                       int first_token_depth_in_request) {
-  if (verbose) {
-    std::cout << "[Traverse Beam Tree] request_index: " << request_index
-              << "\n";
-    std::cout << "[Traverse Beam Tree] max_depth: "
-              << old_bc.beamRequestsInfo[request_index].max_depth << "\n";
-    std::cout << "[Traverse Beam Tree] current_depth: "
-              << old_bc.beamRequestsInfo[request_index].current_depth << "\n";
-    std::cout << "[Traverse Beam Tree] beam_width: "
-              << old_bc.beamRequestsInfo[request_index].beam_size << "\n";
-    std::cout << "[Traverse Beam Tree] start index: "
-              << first_token_depth_in_request << "\n";
-  }
-
-  auto guid = old_bc.requestsInfo[request_index].request_guid;
-  Request &request = all_requests[guid];
-  // std::cout << "request.beam_trees.size(): " << request.beam_trees.size()
-  //           << std::endl;
-  BeamTree tree = request.beam_trees.at(old_bc.model_id);
-
-  // std::cout << "print beam tree: "
-  //           << "\n";
-  std::vector<std::pair<BatchConfig::TokenId, int>> serializedTree;
-  for (int i = 0; i <= old_bc.beamRequestsInfo[request_index].max_depth; i++) {
-    // std::cout << "tree layer: " << i
-    //           << ", num_nodes: " << tree.treeLayers[i].nodes_num_this_layer
-    //           << "\n";
-    // push tokens into tree
-    for (int j = 0; j < tree.treeLayers[i].nodes_num_this_layer; j++) {
-      // std::cout << "token: " << tree.treeLayers[i].tokens[j] << "\n";
-      serializedTree.push_back(std::make_pair(tree.treeLayers[i].tokens[j], i));
-    }
-  }
-  // token, index
-  // todo make this one global for different stages
-
-  // PreOrder(tree,
-  //          old_bc.beamRequestsInfo[request_index].max_depth,
-  //          0,
-  //          old_bc.beamRequestsInfo[request_index].beam_size,
-  //          0,
-  //          serializedTree,
-  //          verbose);
-
-  // print it
-  if (verbose) {
-    std::cout << "Print serialized tree: size:" << request_index
-              << serializedTree.size() << "\n";
-  }
-  for (int k = 0; k < serializedTree.size(); k++) {
-    serializedTree.at(k).second += first_token_depth_in_request;
-    if (verbose) {
-      std::cout << "token id: " << serializedTree.at(k).first
-                << ", depth: " << serializedTree.at(k).second << "\n";
-    }
-  }
-
-  // if (dfs_tree_inputs.find(old_bc.requestsInfo[request_index].request_guid)
-  // !=
-  //     dfs_tree_inputs.end()) {
-  //   dfs_tree_inputs[old_bc.requestsInfo[request_index].request_guid] =
-  //       serializedTree;
-  // } else {
-  //   dfs_tree_inputs.insert(std::make_pair(
-  //       old_bc.requestsInfo[request_index].request_guid, serializedTree));
-  // }
-
-  return serializedTree;
-  // }
-}
-
 std::vector<GenerationResult>
     FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
   RequestManager *rm = RequestManager::get_request_manager();

From ae143b55d58cced98ae7567775d76df736815865 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 21:57:04 -0400
Subject: [PATCH 077/667] Keep old APIs for reference.

---
 src/runtime/request_manager.cc | 201 ++++++++++++++++++++-------------
 1 file changed, 122 insertions(+), 79 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8fa05472b..99eeed512 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -412,7 +412,7 @@ BatchConfig RequestManager::prepare_next_batch() {
 }
 /* ----- Speculative Inference Specific functions ----- */
 
-/***** Request Init Phase *****/
+// TO BE REMOVED: START
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
@@ -827,83 +827,6 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   return new_bc;
 }
 
-/***** Speculative Decoding Phase *****/
-TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-  if (verbose) {
-    std::cout << "\n############### prepare_next_batch_spec ###############\n";
-    std::cout << "Current tree depth: " << current_speculation_step << "\n";
-  }
-  // Prepare the next batch for existing requests
-  TreeSearchBatchConfig new_bc;
-  // We assume that only one small model is in use now
-  new_bc.model_id = 0;
-  new_bc.num_tokens = 0;
-  new_bc.num_available_requests = 0;
-
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
-       ++request_index) {
-    if (!request_available[request_index]) {
-      new_bc.request_available[request_index] = false;
-      continue;
-    }
-    int guid = guid_of_requests[request_index];
-    Request &request = all_requests[guid];
-    assert(request.status == Request::RUNNING);
-    new_bc.request_available[request_index] = true;
-    new_bc.num_available_requests++;
-    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-        new_bc.num_tokens;
-    // TODO: check this profiling
-    profiling_requests[request.guid].ssm_decoding_steps += 1;
-
-    // Fill in the tokens
-    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
-    if (token_tree.tree_layers.size() < current_speculation_step) {
-      // This request has no token to decode in this and the following small
-      // model inference steps
-      new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
-      new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_node_size;
-      continue;
-    } else {
-      std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
-          token_tree.tree_layers.at(current_speculation_step - 1);
-      // Exclude the current layer from the token tree, because we want the
-      // start index
-      new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_node_size -
-          current_layer.size();
-      new_bc.requestsInfo[request_index].num_tokens_in_batch =
-          current_layer.size();
-
-      int child_index = 0;
-      for (auto const &node_ptr : current_layer) {
-        new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-            new_bc.requestsInfo[request_index].first_token_index_in_request +
-            child_index;
-        new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
-
-        new_bc.num_tokens++;
-        child_index++;
-      }
-    }
-
-    // TODO: we should call append_bitmask at some point before this
-    // Copy the causal mask
-    new_bc.causalMask[request_index] = request.causal_mask;
-  }
-
-  new_bc.num_available_requests = num_available_requests;
-  if (verbose) {
-    std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
-    new_bc.print();
-  }
-  return new_bc;
-}
-
-/***** Verify Phase *****/
 TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
@@ -1211,6 +1134,124 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 
   return new_bc;
 }
+// TO BE REMOVED: END
+
+/***** Request Init Phase *****/
+TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  if (verbose) {
+    std::cout
+        << "\n############### prepare_first_spec_batch_config ##############\n";
+  }
+  // TODO: Clean up the code, this method does the following:
+  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do this
+  // request by request. The infomation of the committed tokens are stored in
+  // Request.ssm_committed_tokens. Put the information of the committed tokens
+  // into BatchConfig.TokensInfo.
+  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
+  // TreeSearchBatchConfig.
+  // Please refer to the implementation of prepare_next_spec_batch_config() for
+  // more details.
+}
+
+/***** Speculative Decoding Phase *****/
+TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  if (verbose) {
+    std::cout << "\n############### prepare_next_batch_spec ###############\n";
+    std::cout << "Current tree depth: " << current_speculation_step << "\n";
+  }
+  // Prepare the next batch for existing requests
+  TreeSearchBatchConfig new_bc;
+  // We assume that only one small model is in use now
+  new_bc.model_id = 0;
+  new_bc.num_tokens = 0;
+  new_bc.num_available_requests = 0;
+
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      new_bc.request_available[request_index] = false;
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    new_bc.request_available[request_index] = true;
+    new_bc.num_available_requests++;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
+    // TODO: check this profiling
+    profiling_requests[request.guid].ssm_decoding_steps += 1;
+
+    // Fill in the tokens
+    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    if (token_tree.tree_layers.size() < current_speculation_step) {
+      // This request has no token to decode in this and the following small
+      // model inference steps
+      new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.tokens.size() + token_tree.tree_node_size;
+      continue;
+    } else {
+      std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
+          token_tree.tree_layers.at(current_speculation_step - 1);
+      // Exclude the current layer from the token tree, because we want the
+      // start index
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.tokens.size() + token_tree.tree_node_size -
+          current_layer.size();
+      new_bc.requestsInfo[request_index].num_tokens_in_batch =
+          current_layer.size();
+
+      int child_index = 0;
+      for (auto const &node_ptr : current_layer) {
+        new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            new_bc.requestsInfo[request_index].first_token_index_in_request +
+            child_index;
+        new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
+
+        new_bc.num_tokens++;
+        child_index++;
+      }
+    }
+
+    // Copy the causal mask
+    new_bc.causalMask[request_index] = request.causal_mask;
+  }
+
+  new_bc.num_available_requests = num_available_requests;
+  if (verbose) {
+    std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
+    new_bc.print();
+  }
+  return new_bc;
+}
+
+/***** Verify Phase *****/
+TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  if (verbose) {
+    std::cout
+        << "\n############### prepare_next_batch_verify ###############\n";
+  }
+  // TODO: Clean up the code, this method does the following:
+  // 1. Commit the verified tokens in the last iteration through the
+  // TreeVerifyBatchConfig . We can do this request by request. The information
+  // of the committed tokens is stored in Request.llm_committed_tokens. Put the
+  // information of the committed tokens into
+  // TreeVerifyBatchConfig::committed_tokens.
+  // 2. Load the tokens on the token tree that are not yet pruned to
+  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc. (skip
+  // the pruned tokens).
+  // 3. Create the causal mask for the large model based on the small model
+  // causal mask.
+  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
+  // TreeSearchBatchConfig.
+  // Please refer to the implementation of prepare_next_spec_batch_config() for
+  // more details.
+}
 
 void RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
@@ -1227,7 +1268,9 @@ void RequestManager::update_llm_verify_results(
   // 2. Store the committed tokens to Request.llm_committed_tokens and
   // Request.ssm_committed_tokens.
   // 3. Store the verified tokens to Request.tokens.
-  // 4. For requests not completed, update their causal mask.
+  // 4. Some requests may be completed after appending the verified tokens,
+  // maintain the complete requests, and start a prefilling iteration.
+  // 5. For requests not completed, update their causal mask.
 }
 
 bool RequestManager::update_ssm_inference_results(

From 8334d41c40a8025f3413e1a00707ac49991ea2c8 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 17 Apr 2024 22:45:40 -0400
Subject: [PATCH 078/667] Change on some comments.

---
 src/runtime/request_manager.cc | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 99eeed512..85c941dfd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1152,6 +1152,9 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   // TreeSearchBatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
   // more details.
+  TreeSearchBatchConfig new_bc;
+
+  return new_bc;
 }
 
 /***** Speculative Decoding Phase *****/
@@ -1238,16 +1241,16 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   }
   // TODO: Clean up the code, this method does the following:
   // 1. Commit the verified tokens in the last iteration through the
-  // TreeVerifyBatchConfig . We can do this request by request. The information
-  // of the committed tokens is stored in Request.llm_committed_tokens. Put the
-  // information of the committed tokens into
-  // TreeVerifyBatchConfig::committed_tokens.
+  // TreeVerifyBatchConfig. We can do this request by request.
+  // The information of the committed tokens is stored in
+  // Request.llm_committed_tokens. Put the information of the committed tokens
+  // into TreeVerifyBatchConfig.committed_tokens.
   // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc. (skip
+  // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc. (skip
   // the pruned tokens).
   // 3. Create the causal mask for the large model based on the small model
-  // causal mask.
-  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
+  // causal mask (call create_llm_bitmask()).
+  // 4. Maintain TreeVerifyBatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config() for
   // more details.

From 4edd570d4c6fa090dc613136a1dbc58a7cdb6c2d Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 17 Apr 2024 23:41:43 -0400
Subject: [PATCH 079/667] update prepare_next_batch

---
 include/flexflow/request_manager.h |   8 +-
 src/runtime/request_manager.cc     | 167 +++++++++++++++++++++++------
 2 files changed, 144 insertions(+), 31 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d1a1cadef..9e28c0ac1 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -70,6 +70,9 @@ struct Request {
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
 
+  int first_token_offset_in_batch;
+  int num_tokens_in_batch;
+
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
@@ -276,6 +279,9 @@ class RequestManager {
   void update_inference_results(InferenceResult const &result);
   BatchConfig prepare_next_batch();
 
+  int get_num_active_requests();
+  int get_empty_request_index();
+
 private:
   // configuration parameters
   int max_requests_per_batch;
@@ -305,7 +311,7 @@ class RequestManager {
 
   // rm state 
   std::mutex rm_state_mutex;
-  std::vector<Request> activated_requests;
+  int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
 
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index fc860a1b6..b734ec5da 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -338,6 +338,25 @@ size_t RequestManager::get_num_processed_requests() {
   return num_processed_requests;
 }
 
+int RequestManager::get_num_active_requests() {
+  int count = 0;
+  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+    if (guid_of_requests[i] != INVALID_GUID) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int RequestManager::get_empty_request_index() {
+  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+    if (guid_of_requests[i] == INVALID_GUID) {
+      return i;
+    }
+  }
+  return -1;
+}
+
 BatchConfigFuture RequestManager::get_next_batch_config(
     InferenceResultFuture const &result, Context ctx, Runtime *runtime) {
   RequestManager *rm = this;
@@ -366,48 +385,136 @@ BatchConfig
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
-  for (int i = 0; i < result.num_tokens; i++) {
-    size_t guid = result.request_guids[i];
-    Request &request = all_requests[guid];
-    if (request.tokens.size() < request.max_sequence_length) {
-      request.tokens.push_back(result.token_ids[i]);
+  std::lock_guard<std::mutex> const lock(rm_state_mutex);
+  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+    if guid_of_requests[i] == INVALID_GUID {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[i]];
+
+    switch (request_manager_status) {
+      case PREFILLING:
+        if (request.initial_len == request.llm_cache_size) { // all prompt tokens are prefilled
+          request.tokens.push_back(result.token_ids[request.num_tokens_in_batch]);
+          request_manager_status = DECODING;
+        }
+        break;
+      case DECODING: 
+        request.tokens.push_back(result.token_ids[request.first_token_offset_in_batch]);
+        if (request.tokens.size() == request.max_sequence_length) { // request is completed
+          request.status = Request::COMPLETED;
+          trigger_request_completion_future(request.guid);
+          guid_of_requests[i] = INVALID_GUID;
+          request_manager_status = PREFILLING;
+        }
+        break;
+      default:
+        assert(false);
     }
   }
 }
 
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
+
+  swicth (request_manager_status) {
+    case PREFILLING:
+      return prepare_prefilling_batch();
+    case DECODING:
+      return prepare_decoding_batch();
+    default:
+      assert(false);
+  }
+}
+
+BatchConfig RequestManager::prepare_prefilling_batch() {
+  if (pending_request_queue.empty()) {
+    if (get_num_active_requests() == 0) {
+      return BatchConfig();
+    } else {
+      return prepare_decoding_batch();
+    }
+  }
+
+  BatchConfig bc;
+  bc.num_tokens = BatchConfig::MAX_NUM_TOKENS;
+
+  request_index = get_empty_request_index();
+  assert(request_index != -1);
+
+  Request new_request = pending_request_queue.front();
+  pending_request_queue.pop();
+  all_requests[new_request.guid] = new_request;
+  guid_of_requests[request_index] = new_request.guid;
+
+  // Per Request Info
+  bc.requestsInfo[request_index].first_token_depth_in_request = 0;
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(bc.num_tokens, (int)new_request.tokens.size());
+
+  bc.request_completed[request_index] = false;
+
+  new_request.first_token_offset_in_batch = 0;
+  new_request.num_tokens_in_batch = 0;
+
+  // Delete those after update BatchConfig
+  bc.requestsInfo[request_index].max_sequence_length = new_request.max_sequence_length;
+  bc.requestsInfo[request_index].request_guid = new_request.guid;
+  bc.requestsInfo[request_index].prompt_phase = true;
+  bc.requestsInfo[request_index].batch_config_request_id = request_index;
+
+
+  // Per Token Info
+  for (int j = 0; j < bc.requestsInfo[request_index].num_tokens_in_batch; j++) {
+    int depth = bc.requestsInfo[request_index].first_token_depth_in_request + j;
+    bc.tokensInfo[j].request_index = request_index;
+    bc.tokensInfo[j].abs_depth_in_request = depth;
+    assert(depth < new_request.tokens.size());
+    bc.tokensInfo[j].token_id = new_request.tokens[depth];
+
+    new_request.llm_cache_size++;
+    new_request.num_tokens_in_batch++;
+  }
+  
+  return bc;
+}
+
+BatchConfig RequestManager::prepare_decoding_batch() {
   BatchConfig bc;
   bc.num_tokens = 0;
-  int num_generation_tokens = 0;
-  int num_active_req = -1;
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (pending_request_queue.empty()) {
-      break;
+
+  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+    if (guid_of_requests[i] == INVALID_GUID) {
+      continue;
     }
-    Request new_request = pending_request_queue.front();
-    pending_request_queue.pop();
-    all_requests[new_request.guid] = new_request;
-    bc.requestsInfo[i].first_token_depth_in_request = 0;
+
+    Request &request = all_requests[guid_of_requests[i]];
+
+    // Per Request Info
+    bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size;
     bc.requestsInfo[i].first_token_offset_in_batch = bc.num_tokens;
-    bc.requestsInfo[i].request_guid = new_request.guid;
-    bc.requestsInfo[i].num_tokens_in_batch =
-        std::min(get_max_tokens_per_batch(), (int)new_request.tokens.size());
-    bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length;
+    bc.requestsInfo[i].num_tokens_in_batch = 1;
+
     bc.request_completed[i] = false;
-    bc.requestsInfo[i].prompt_phase = true;
-    num_active_req++;
-    bc.requestsInfo[num_active_req].batch_config_request_id = i;
-    for (int j = 0; j < bc.requestsInfo[i].num_tokens_in_batch; j++) {
-      int depth = bc.requestsInfo[i].first_token_depth_in_request + j;
-      bc.tokensInfo[bc.num_tokens].request_index = i;
-      bc.tokensInfo[bc.num_tokens].abs_depth_in_request = depth;
-      assert(depth < new_request.tokens.size());
-      bc.tokensInfo[bc.num_tokens].token_id = new_request.tokens[depth];
-      bc.num_tokens++;
-    }
+
+    request.first_token_offset_in_batch = bc.num_tokens;
+    request.num_tokens_in_batch = 1;
+
+    // Delete those after update BatchConfig
+    bc.requestsInfo[i].max_sequence_length = request.max_sequence_length;
+    bc.requestsInfo[i].request_guid = request.guid;
+    bc.requestsInfo[i].prompt_phase = false;
+    bc.requestsInfo[i].batch_config_request_id = i;
+
+    // Per Token Info
+    bc.tokensInfo[bc.num_tokens].request_index = i;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
+
+    request.llm_cache_size++;
+    bc.num_tokens++;
   }
-  bc.num_generation_tokens = num_generation_tokens;
+
   return bc;
 }
 

From b5633341dec04a760834c919168ea2c426058ae3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 00:01:28 -0400
Subject: [PATCH 080/667] Fix prepare_next_spec_batch_config.

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 85c941dfd..7eba667da 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1169,6 +1169,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
   new_bc.num_tokens = 0;
+  new_bc.current_depth = current_speculation_step;
   new_bc.num_available_requests = 0;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
@@ -1220,11 +1221,10 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
       }
     }
 
-    // Copy the causal mask
+    // Copy the causal mask, it should already been updated
     new_bc.causalMask[request_index] = request.causal_mask;
   }
 
-  new_bc.num_available_requests = num_available_requests;
   if (verbose) {
     std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
     new_bc.print();

From c669a5eaaf326af57d591b55c1b0210a2a5f4333 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 00:06:08 -0400
Subject: [PATCH 081/667] Implemented init_bitmask and update_bitmask.

---
 src/runtime/request_manager.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7eba667da..d94223747 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1404,6 +1404,12 @@ void RequestManager::init_bitmask(RequestGuid guid, int prompt_length) {
   // 1. Clear the causal mask because our current speculative token tree is
   // empty.
   // 2. Maintain all other fields.
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+  bitmask.tree_size = 0;
+  bitmask.current_layer_size = 0;
+  bitmask.prompt_size = prompt_length;
+  bitmask.non_tree_cache_size = prompt_length;
 }
 
 void RequestManager::update_bitmask(RequestGuid guid,
@@ -1414,6 +1420,11 @@ void RequestManager::update_bitmask(RequestGuid guid,
   // 1. Clear the causal mask because our current speculative token tree is
   // empty.
   // 2. Maintain all other fields.
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+  bitmask.tree_size = 0;
+  bitmask.current_layer_size = 0;
+  bitmask.non_tree_cache_size += num_committed_tokens;
 }
 
 void RequestManager::append_bitmask(RequestGuid guid) {

From c4e61a72d0859640e2ebfed6bf65808afb7fbb79 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 16:36:54 +0800
Subject: [PATCH 082/667] fix: remove duplicate definition

---
 include/flexflow/request_manager.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 376e7e242..c846adb59 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -280,7 +280,6 @@ class RequestManager {
       token_tree_node_pool;
   // rm state
   std::mutex rm_state_mutex;
-  int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
 
   // TODO: Move this two vector to request struct
   std::unordered_map<RequestGuid,

From 2600b64ac38c93008a66d4e289b7be1a096cbc1e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 19:42:58 +0800
Subject: [PATCH 083/667] chore: update BatchConfig in ops/

- add_bias_residual_layer_norm
- aggregate
- aggregate_spec
---
 src/ops/add_bias_residual_layer_norm.cc | 2 +-
 src/ops/aggregate.cc                    | 2 +-
 src/ops/aggregate_spec.cc               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc
index e67038090..badb41f40 100644
--- a/src/ops/add_bias_residual_layer_norm.cc
+++ b/src/ops/add_bias_residual_layer_norm.cc
@@ -526,7 +526,7 @@ void AddBiasResidualLayerNorm::backward(FFModel const &ff) {
 
 FutureMap AddBiasResidualLayerNorm::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc
index 5f05458e3..8047b0aee 100644
--- a/src/ops/aggregate.cc
+++ b/src/ops/aggregate.cc
@@ -297,7 +297,7 @@ void Aggregate::forward(FFModel const &ff) {
 }
 
 FutureMap Aggregate::inference(FFModel const &ff,
-                               BatchConfigFuture const &bc,
+                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                                std::vector<ParallelTensor> const &batch_inputs,
                                std::vector<ParallelTensor> const &batch_outputs,
                                MachineView const *mv) {
diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc
index 1edd43088..75c084721 100644
--- a/src/ops/aggregate_spec.cc
+++ b/src/ops/aggregate_spec.cc
@@ -266,7 +266,7 @@ void AggregateSpec::forward(FFModel const &ff) {
 
 FutureMap
     AggregateSpec::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {

From 012ad1ba1f0d7d1dff29fe06e61e13f02b0528e1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 20:06:16 +0800
Subject: [PATCH 084/667] chore: update BatchConfig in ops/arg_topk, but leave
 beamsearch to kill

---
 src/ops/arg_topk.cc  | 2 +-
 src/ops/arg_topk.cpp | 6 +++---
 src/ops/arg_topk.cu  | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 614cc476d..212e10d67 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -292,7 +292,7 @@ void ArgTopK::forward(FFModel const &ff) {
 }
 
 FutureMap ArgTopK::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {
diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp
index f431d3d4b..665cbe08e 100644
--- a/src/ops/arg_topk.cpp
+++ b/src/ops/arg_topk.cpp
@@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
                              int length,
                              int k,
                              bool sorted,
-                             BeamSearchBatchConfig const *bc,
+                             /* Reserved: BatchConfig Updated, leave beamsearch to kill */TreeSearchBatchConfig const *bc,
                              hipStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
@@ -405,7 +405,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
     // check
     int beam_size = -1;
     for (int i = 1; i < bc->max_requests_per_batch(); i++) {
-      if (bc->request_completed[i]) {
+      if (!bc->request_available[i]) {
         continue;
       } else if (beam_size == -1) {
         beam_size = bc->beamRequestsInfo[i].beam_size;
@@ -448,7 +448,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                                      // float *output_ptr,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     BeamSearchBatchConfig const *bc) {
+                                     TreeSearchBatchConfig const *bc) {
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   // Domain in1_domain = runtime->get_index_space_domain(
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 5b7978812..8110de0ae 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
                              int length,
                              int k,
                              bool sorted,
-                             BeamSearchBatchConfig const *bc,
+                             /* Reserved: BatchConfig Updated, leave beamsearch to kill */TreeSearchBatchConfig const *bc,
                              cudaStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
@@ -410,7 +410,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
     int last_request_idx =
         bc->requestsInfo[num_activate_requests - 1].batch_config_request_id;
     for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-      if (bc->request_completed[i]) {
+      if (!bc->request_available[i]) {
         continue;
       } else if (beam_size == -1) {
         beam_size = bc->beamRequestsInfo[i].beam_size;
@@ -454,7 +454,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                                      GenericTensorAccessorW const &probs,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     BeamSearchBatchConfig const *bc) {
+                                     TreeSearchBatchConfig const *bc) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 

From 9feec1932fa165ab3671cf4c39f74c303f28a2fc Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 21:09:43 +0800
Subject: [PATCH 085/667] chore: update BatchConfig in ops/

- argmax
- attention
---
 src/ops/argmax.cc    | 2 +-
 src/ops/attention.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index 2ead6a1e3..e5b6331c8 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -261,7 +261,7 @@ void ArgMax::forward(FFModel const &ff) {
 }
 
 FutureMap ArgMax::inference(FFModel const &ff,
-                            BatchConfigFuture const &bc,
+                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                             std::vector<ParallelTensor> const &batch_inputs,
                             std::vector<ParallelTensor> const &batch_outputs,
                             MachineView const *mv) {
diff --git a/src/ops/attention.cc b/src/ops/attention.cc
index 97afc9434..a2672c297 100644
--- a/src/ops/attention.cc
+++ b/src/ops/attention.cc
@@ -577,7 +577,7 @@ void MultiHeadAttention::forward(FFModel const &ff) {
 
 FutureMap MultiHeadAttention::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {

From 1609ac478de7d6b47865da17892add1b8bb2d972 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 21:10:22 +0800
Subject: [PATCH 086/667] chore: update BatchConfig in ops/beam_topk, but leave
 beamsearch to kill

---
 src/ops/beam_topk.cc  | 4 ++--
 src/ops/beam_topk.cpp | 8 ++++----
 src/ops/beam_topk.cu  | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc
index d018ff59e..72fc5e96b 100644
--- a/src/ops/beam_topk.cc
+++ b/src/ops/beam_topk.cc
@@ -292,7 +292,7 @@ void BeamTopK::forward(FFModel const &ff) {
 }
 
 FutureMap BeamTopK::inference(FFModel const &ff,
-                              BatchConfigFuture const &bc,
+                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                               std::vector<ParallelTensor> const &batch_inputs,
                               std::vector<ParallelTensor> const &batch_outputs,
                               MachineView const *mv) {
@@ -354,7 +354,7 @@ SsmInferenceResult
   TreeSearchBatchConfig const &bc =
       Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
 
-  if (bc.num_tokens == 0) {
+  if (bc.num_active_tokens() == 0) {
     SsmInferenceResult ir;
     return ir;
   }
diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp
index 18534455a..948fdd110 100644
--- a/src/ops/beam_topk.cpp
+++ b/src/ops/beam_topk.cpp
@@ -290,7 +290,7 @@ __device__ void mergeBeamShards(int num_shards,
     // Initialize the heap as a min-heap.
     for (int slot = 0; slot < heap_size; slot++) {
       // int beam = (slot % max_heap_size) / k;
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
+      /* Reserved: BatchConfig Updated, leave beamsearch to kill */T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
                      ((slot % max_heap_size) / k)];
       min_heap.assign(slot, {slot, (entries[slot].value * prob)});
     }
@@ -474,7 +474,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input,
 /*static*/
 template <typename DT>
 void BeamTopK::forward_kernel(BeamTopKMeta const *m,
-                              BeamSearchBatchConfig const *bc,
+                              TreeSearchBatchConfig const *bc,
                               DT const *input_ptr,
                               float *output_ptr,
                               int *indices_ptr,
@@ -511,7 +511,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
   DT acc_probs[max_total_requests];
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (!bc->request_available[i]) {
       continue;
     }
     assert(bc->beamRequestsInfo[i].beam_size > 0);
@@ -625,7 +625,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
 
 /*static*/
 void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m,
-                                      BeamSearchBatchConfig const *bc,
+                                      TreeSearchBatchConfig const *bc,
                                       GenericTensorAccessorR const &input,
                                       float *output_ptr,
                                       int *indices_ptr,
diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu
index 48f057f98..09c09c2e3 100644
--- a/src/ops/beam_topk.cu
+++ b/src/ops/beam_topk.cu
@@ -290,7 +290,7 @@ __device__ void mergeBeamShards(int num_shards,
     // Initialize the heap as a min-heap.
     for (int slot = 0; slot < heap_size; slot++) {
       // int beam = (slot % max_heap_size) / k;
-      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
+      /* Reserved: BatchConfig Updated, leave beamsearch to kill */T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
                      ((slot % max_heap_size) / k)];
       min_heap.assign(slot, {slot, (entries[slot].value * prob)});
       if (verbose && batch_index == 0) {
@@ -507,7 +507,7 @@ __global__ void beam_topk_forward_kernel(T const *__restrict__ input,
 /*static*/
 template <typename DT>
 void BeamTopK::forward_kernel(BeamTopKMeta const *m,
-                              BeamSearchBatchConfig const *bc,
+                              TreeSearchBatchConfig const *bc,
                               DT const *input_ptr,
                               float *output_ptr,
                               int *indices_ptr,
@@ -544,7 +544,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
   DT acc_probs[max_total_requests];
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (!bc->request_available[i]) {
       continue;
     }
     assert(bc->beamRequestsInfo[i].beam_size > 0);

From 6ef34f952b74582ab831e2016860c8c1152c1b7e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 21:10:30 +0800
Subject: [PATCH 087/667] chore: update BatchConfig in ops/

- cast
- element_binary
- element_unary
- embedding
- experts
---
 src/ops/cast.cc           | 2 +-
 src/ops/element_binary.cc | 2 +-
 src/ops/element_unary.cc  | 2 +-
 src/ops/embedding.cc      | 2 +-
 src/ops/experts.cc        | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ops/cast.cc b/src/ops/cast.cc
index e514236a3..8cff9f741 100644
--- a/src/ops/cast.cc
+++ b/src/ops/cast.cc
@@ -227,7 +227,7 @@ void Cast::forward(FFModel const &ff) {
 }
 
 FutureMap Cast::inference(FFModel const &ff,
-                          BatchConfigFuture const &bc,
+                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                           std::vector<ParallelTensor> const &batch_inputs,
                           std::vector<ParallelTensor> const &batch_outputs,
                           MachineView const *mv) {
diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc
index 4352f459b..485162dd0 100644
--- a/src/ops/element_binary.cc
+++ b/src/ops/element_binary.cc
@@ -542,7 +542,7 @@ void ElementBinary::forward(FFModel const &ff) {
 
 FutureMap
     ElementBinary::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {
diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc
index 0e1d11555..a166978e4 100644
--- a/src/ops/element_unary.cc
+++ b/src/ops/element_unary.cc
@@ -422,7 +422,7 @@ void ElementUnary::forward(FFModel const &ff) {
 
 FutureMap
     ElementUnary::inference(FFModel const &ff,
-                            BatchConfigFuture const &bc,
+                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                             std::vector<ParallelTensor> const &batch_inputs,
                             std::vector<ParallelTensor> const &batch_outputs,
                             MachineView const *mv) {
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index e630563b6..674aaf63d 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -456,7 +456,7 @@ void Embedding::forward(FFModel const &ff) {
 }
 
 FutureMap Embedding::inference(FFModel const &ff,
-                               BatchConfigFuture const &bc,
+                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                                std::vector<ParallelTensor> const &batch_inputs,
                                std::vector<ParallelTensor> const &batch_outputs,
                                MachineView const *mv) {
diff --git a/src/ops/experts.cc b/src/ops/experts.cc
index 8c66f9c7b..a5b00e6cb 100644
--- a/src/ops/experts.cc
+++ b/src/ops/experts.cc
@@ -669,7 +669,7 @@ void Experts::forward(FFModel const &ff) {
 }
 
 FutureMap Experts::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {

From 6b9aaa4ba62faa139c2c4ccfa67d4438850c68ab Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 18 Apr 2024 21:12:01 +0800
Subject: [PATCH 088/667] chore: update BatchConfig in ops/

- fused
- group_by
---
 src/ops/fused.cc    |  2 +-
 src/ops/fused.cpp   | 16 ++++++++--------
 src/ops/fused.cu    | 19 +++++++++----------
 src/ops/group_by.cc |  2 +-
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index 9ad5c4dc9..a6d2bb2f9 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -514,7 +514,7 @@ void FusedOp::forward(FFModel const &ff) {
 }
 
 FutureMap FusedOp::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {
diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp
index 3282bc57d..f09caa10d 100644
--- a/src/ops/fused.cpp
+++ b/src/ops/fused.cpp
@@ -525,7 +525,7 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  /* Reserved: BatchConfig Updated */BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     return;
   }
@@ -871,7 +871,7 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         TreeIncMultiHeadSelfAttentionMeta *m =
             (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        TreeVerifyBatchConfig const &tree_bc =
+        TreeVerifyBatchConfig const &verify_bc =
             Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
@@ -882,7 +882,7 @@ __host__ void
         }
         TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
             m,
-            &tree_bc,
+            &verify_bc,
             task->index_point.point_data[0],
             my_input_accessor[0],
             my_weight_accessor[0],
@@ -895,10 +895,10 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         SpecIncMultiHeadSelfAttentionMeta const *m =
             (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        // BeamSearchBatchConfig const *beam_bc =
-        //     (BeamSearchBatchConfig *)task->args;
-        BeamSearchBatchConfig const &beam_bc =
-            Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+        // TreeSearchBatchConfig const *search_bc =
+        //     (TreeSearchBatchConfig *)task->args;
+        TreeSearchBatchConfig const &search_bc =
+            Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
@@ -908,7 +908,7 @@ __host__ void
         }
         SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
             m,
-            &beam_bc,
+            &search_bc,
             task->index_point.point_data[0],
             my_input_accessor[0],
             my_weight_accessor[0],
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 483028599..875321182 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -539,8 +539,7 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  // BatchConfig const *bc = (BatchConfig *)task->args;
-  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  /* Reserved: BatchConfig Updated */BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   // Return if no active tokens
   if (bc->num_tokens == 0) {
     return;
@@ -906,9 +905,9 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         TreeIncMultiHeadSelfAttentionMeta *m =
             (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        // TreeVerifyBatchConfig const *tree_bc =
+        // TreeVerifyBatchConfig const *verify_bc =
         //     (TreeVerifyBatchConfig *)task->args;
-        TreeVerifyBatchConfig const &tree_bc =
+        TreeVerifyBatchConfig const &verify_bc =
             Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
@@ -919,7 +918,7 @@ __host__ void
         }
         TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
             m,
-            &tree_bc,
+            &verify_bc,
             task->index_point.point_data[0],
             my_input_accessor[0],
             my_weight_accessor[0],
@@ -932,10 +931,10 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         SpecIncMultiHeadSelfAttentionMeta const *m =
             (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        // BeamSearchBatchConfig const *beam_bc =
-        //     (BeamSearchBatchConfig *)task->args;
-        BeamSearchBatchConfig const &beam_bc =
-            Future(task->futures[0]).get_result<BeamSearchBatchConfig>();
+        // TreeSearchBatchConfig const *search_bc =
+        //     (TreeSearchBatchConfig *)task->args;
+        TreeSearchBatchConfig const &search_bc =
+            Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
@@ -945,7 +944,7 @@ __host__ void
         }
         SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
             m,
-            &beam_bc,
+            &search_bc,
             task->index_point.point_data[0],
             my_input_accessor[0],
             my_weight_accessor[0],
diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc
index f2f402737..715ad14f0 100644
--- a/src/ops/group_by.cc
+++ b/src/ops/group_by.cc
@@ -322,7 +322,7 @@ void Group_by::forward(FFModel const &ff) {
 }
 
 FutureMap Group_by::inference(FFModel const &ff,
-                              BatchConfigFuture const &bc,
+                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                               std::vector<ParallelTensor> const &batch_inputs,
                               std::vector<ParallelTensor> const &batch_outputs,
                               MachineView const *mv) {

From 23296e17c718a333183825b49896a75e7f1930de Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 16:16:00 -0400
Subject: [PATCH 089/667] Removed obsolete API push_spec_infer_tree_width, and
 fixed some formatting issues.

---
 include/flexflow/request_manager.h |  1 -
 src/runtime/request_manager.cc     | 35 +++++++++++++++---------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c846adb59..1b3661e83 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -158,7 +158,6 @@ class RequestManager {
   int get_max_spec_tree_token_num();
   int get_max_verify_tokens_per_batch();
   void set_max_sequence_length(int max_seq_length);
-  void push_spec_infer_tree_width(int tree_width);
   int get_max_sequence_length();
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index df00c36f2..c6de73687 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -109,11 +109,6 @@ int RequestManager::get_max_sequence_length() {
   return max_sequence_length;
 }
 
-void RequestManager::push_spec_infer_tree_width(int tree_width) {
-  assert(tree_width <= TreeSearchBatchConfig::MAX_BEAM_WIDTH);
-  spec_infer_tree_width.emplace_back(tree_width);
-}
-
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -387,21 +382,25 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
   for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-    if guid_of_requests[i] == INVALID_GUID {
+    if (guid_of_requests[i] == INVALID_GUID) {
       continue;
     }
     Request &request = all_requests[guid_of_requests[i]];
 
     switch (request_manager_status) {
       case PREFILLING:
-        if (request.initial_len == request.llm_cache_size) { // all prompt tokens are prefilled
-          request.tokens.push_back(result.token_ids[request.num_tokens_in_batch]);
+        if (request.initial_len ==
+            request.llm_cache_size) { // all prompt tokens are prefilled
+          request.tokens.push_back(
+              result.token_ids[request.num_tokens_in_batch]);
           request_manager_status = DECODING;
         }
         break;
-      case DECODING: 
-        request.tokens.push_back(result.token_ids[request.first_token_offset_in_batch]);
-        if (request.tokens.size() == request.max_sequence_length) { // request is completed
+      case DECODING:
+        request.tokens.push_back(
+            result.token_ids[request.first_token_offset_in_batch]);
+        if (request.tokens.size() ==
+            request.max_sequence_length) { // request is completed
           request.status = Request::COMPLETED;
           trigger_request_completion_future(request.guid);
           guid_of_requests[i] = INVALID_GUID;
@@ -417,7 +416,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
-  swicth (request_manager_status) {
+  switch (request_manager_status) {
     case PREFILLING:
       return prepare_prefilling_batch();
     case DECODING:
@@ -450,7 +449,8 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   // Per Request Info
   bc.requestsInfo[request_index].first_token_depth_in_request = 0;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(bc.num_tokens, (int)new_request.tokens.size());
+  bc.requestsInfo[request_index].num_tokens_in_batch =
+      std::min(bc.num_tokens, (int)new_request.tokens.size());
 
   bc.request_completed[request_index] = false;
 
@@ -458,12 +458,12 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   new_request.num_tokens_in_batch = 0;
 
   // Delete those after update BatchConfig
-  bc.requestsInfo[request_index].max_sequence_length = new_request.max_sequence_length;
+  bc.requestsInfo[request_index].max_sequence_length =
+      new_request.max_sequence_length;
   bc.requestsInfo[request_index].request_guid = new_request.guid;
   bc.requestsInfo[request_index].prompt_phase = true;
   bc.requestsInfo[request_index].batch_config_request_id = request_index;
 
-
   // Per Token Info
   for (int j = 0; j < bc.requestsInfo[request_index].num_tokens_in_batch; j++) {
     int depth = bc.requestsInfo[request_index].first_token_depth_in_request + j;
@@ -475,7 +475,7 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
     new_request.llm_cache_size++;
     new_request.num_tokens_in_batch++;
   }
-  
+
   return bc;
 }
 
@@ -1610,6 +1610,7 @@ void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
   // }
 }
 
+// TO BE REMOVED: START
 std::vector<std::pair<BatchConfig::TokenId, int>>
     RequestManager::traverse_verify_tree(
         size_t guid,
@@ -1771,6 +1772,7 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 
   return verifiedTree;
 }
+// TO BE REMOVED: END
 
 std::vector<GenerationResult>
     FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
@@ -2161,5 +2163,4 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   }
 }
 /* --------- Request Token Tree Related Functions --------- */
-
 }; // namespace FlexFlow

From c791967cb195e31296865c3e097bc3bbf87843de Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 16:44:16 -0400
Subject: [PATCH 090/667] Add an API to verify the tokens returned by the LLM.

---
 include/flexflow/request_manager.h | 1 +
 src/runtime/request_manager.cc     | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1b3661e83..661f1780a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -316,6 +316,7 @@ class RequestManager {
   void update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
+  void get_verify_results(InferenceResult const &llm_verify_result);
   /* ---------- New Helper Functions ---------- */
 
   // Helper functions related to token trees
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c6de73687..4a9d69064 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1774,6 +1774,12 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 }
 // TO BE REMOVED: END
 
+void RequestManager::get_verify_results(
+    InferenceResult const &llm_verify_result) {
+  // This function should return the verified tokens and maintain the committed
+  // tokens.
+}
+
 std::vector<GenerationResult>
     FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
   RequestManager *rm = RequestManager::get_request_manager();

From a0197c12b7c13bab0990b1541c9e4a85afc7d797 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 16:51:52 -0400
Subject: [PATCH 091/667] Fix some bug related to
 BatchConfig::BitMask.non_tree_cache_size.

---
 include/flexflow/batch_config.h | 3 ++-
 src/runtime/request_manager.cc  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index e7df7adea..1a967a3db 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -110,7 +110,8 @@ class BatchConfig {
 
   public:
     Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM];
-    // the number of tokens before the tree
+    // the number of generated tokens before the speculation tree (excluding the
+    // prompt tokens)
     int non_tree_cache_size = 0;
     // current tree size
     int tree_size = 0;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4a9d69064..e66cc4127 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1516,7 +1516,7 @@ void RequestManager::init_bitmask(RequestGuid guid, int prompt_length) {
   bitmask.tree_size = 0;
   bitmask.current_layer_size = 0;
   bitmask.prompt_size = prompt_length;
-  bitmask.non_tree_cache_size = prompt_length;
+  bitmask.non_tree_cache_size = 0;
 }
 
 void RequestManager::update_bitmask(RequestGuid guid,

From 5d898cb3857a899cf37944436a92d2e9c3f1d2b5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 21:43:05 -0400
Subject: [PATCH 092/667] Add some description on the commit process of the SSM
 and LLM.

---
 include/flexflow/batch_config.h    |  4 ++--
 include/flexflow/request_manager.h | 35 ++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 1a967a3db..5d33c5515 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -72,8 +72,8 @@ class BatchConfig {
     TokenId token_id;
     int abs_index_in_request;
     int request_index;
-    // This offset is only used for small model KV cache commit
-    int last_batch_offset = -1;
+    // For SSM KV cache commitment
+    int kv_cache_index = -1;
   };
 
   class BitMask {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 661f1780a..a0b310281 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -80,16 +80,33 @@ struct Request {
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
-  // Committed tokens
-  struct CommittedToken {
-    int absolute_index;
-    int request_offset; // Equivalent to the order of the token in the request
-                        // speculative token tree
-  };
   // Here we have to maintain two versions of the committed tokens because the
-  // tree seen by the LLM and the SSM is different due to the pruning
-  std::vector<CommittedToken> llm_committed_tokens;
-  std::vector<CommittedToken> ssm_committed_tokens;
+  // tree seen by the LLM and the SSM is different due to the pruning.
+  //
+  // 1. Commit the SSM KV cache: On the GPU, the KV cache of the
+  // tokens on the speculative token tree is stored together with the KV cache
+  // of the already verified tokens. So the `from_index` should be the absolute
+  // index of the token in the entire token: prompt_length +
+  // generated_sequence_length + index in the speculative token tree. `to_index`
+  // should be the place to put the KV cache in the SSM KV cache: prompt_length
+  // + generated_sequence_length + index_in_committed_tokens.
+  //
+  // 2. Commit the LLM KV cache: On the GPU, the KV cache of the speculative
+  // token tree and the generated tokens are stored separately. So the
+  // `from_index` should be the index of the token in the speculative token
+  // tree. `to_index` should be the place to put the KV cache in the LLM KV
+  // cache: prompt_length + generated_sequence_length +
+  // index_in_committed_tokens.
+  //
+  // Even though `from_index` and `to_index` means different things for the SSM
+  // and the LLM, we can still use the same struct to store the committed
+  // tokens.
+  struct CommittedTokens {
+    int from_index;
+    int to_index;
+  };
+  std::vector<CommittedTokens> llm_committed_tokens;
+  std::vector<CommittedTokens> ssm_committed_tokens;
 };
 
 class TokenTreeNode {

From 70490267160c378d8e66febadb738ccc20d84496 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 18 Apr 2024 21:48:11 -0400
Subject: [PATCH 093/667] Add a more detailed description of the commit process
 of the KV cache of the SSM and the LLM.

---
 include/flexflow/batch_config.h    | 2 +-
 include/flexflow/request_manager.h | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 5d33c5515..b04ee8f59 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -73,7 +73,7 @@ class BatchConfig {
     int abs_index_in_request;
     int request_index;
     // For SSM KV cache commitment
-    int kv_cache_index = -1;
+    int kv_cache_dest_index = -1;
   };
 
   class BitMask {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a0b310281..78542c566 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -91,6 +91,9 @@ struct Request {
   // should be the place to put the KV cache in the SSM KV cache: prompt_length
   // + generated_sequence_length + index_in_committed_tokens.
   //
+  // from_index -> BatchConfig::PerTokenInfo.abs_index_in_request
+  // to_index -> BatchConfig::PerTokenInfo.kv_cache_dest_index
+  //
   // 2. Commit the LLM KV cache: On the GPU, the KV cache of the speculative
   // token tree and the generated tokens are stored separately. So the
   // `from_index` should be the index of the token in the speculative token
@@ -98,9 +101,13 @@ struct Request {
   // cache: prompt_length + generated_sequence_length +
   // index_in_committed_tokens.
   //
+  // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_index
+  // to_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_depth
+  //
   // Even though `from_index` and `to_index` means different things for the SSM
   // and the LLM, we can still use the same struct to store the committed
   // tokens.
+
   struct CommittedTokens {
     int from_index;
     int to_index;

From 40e3d877c7c918de200b35c2d9a83e00cbc5ab41 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 19 Apr 2024 00:14:15 -0400
Subject: [PATCH 094/667] Fill in the SSM_SPEC and LLM_VERIFY states in
 prepare_next_batch

---
 src/runtime/request_manager.cc | 103 ++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e66cc4127..4ea76a86b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -421,6 +421,14 @@ BatchConfig RequestManager::prepare_next_batch() {
       return prepare_prefilling_batch();
     case DECODING:
       return prepare_decoding_batch();
+    case SSM_SPEC:
+      if (current_speculation_step == 0) {
+        return prepare_first_spec_batch_config();
+      } else {
+        return prepare_next_spec_batch_config();
+      }
+    case LLM_VERIFY:
+      return prepare_verify_batch_config();
     default:
       assert(false);
   }
@@ -523,18 +531,18 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
-    std::cout
-        << "\n############### prepare_first_spec_batch_config ##############\n";
+    std::cout << "\n############### prepare_first_spec_batch_config "
+                 "##############\n";
   }
   // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do this
-  // request by request. The infomation of the committed tokens are stored in
-  // Request.ssm_committed_tokens. Put the information of the committed tokens
-  // into BatchConfig.TokensInfo.
+  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do
+  // this request by request. The infomation of the committed tokens are
+  // stored in Request.ssm_committed_tokens. Put the information of the
+  // committed tokens into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config() for
-  // more details.
+  // Please refer to the implementation of prepare_next_spec_batch_config()
+  // for more details.
 
   // Step 1: use result to update requests
   TreeSearchBatchConfig new_bc;
@@ -591,8 +599,8 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
                  tree_outputs.back().second,
                  token_id);
         }
-        // std::cout << "Index within old batch: " << result_index << std::endl;
-        // printf("  Input: [%d] %d ---> [%d] %d \n",
+        // std::cout << "Index within old batch: " << result_index <<
+        // std::endl; printf("  Input: [%d] %d ---> [%d] %d \n",
         //        abs_depth,
         //        old_bc.tokensInfo[result_index].token_id,
         //        tree_outputs.back().second,
@@ -809,8 +817,8 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
 
       // Token Info
       std::string output = this->tokenizer_->Decode(request.tokens);
-      // Unlike Huggingface, the sentencepiece C++ library automatically removes
-      // the BOS token
+      // Unlike Huggingface, the sentencepiece C++ library automatically
+      // removes the BOS token
       if (model_type == ModelType::LLAMA &&
           request.tokens.at(0) == bos_token_id) {
         output = "<s> " + output;
@@ -942,19 +950,19 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   }
   // TODO: Clean up the code, this method does the following:
   // 1. Commit the verified tokens in the last iteration through the
-  // TreeVerifyBatchConfig . We can do this request by request. The information
-  // of the committed tokens is stored in Request.llm_committed_tokens. Put the
-  // information of the committed tokens into
-  // TreeVerifyBatchConfig::committed_tokens.
+  // TreeVerifyBatchConfig . We can do this request by request. The
+  // information of the committed tokens is stored in
+  // Request.llm_committed_tokens. Put the information of the committed tokens
+  // into TreeVerifyBatchConfig::committed_tokens.
   // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc. (skip
-  // the pruned tokens).
+  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc.
+  // (skip the pruned tokens).
   // 3. Create the causal mask for the large model based on the small model
   // causal mask.
   // 4. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config() for
-  // more details.
+  // Please refer to the implementation of prepare_next_spec_batch_config()
+  // for more details.
 
   assert(old_batches.size() > 0);
 
@@ -1029,7 +1037,8 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
       // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", "
       //           << new_bc.causalMask[i].tree_size << ", "
       //           << new_bc.causalMask[i].non_tree_cache_size << "\n";
-      // std::cout << "mask: " << std::bitset<64>(new_bc.causalMask[i].mask[0])
+      // std::cout << "mask: " <<
+      // std::bitset<64>(new_bc.causalMask[i].mask[0])
       //           << "\n";
 
       // Committed Tokens
@@ -1174,7 +1183,8 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
       if (request.llm_cache_size < request.initial_len) {
         // std::cout << "Initialization (prompt) phase: "
         //           << new_bc.requestsInfo[i].num_tokens_in_batch << ", "
-        //           << old_batches.at(0).beamRequestsInfo[i].beam_size << "\n";
+        //           << old_batches.at(0).beamRequestsInfo[i].beam_size <<
+        //           "\n";
         // Initialization (prompt) phase
         for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
@@ -1209,7 +1219,8 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
               std::vector<std::pair<BatchConfig::TokenId, int>>{std::make_pair(
                   request.tokens.back(), request.tokens.size() - 1)};
         }
-      } else { // launch the request into running phase after loading all prompt
+      } else { // launch the request into running phase after loading all
+               // prompt
         if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) {
           // std::cout << "Initialization running phase: "
           //           << new_bc.requestsInfo[i].num_tokens_in_batch << "\n";
@@ -1247,18 +1258,18 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
-    std::cout
-        << "\n############### prepare_first_spec_batch_config ##############\n";
+    std::cout << "\n############### prepare_first_spec_batch_config "
+                 "##############\n";
   }
   // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do this
-  // request by request. The infomation of the committed tokens are stored in
-  // Request.ssm_committed_tokens. Put the information of the committed tokens
-  // into BatchConfig.TokensInfo.
+  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do
+  // this request by request. The infomation of the committed tokens are
+  // stored in Request.ssm_committed_tokens. Put the information of the
+  // committed tokens into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config() for
-  // more details.
+  // Please refer to the implementation of prepare_next_spec_batch_config()
+  // for more details.
   TreeSearchBatchConfig new_bc;
 
   return new_bc;
@@ -1353,23 +1364,23 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   // Request.llm_committed_tokens. Put the information of the committed tokens
   // into TreeVerifyBatchConfig.committed_tokens.
   // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc. (skip
-  // the pruned tokens).
+  // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc.
+  // (skip the pruned tokens).
   // 3. Create the causal mask for the large model based on the small model
   // causal mask (call create_llm_bitmask()).
   // 4. Maintain TreeVerifyBatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config() for
-  // more details.
+  // Please refer to the implementation of prepare_next_spec_batch_config()
+  // for more details.
 }
 
 void RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
   // TODO: Implement this function
-  // We may have two types of InferenceResults, one is the results from sampling
-  // the large model, the other is the top-p / top-k logits of the large model,
-  // we can first implement the former one. For the latter one, we have to add a
-  // CPU based verify function.
+  // We may have two types of InferenceResults, one is the results from
+  // sampling the large model, the other is the top-p / top-k logits of the
+  // large model, we can first implement the former one. For the latter one,
+  // we have to add a CPU based verify function.
   // 1. Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
@@ -1394,8 +1405,8 @@ bool RequestManager::update_ssm_inference_results(
   int result_index = 0;
 
   // Here we assume that the order of the tokens in the last
-  // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to the
-  // order of the request in the last TreeSearchBatchConfig
+  // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to
+  // the order of the request in the last TreeSearchBatchConfig
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1582,9 +1593,9 @@ void RequestManager::append_bitmask(RequestGuid guid) {
 }
 
 BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
-  // This method creates a new bitmask for LLM verification model's bitmask, it
-  // does not modify the small model's bitmask
-  // This method is called by prepare_verify_batch_config
+  // This method creates a new bitmask for LLM verification model's bitmask,
+  // it does not modify the small model's bitmask This method is called by
+  // prepare_verify_batch_config
   // TODO: implement this function
   // 1. Create the bitmask based on the pruned request token tree
   // 2. Maintain all other fields
@@ -1776,8 +1787,8 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 
 void RequestManager::get_verify_results(
     InferenceResult const &llm_verify_result) {
-  // This function should return the verified tokens and maintain the committed
-  // tokens.
+  // This function should return the verified tokens and maintain the
+  // committed tokens.
 }
 
 std::vector<GenerationResult>

From f40ccc669ce1591055f5e9f455da9418b12b6553 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 19 Apr 2024 17:43:58 +0800
Subject: [PATCH 095/667] feat: GPU side batch_config_meta update

---
 include/flexflow/config.h      |  9 ++++---
 src/runtime/request_manager.cu | 43 +++++++---------------------------
 2 files changed, 12 insertions(+), 40 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 1be26b8e3..3fea8e3a6 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -80,12 +80,11 @@ struct FFHandler {
 
   // request info + token info + topolopgy mask info
   size_t batch_config_metadata_size =
-      sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-      sizeof(TreeSearchBatchConfig::beamTokenInfo) +
-      sizeof(TreeSearchBatchConfig::beamRequestsInfo) +
+      sizeof(BatchConfig::tokensInfo) +
+      sizeof(BatchConfig::requestsInfo) +
+      sizeof(BatchConfig::request_available) +
       sizeof(BatchConfig::causalMask) +
-      sizeof(TreeVerifyBatchConfig::committed_tokens) +
-      sizeof(BatchConfig::request_available);
+      sizeof(TreeVerifyBatchConfig::committed_tokens);
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
   DataType quantization_type;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index c5fd6b3a7..66305cf69 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -96,28 +96,19 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::requestsInfo);
 
+  checkCUDA(cudaMemcpyAsync(
+      static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+      &(batch_config->request_available),
+      sizeof(BatchConfig::request_available),
+      cudaMemcpyHostToDevice,
+      stream));
+  total_copy_size += sizeof(BatchConfig::request_available);
+
   // load speculative metadata
   if (batch_config->get_mode() == BEAM_SEARCH_MODE) {
     TreeSearchBatchConfig const *beam_batch_config =
         static_cast<TreeSearchBatchConfig const *>(batch_config);
 
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(beam_batch_config->beamTokenInfo),
-        sizeof(TreeSearchBatchConfig::beamTokenInfo),
-        cudaMemcpyHostToDevice,
-        stream));
-
-    total_copy_size += sizeof(TreeSearchBatchConfig::beamTokenInfo);
-
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(beam_batch_config->beamRequestsInfo),
-        sizeof(TreeSearchBatchConfig::beamRequestsInfo),
-        cudaMemcpyHostToDevice,
-        stream));
-    total_copy_size += sizeof(TreeSearchBatchConfig::beamRequestsInfo);
-
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
         &(beam_batch_config->causalMask),
@@ -125,15 +116,6 @@ void RequestManager::load_batch_config_task(
         cudaMemcpyHostToDevice,
         stream));
     total_copy_size += sizeof(BatchConfig::causalMask);
-
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->request_available),
-        sizeof(BatchConfig::request_available),
-        cudaMemcpyHostToDevice,
-        stream));
-
-    total_copy_size += sizeof(BatchConfig::request_available);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     TreeVerifyBatchConfig const *tree_batch_config =
         static_cast<TreeVerifyBatchConfig const *>(batch_config);
@@ -152,15 +134,6 @@ void RequestManager::load_batch_config_task(
         cudaMemcpyHostToDevice,
         stream));
     total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens);
-
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->request_available),
-        sizeof(BatchConfig::request_available),
-        cudaMemcpyHostToDevice,
-        stream));
-
-    total_copy_size += sizeof(BatchConfig::request_available);
   }
 
   // add a size check

From 80f68c16840738e674f5b3c881e3a0d75d1adff8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 19 Apr 2024 18:11:30 +0800
Subject: [PATCH 096/667] chore: update BatchConfig in
 src/ops/inc_multihead_self_attention

---
 .../ops/inc_multihead_self_attention.h        |  1 +
 src/ops/inc_multihead_self_attention.cc       |  2 +-
 src/ops/inc_multihead_self_attention.cpp      |  4 +-
 src/ops/inc_multihead_self_attention.cu       | 42 ++++++++++++-------
 4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 43dc527bc..779f55127 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -193,6 +193,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
   BatchConfig::PerRequestInfo *request_infos;
+  bool *request_available;
   DataType quantization_type;
   bool offload;
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 7aa350377..9220e9c38 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -741,7 +741,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap IncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index e1e299d42..a97b97526 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -127,7 +127,7 @@ template <typename DT>
 __global__ void
     apply_rotary_embedding_native(DT *input_ptr,
                                   hipFloatComplex *complex_input,
-                                  BatchConfig::PerTokenInfo const *tokenInfos,
+                                  /* Reserved: BatchConfig Updated, leave beamsearch to kill */BatchConfig::PerTokenInfo const *tokenInfos,
                                   int qProjSize,
                                   int kProjSize,
                                   int num_q_heads,
@@ -530,7 +530,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_available[i]) {
+    if (!bc->request_available[i]) {
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index e44b4c97f..c5bdcb427 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -52,7 +52,8 @@ __global__ void compute_attention_kernel_generation_kernel(
     int max_seq_length,
     int per_head_size,
     int hidden_size,
-    BatchConfig::PerRequestInfo *request_infos) {
+    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
+    bool *request_available) {
 
   // q, k
   using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
@@ -80,14 +81,21 @@ __global__ void compute_attention_kernel_generation_kernel(
   // request idx
   int const request_idx = blockIdx.y;
 
-  int const batch_config_request_id =
-      request_infos[request_idx].batch_config_request_id;
+  int requext_idx_in_batch = 0;
+  for (int i = 0; i < request_idx; i++) {
+    while (!request_available[requext_idx_in_batch]) {
+      requext_idx_in_batch++;
+    }
+  }
+
+  // threads converge
+  __syncthreads();
 
   int const first_step = 0;
 
   int const tlength =
-      request_infos[batch_config_request_id].first_token_index_in_request +
-      request_infos[batch_config_request_id].num_tokens_in_batch;
+      request_infos[requext_idx_in_batch].first_token_index_in_request +
+      request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
   // shared memory objects
   extern __shared__ char smem_[];
@@ -135,7 +143,7 @@ __global__ void compute_attention_kernel_generation_kernel(
   constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
 
   DT const *k_cache_batch =
-      key_cache + batch_config_request_id * max_seq_length * hidden_size + ki;
+      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
 
   int ti_end =
       div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
@@ -238,7 +246,7 @@ __global__ void compute_attention_kernel_generation_kernel(
 
   // The base pointer for the value in the cache buffer.
   DT const *v_cache_batch =
-      value_cache + batch_config_request_id * max_seq_length * hidden_size + vi;
+      value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
 
   if (Dh == Dh_MAX || vi < Dh) {
     for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
@@ -714,14 +722,15 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
           BatchConfig::max_sequence_length(),                                  \
           m->qProjSize,                                                        \
           m->hidden_size,                                                      \
-          m->request_infos)
+          m->request_infos,                                                    \
+          m->request_available)
 
 template <typename DT>
 void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
                                          BatchConfig const *bc,
                                          DT *output_ptr,
                                          cudaStream_t stream) {
-  dim3 grid(m->num_q_heads, bc->num_generation_tokens);
+  dim3 grid(m->num_q_heads, bc->num_tokens);
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   size_t smem_sz;
@@ -825,13 +834,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      stream);
   update_kv_cache_kernel<DT>(m, bc, stream);
 
-  if (bc->num_generation_tokens > 0) {
+  if (bc->num_tokens > 0) {
     // phase 3: Compute attention score for generation tokens
     compute_attention_kernel_generation<DT>(
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
-  if (bc->num_tokens > bc->num_generation_tokens) {
+  if (bc->num_tokens > bc->num_tokens) {
     // phase 4: Compute attention score for prompt tokens;
     compute_attention_kernel_prompt(
         m, bc, shard_id, bias_ptr, weight_ptr, stream);
@@ -929,7 +938,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_available[i] || (!bc->requestsInfo[i].prompt_phase)) {
+    if (!bc->request_available[i] || (!bc->requestsInfo[i].prompt_phase)) {
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
@@ -1125,13 +1134,13 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
     }
     tokens_previous_requests += num_new_tokens;
   }
-  if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) {
+  if (tokens_previous_requests != (num_tokens - bc->num_tokens)) {
     bc->print();
     printf("tokens_previous_requests: %i\n", tokens_previous_requests);
     printf("num_tokens: %i\n", num_tokens);
-    printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens);
+    printf("bc->num_tokens: %i\n", bc->num_tokens);
   }
-  assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens));
+  assert(tokens_previous_requests == (num_tokens - bc->num_tokens));
 }
 
 /*static*/
@@ -1429,6 +1438,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     request_infos = reinterpret_cast<BatchConfig::PerRequestInfo *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo));
+    request_available = reinterpret_cast<bool *>(
+        reinterpret_cast<char *>(handler.batch_config_metadata) +
+        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo));
 
     if (offload) {
       // token_infos =

From ae7ba9864a7e141afc8e4be6a11c1602e122fa7d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 19 Apr 2024 18:16:41 +0800
Subject: [PATCH 097/667] chore: update BatchConfig in ops/

- layer_norm
- linear
- noop
- residual_layer_norm
- residual_rms_norm
- rms_norm
- sampling
- sigmoid_silu_multi
- softmax
---
 src/ops/layer_norm.cc          | 2 +-
 src/ops/linear.cc              | 2 +-
 src/ops/noop.cc                | 2 +-
 src/ops/residual_layer_norm.cc | 2 +-
 src/ops/residual_rms_norm.cc   | 2 +-
 src/ops/rms_norm.cc            | 2 +-
 src/ops/sampling.cc            | 2 +-
 src/ops/sigmoid_silu_multi.cc  | 2 +-
 src/ops/softmax.cc             | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc
index 2218ffe39..158af9322 100644
--- a/src/ops/layer_norm.cc
+++ b/src/ops/layer_norm.cc
@@ -439,7 +439,7 @@ void LayerNorm::forward(FFModel const &ff) {
 }
 
 FutureMap LayerNorm::inference(FFModel const &ff,
-                               BatchConfigFuture const &bc,
+                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                                std::vector<ParallelTensor> const &batch_inputs,
                                std::vector<ParallelTensor> const &batch_outputs,
                                MachineView const *mv) {
diff --git a/src/ops/linear.cc b/src/ops/linear.cc
index 0c7a0f78f..981df5dca 100644
--- a/src/ops/linear.cc
+++ b/src/ops/linear.cc
@@ -556,7 +556,7 @@ void Linear::forward(FFModel const &ff) {
 }
 
 FutureMap Linear::inference(FFModel const &ff,
-                            BatchConfigFuture const &bc,
+                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                             std::vector<ParallelTensor> const &batch_inputs,
                             std::vector<ParallelTensor> const &batch_outputs,
                             MachineView const *mv) {
diff --git a/src/ops/noop.cc b/src/ops/noop.cc
index da2d4922e..a5561bcd1 100644
--- a/src/ops/noop.cc
+++ b/src/ops/noop.cc
@@ -259,7 +259,7 @@ void NoOp::init(FFModel const &ff) {
 void NoOp::forward(FFModel const &ff) {}
 
 FutureMap NoOp::inference(FFModel const &ff,
-                          BatchConfigFuture const &bc,
+                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                           std::vector<ParallelTensor> const &batch_inputs,
                           std::vector<ParallelTensor> const &batch_outputs,
                           MachineView const *mv) {
diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc
index ed9252c30..b85ecdd13 100644
--- a/src/ops/residual_layer_norm.cc
+++ b/src/ops/residual_layer_norm.cc
@@ -536,7 +536,7 @@ Op *ResidualLayerNorm::materialize(FFModel &ff,
 
 FutureMap ResidualLayerNorm::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index f4f5bb72d..2723cb06b 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -365,7 +365,7 @@ void ResidualRMSNorm::forward(FFModel const &ff) {
 
 FutureMap
     ResidualRMSNorm::inference(FFModel const &ff,
-                               BatchConfigFuture const &bc,
+                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                                std::vector<ParallelTensor> const &batch_inputs,
                                std::vector<ParallelTensor> const &batch_outputs,
                                MachineView const *mv) {
diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc
index bf07ee6bb..56cf08147 100644
--- a/src/ops/rms_norm.cc
+++ b/src/ops/rms_norm.cc
@@ -340,7 +340,7 @@ void RMSNorm::forward(FFModel const &ff) {
 }
 
 FutureMap RMSNorm::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {
diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc
index 9fc2316f9..3c67fc6ec 100644
--- a/src/ops/sampling.cc
+++ b/src/ops/sampling.cc
@@ -247,7 +247,7 @@ void Sampling::forward(FFModel const &ff) {
 }
 
 FutureMap Sampling::inference(FFModel const &ff,
-                              BatchConfigFuture const &bc,
+                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                               std::vector<ParallelTensor> const &batch_inputs,
                               std::vector<ParallelTensor> const &batch_outputs,
                               MachineView const *mv) {
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index 3ddd6b8d6..2495f86bd 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -262,7 +262,7 @@ void SigmoidSiluMulti::backward(FFModel const &ff) {
 
 FutureMap SigmoidSiluMulti::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 03618423b..3687fe860 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -279,7 +279,7 @@ OpMeta *Softmax::init_task(Task const *task,
 }
 
 FutureMap Softmax::inference(FFModel const &ff,
-                             BatchConfigFuture const &bc,
+                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                              std::vector<ParallelTensor> const &batch_inputs,
                              std::vector<ParallelTensor> const &batch_outputs,
                              MachineView const *mv) {

From 2d2179561d1b69b7eaf3f36f6b4992b4b4a07da1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 19 Apr 2024 18:20:05 +0800
Subject: [PATCH 098/667] chore: update BatchConfig in ops/

- split
- topk
---
 src/ops/split.cc | 2 +-
 src/ops/topk.cc  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/split.cc b/src/ops/split.cc
index 7c6b631b2..a9b01dc21 100644
--- a/src/ops/split.cc
+++ b/src/ops/split.cc
@@ -250,7 +250,7 @@ void Split::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 FutureMap Split::inference(FFModel const &ff,
-                           BatchConfigFuture const &bc,
+                           /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                            std::vector<ParallelTensor> const &batch_inputs,
                            std::vector<ParallelTensor> const &batch_outputs,
                            MachineView const *mv) {
diff --git a/src/ops/topk.cc b/src/ops/topk.cc
index 7d30a8aff..17512328e 100644
--- a/src/ops/topk.cc
+++ b/src/ops/topk.cc
@@ -270,7 +270,7 @@ void TopK::forward(FFModel const &ff) {
 }
 
 FutureMap TopK::inference(FFModel const &ff,
-                          BatchConfigFuture const &bc,
+                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
                           std::vector<ParallelTensor> const &batch_inputs,
                           std::vector<ParallelTensor> const &batch_outputs,
                           MachineView const *mv) {

From 8a8994015c1e56875220d240c1c7be7f73deb576 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 19 Apr 2024 18:51:05 +0800
Subject: [PATCH 099/667] style: macro minor

---
 src/ops/inc_multihead_self_attention.cu | 48 +++++++++++++------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index c5bdcb427..510a6e34f 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -701,29 +701,31 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-#define LAUNCH_ATTENTION_SCORE_KERNEL(                                         \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)   \
-  smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                               \
-                                   BatchConfig::max_sequence_length(),         \
-                                   THREADS_PER_VALUE,                          \
-                                   THDS_PER_BLOCK);                            \
-  compute_attention_kernel_generation_kernel<DT,                               \
-                                             THDS_PER_BLOCK,                   \
-                                             Dh,                               \
-                                             Dh_MAX,                           \
-                                             THDS_PER_KEY,                     \
-                                             THREADS_PER_VALUE>                \
-      <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                             \
-          static_cast<DT *>(m->devQKVProjArray),                               \
-          static_cast<DT *>(m->keyCache),                                      \
-          static_cast<DT *>(m->valueCache),                                    \
-          output_ptr,                                                          \
-          scale,                                                               \
-          BatchConfig::max_sequence_length(),                                  \
-          m->qProjSize,                                                        \
-          m->hidden_size,                                                      \
-          m->request_infos,                                                    \
-          m->request_available)
+#define LAUNCH_ATTENTION_SCORE_KERNEL(                                          \
+    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)    \
+  do {                                                                          \
+    smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                              \
+                                    BatchConfig::max_sequence_length(),         \
+                                    THREADS_PER_VALUE,                          \
+                                    THDS_PER_BLOCK);                            \
+    compute_attention_kernel_generation_kernel<DT,                              \
+                                              THDS_PER_BLOCK,                   \
+                                              Dh,                               \
+                                              Dh_MAX,                           \
+                                              THDS_PER_KEY,                     \
+                                              THREADS_PER_VALUE>                \
+        <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                            \
+            static_cast<DT *>(m->devQKVProjArray),                              \
+            static_cast<DT *>(m->keyCache),                                     \
+            static_cast<DT *>(m->valueCache),                                   \
+            output_ptr,                                                         \
+            scale,                                                              \
+            BatchConfig::max_sequence_length(),                                 \
+            m->qProjSize,                                                       \
+            m->hidden_size,                                                     \
+            m->request_infos,                                                   \
+            m->request_available);                                              \
+  } while (0)
 
 template <typename DT>
 void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,

From 49fce381554ddfa13e235c52ede96d4beb574a4c Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 19 Apr 2024 11:15:34 -0400
Subject: [PATCH 100/667] Now we don't commit small model KV cache.

---
 include/flexflow/batch_config.h    |  2 --
 include/flexflow/request_manager.h | 36 +++++++++++++++++-------------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index b04ee8f59..86c1df872 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -72,8 +72,6 @@ class BatchConfig {
     TokenId token_id;
     int abs_index_in_request;
     int request_index;
-    // For SSM KV cache commitment
-    int kv_cache_dest_index = -1;
   };
 
   class BitMask {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 78542c566..7fee20b87 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -80,19 +80,19 @@ struct Request {
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
-  // Here we have to maintain two versions of the committed tokens because the
-  // tree seen by the LLM and the SSM is different due to the pruning.
+  // Here we maintain a struct CommitTokens which has a field `from_index` and
+  // `to_index`. The `from_index` is used by the LLM KV cache commitment and the
+  // `to_index` is used both by the the SSM KV cache recomputation and the LLM
+  // KV cache commitment. Details are as follows:
   //
-  // 1. Commit the SSM KV cache: On the GPU, the KV cache of the
-  // tokens on the speculative token tree is stored together with the KV cache
-  // of the already verified tokens. So the `from_index` should be the absolute
-  // index of the token in the entire token: prompt_length +
-  // generated_sequence_length + index in the speculative token tree. `to_index`
-  // should be the place to put the KV cache in the SSM KV cache: prompt_length
-  // + generated_sequence_length + index_in_committed_tokens.
+  // 1. Recompute the SSM KV cache: We don't commit the KV cache of the SSM
+  // committed tokens but recompute them instead. That is, after the we append
+  // the committed tokens to the generated sequence, just like in the prefilling
+  // phase, and pass them into the SSM to recompute the KV cache. Here we don't
+  // need `from_index` because we don't copy the KV cache, but we need
+  // `to_index`, which is the indices of the committed tokens in the request.
   //
-  // from_index -> BatchConfig::PerTokenInfo.abs_index_in_request
-  // to_index -> BatchConfig::PerTokenInfo.kv_cache_dest_index
+  // to_index -> BatchConfig::PerTokenInfo.abs_index_in_request
   //
   // 2. Commit the LLM KV cache: On the GPU, the KV cache of the speculative
   // token tree and the generated tokens are stored separately. So the
@@ -104,16 +104,20 @@ struct Request {
   // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_index
   // to_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_depth
   //
-  // Even though `from_index` and `to_index` means different things for the SSM
-  // and the LLM, we can still use the same struct to store the committed
-  // tokens.
+  // Actually, for a committed token, the `to_index` for the LLM KV cache and
+  // the SSM KV cache are the same thing, so we can use the same field to store
+  // the information.
+  //
+  // When storing the committed tokens:
+  // from_index: The offset of the committed token in the request in the
+  // TreeVerifyBatchConfig
+  // to_index: The absolute index of the token in the request
 
   struct CommittedTokens {
     int from_index;
     int to_index;
   };
-  std::vector<CommittedTokens> llm_committed_tokens;
-  std::vector<CommittedTokens> ssm_committed_tokens;
+  std::vector<CommittedTokens> committed_tokens;
 };
 
 class TokenTreeNode {

From c179ba6a2ac9772337b3c6905e411c3379e75f0b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 19 Apr 2024 15:55:23 -0400
Subject: [PATCH 101/667] 1. Add a token_id field in Request::CommittedToken
 for small model recomputation. 2. Implemented the function to compare the LLM
 output and the SSM speculation tree, not finished yet.

---
 include/flexflow/request_manager.h |  7 +++--
 src/runtime/request_manager.cc     | 50 ++++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7fee20b87..7fde35fb0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -113,11 +113,14 @@ struct Request {
   // TreeVerifyBatchConfig
   // to_index: The absolute index of the token in the request
 
-  struct CommittedTokens {
+  struct CommittedToken {
     int from_index;
     int to_index;
+    BatchConfig::TokenId token_id;
+    CommittedToken(int from_index, int to_index, BatchConfig::TokenId token_id)
+        : from_index(from_index), to_index(to_index), token_id(token_id) {}
   };
-  std::vector<CommittedTokens> committed_tokens;
+  std::vector<CommittedToken> committed_tokens;
 };
 
 class TokenTreeNode {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4ea76a86b..1ebe356d8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1425,7 +1425,7 @@ bool RequestManager::update_ssm_inference_results(
             guid,
             ssm_inference_result.token_ids[result_index],
             ssm_inference_result.probs[result_index],
-            0);
+            -1);
         result_index++;
       }
     } else if (token_tree.tree_layers.size() < current_speculation_step - 1) {
@@ -1787,8 +1787,54 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 
 void RequestManager::get_verify_results(
     InferenceResult const &llm_verify_result) {
-  // This function should return the verified tokens and maintain the
+  // This function maintain the generated token list of the request and the
   // committed tokens.
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    request.committed_tokens.clear();
+
+    // Traverse the speculative token tree and it with the LLM's sampling output
+    int llm_result_index = 0;
+    int verified_parent_pos = -1;
+    int committed_token_index = 0;
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    for (auto const &tree_layer : token_tree.tree_layers) {
+      bool token_accepted_this_layer = false;
+      int current_layer_index = 0;
+      for (auto const &node_ptr : tree_layer) {
+        if (node_ptr->pruned) {
+          continue;
+        }
+        if (node_ptr->parent_pos != verified_parent_pos) {
+          llm_result_index++;
+          current_layer_index++;
+          continue;
+        } else if (token_accepted_this_layer) {
+          // A token is already accepted in the current layer
+          llm_result_index++;
+          current_layer_index++;
+          continue;
+        } else {
+          if (node_ptr->id == llm_verify_result.token_ids[llm_result_index]) {
+            request.committed_tokens.push_back(Request::CommittedToken(
+                llm_result_index, committed_token_index, node_ptr->id));
+            request.tokens.push_back(node_ptr->id);
+            token_accepted_this_layer = true;
+            verified_parent_pos = current_layer_index;
+            committed_token_index++;
+          }
+          llm_result_index++;
+          current_layer_index++;
+        }
+      }
+    }
+  }
 }
 
 std::vector<GenerationResult>

From 9880a3bf88ee6e17bfeab1d90e5b64ae20932a8f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 19 Apr 2024 16:15:53 -0400
Subject: [PATCH 102/667] Renamed InferenceMode::BEAM_SEARCH_MODE to
 InferenceMode::TREE_SEARCH_MODE.

---
 inference/models/falcon.cc                    |  6 +--
 inference/models/llama.cc                     |  6 +--
 inference/models/mpt.cc                       |  6 +--
 inference/models/opt.cc                       |  6 +--
 inference/models/starcoder.cc                 |  4 +-
 inference/spec_infer/spec_infer.cc            |  8 +--
 src/ops/inc_multihead_self_attention.cpp      | 27 +++++-----
 src/ops/inc_multihead_self_attention.cu       | 53 ++++++++++---------
 src/ops/spec_inc_multihead_self_attention.cpp |  2 +-
 src/ops/spec_inc_multihead_self_attention.cu  |  2 +-
 src/runtime/batch_config.cc                   |  2 +-
 src/runtime/inference_manager.cc              |  2 +-
 src/runtime/request_manager.cpp               |  2 +-
 src/runtime/request_manager.cu                | 14 ++---
 14 files changed, 71 insertions(+), 69 deletions(-)

diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index a529411dd..d59274727 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -40,7 +40,7 @@ void FALCON::create_falcon_model(FFModel &ff,
   {
     // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
             ? BatchConfig::max_verify_tokens_per_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
@@ -97,7 +97,7 @@ void FALCON::create_falcon_model(FFModel &ff,
     }
 
     switch (mode) {
-      case BEAM_SEARCH_MODE: {
+      case TREE_SEARCH_MODE: {
         mha = ff.spec_inc_multiquery_self_attention(
             att_norm,
             falcon_config.hidden_size,
@@ -233,7 +233,7 @@ void FALCON::create_falcon_model(FFModel &ff,
                             "lm_head");
 
   Tensor output;
-  if (mode == BEAM_SEARCH_MODE) {
+  if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
     output = ff.argmax(softmax, /*beam_Search*/ true);
   } else {
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 517f53443..8e86d73b5 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -42,7 +42,7 @@ void LLAMA::create_llama_model(FFModel &ff,
   Tensor input;
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
             ? BatchConfig::max_verify_tokens_per_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
@@ -93,7 +93,7 @@ void LLAMA::create_llama_model(FFModel &ff,
 
     Tensor mha;
     switch (mode) {
-      case BEAM_SEARCH_MODE: {
+      case TREE_SEARCH_MODE: {
         mha = ff.spec_inc_multihead_self_attention(
             att_norm,
             llama_config.hidden_size,
@@ -247,7 +247,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                           "output");
 
   Tensor output;
-  if (mode == BEAM_SEARCH_MODE) {
+  if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(dense, -1);
     // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false);
     // output = ff.argmax(softmax, /*beam_Search*/ true);
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index 70e2b5e9c..11845da0e 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -41,7 +41,7 @@ void MPT::create_mpt_model(FFModel &ff,
   Tensor input;
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
             ? BatchConfig::max_verify_tokens_per_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
@@ -94,7 +94,7 @@ void MPT::create_mpt_model(FFModel &ff,
 
     Tensor attn_outputs;
     switch (mode) {
-      case BEAM_SEARCH_MODE: {
+      case TREE_SEARCH_MODE: {
         attn_outputs = ff.spec_inc_multihead_self_attention(
             layernorm_output,
             mpt_config.hidden_size,
@@ -241,7 +241,7 @@ void MPT::create_mpt_model(FFModel &ff,
                             "lm_head");
 
   Tensor output;
-  if (mode == BEAM_SEARCH_MODE) {
+  if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
     output = ff.argmax(softmax, /*beam_Search*/ true);
   } else {
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 5677d5658..0623a941c 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -43,7 +43,7 @@ void OPT::create_opt_model(FFModel &ff,
   ff.set_position_offset(2);
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
             ? BatchConfig::max_verify_tokens_per_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
@@ -102,7 +102,7 @@ void OPT::create_opt_model(FFModel &ff,
 
     Tensor mha;
     switch (mode) {
-      case BEAM_SEARCH_MODE: {
+      case TREE_SEARCH_MODE: {
         mha = ff.spec_inc_multihead_self_attention(
             hidden_states,
             opt_config.hidden_size,
@@ -246,7 +246,7 @@ void OPT::create_opt_model(FFModel &ff,
                             "embed_tokens_weight_lm_head");
 
   Tensor output;
-  if (mode == BEAM_SEARCH_MODE) {
+  if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
     // output = ff.beam_top_k(softmax, opt_config.max_beam_width, false);
     output = ff.argmax(softmax, /*beam_Search*/ true);
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 8b0dc1098..5295d92cb 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -49,7 +49,7 @@ void STARCODER::create_starcoder_model(
   {
     // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE)
+        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
             ? BatchConfig::max_verify_tokens_per_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
@@ -205,7 +205,7 @@ void STARCODER::create_starcoder_model(
                             "lm_head");
 
   Tensor output;
-  if (mode == BEAM_SEARCH_MODE) {
+  if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
     // output = ff.beam_top_k(softmax, startcoder_config.max_beam_width, false);
     output = ff.argmax(softmax, /*beam_Search*/ true);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index f7edfd769..cc5270ee9 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -371,27 +371,27 @@ void FlexFlow::top_level_task(Task const *task,
       LLAMA::create_llama_model(beam_model,
                                 model_metadata.ssm_model_config_paths[ssm_id],
                                 model_metadata.ssm_model_weights_paths[ssm_id],
-                                BEAM_SEARCH_MODE,
+                                TREE_SEARCH_MODE,
                                 generationConfig,
                                 use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) {
       OPT::create_opt_model(beam_model,
                             model_metadata.ssm_model_config_paths[ssm_id],
                             model_metadata.ssm_model_weights_paths[ssm_id],
-                            BEAM_SEARCH_MODE,
+                            TREE_SEARCH_MODE,
                             use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) {
       FALCON::create_falcon_model(
           beam_model,
           model_metadata.ssm_model_config_paths[ssm_id],
           model_metadata.ssm_model_weights_paths[ssm_id],
-          BEAM_SEARCH_MODE,
+          TREE_SEARCH_MODE,
           use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) {
       MPT::create_mpt_model(beam_model,
                             model_metadata.ssm_model_config_paths[ssm_id],
                             model_metadata.ssm_model_weights_paths[ssm_id],
-                            BEAM_SEARCH_MODE,
+                            TREE_SEARCH_MODE,
                             generationConfig,
                             use_full_precision);
     } else {
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index a97b97526..8659f3ec3 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -124,18 +124,19 @@ __global__ void scaling_query_kernel(DT *input_ptr,
 }
 
 template <typename DT>
-__global__ void
-    apply_rotary_embedding_native(DT *input_ptr,
-                                  hipFloatComplex *complex_input,
-                                  /* Reserved: BatchConfig Updated, leave beamsearch to kill */BatchConfig::PerTokenInfo const *tokenInfos,
-                                  int qProjSize,
-                                  int kProjSize,
-                                  int num_q_heads,
-                                  int num_tokens,
-                                  int num_kv_heads,
-                                  int q_block_size,
-                                  int k_block_size,
-                                  int q_array_size) {
+__global__ void apply_rotary_embedding_native(
+    DT *input_ptr,
+    hipFloatComplex *complex_input,
+    /* Reserved: BatchConfig Updated, leave beamsearch to kill */
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    int qProjSize,
+    int kProjSize,
+    int num_q_heads,
+    int num_tokens,
+    int num_kv_heads,
+    int q_block_size,
+    int k_block_size,
+    int q_array_size) {
   CUDA_KERNEL_LOOP(
       i,
       num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
@@ -965,7 +966,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                            BatchConfig::max_sequence_length();
         break;
       }
-      case BEAM_SEARCH_MODE: {
+      case TREE_SEARCH_MODE: {
         key_cache_size = num_q_heads * kProjSize *
                          TreeSearchBatchConfig::max_requests_per_batch() *
                          BatchConfig::max_sequence_length() *
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 510a6e34f..11d2a2543 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -52,7 +52,8 @@ __global__ void compute_attention_kernel_generation_kernel(
     int max_seq_length,
     int per_head_size,
     int hidden_size,
-    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
+    /* Reserved: BatchConfig Updated */
+    BatchConfig::PerRequestInfo *request_infos,
     bool *request_available) {
 
   // q, k
@@ -701,30 +702,30 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-#define LAUNCH_ATTENTION_SCORE_KERNEL(                                          \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)    \
-  do {                                                                          \
-    smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                              \
-                                    BatchConfig::max_sequence_length(),         \
-                                    THREADS_PER_VALUE,                          \
-                                    THDS_PER_BLOCK);                            \
-    compute_attention_kernel_generation_kernel<DT,                              \
-                                              THDS_PER_BLOCK,                   \
-                                              Dh,                               \
-                                              Dh_MAX,                           \
-                                              THDS_PER_KEY,                     \
-                                              THREADS_PER_VALUE>                \
-        <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                            \
-            static_cast<DT *>(m->devQKVProjArray),                              \
-            static_cast<DT *>(m->keyCache),                                     \
-            static_cast<DT *>(m->valueCache),                                   \
-            output_ptr,                                                         \
-            scale,                                                              \
-            BatchConfig::max_sequence_length(),                                 \
-            m->qProjSize,                                                       \
-            m->hidden_size,                                                     \
-            m->request_infos,                                                   \
-            m->request_available);                                              \
+#define LAUNCH_ATTENTION_SCORE_KERNEL(                                         \
+    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)   \
+  do {                                                                         \
+    smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                             \
+                                     BatchConfig::max_sequence_length(),       \
+                                     THREADS_PER_VALUE,                        \
+                                     THDS_PER_BLOCK);                          \
+    compute_attention_kernel_generation_kernel<DT,                             \
+                                               THDS_PER_BLOCK,                 \
+                                               Dh,                             \
+                                               Dh_MAX,                         \
+                                               THDS_PER_KEY,                   \
+                                               THREADS_PER_VALUE>              \
+        <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                           \
+            static_cast<DT *>(m->devQKVProjArray),                             \
+            static_cast<DT *>(m->keyCache),                                    \
+            static_cast<DT *>(m->valueCache),                                  \
+            output_ptr,                                                        \
+            scale,                                                             \
+            BatchConfig::max_sequence_length(),                                \
+            m->qProjSize,                                                      \
+            m->hidden_size,                                                    \
+            m->request_infos,                                                  \
+            m->request_available);                                             \
   } while (0)
 
 template <typename DT>
@@ -1363,7 +1364,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                            BatchConfig::max_sequence_length();
         break;
       }
-      case BEAM_SEARCH_MODE:
+      case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 187ada4d5..439f60296 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -586,7 +586,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
     int _num_q_heads,
     int _num_kv_heads)
     : IncMultiHeadSelfAttentionMeta(handler,
-                                    BEAM_SEARCH_MODE,
+                                    TREE_SEARCH_MODE,
                                     attn,
                                     attn->qSize,
                                     attn->kSize,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index d5cddb15e..a880666f6 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -823,7 +823,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
     int _num_q_heads,
     int _num_kv_heads)
     : IncMultiHeadSelfAttentionMeta(handler,
-                                    BEAM_SEARCH_MODE,
+                                    TREE_SEARCH_MODE,
                                     attn,
                                     attn->qSize,
                                     attn->kSize,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 6fba6eff5..22e1d3889 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -46,7 +46,7 @@ BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) {
   // Check future size
   if (bc->get_mode() == INC_DECODING_MODE) {
     assert(Future(future).get_untyped_size() == sizeof(BatchConfig));
-  } else if (bc->get_mode() == BEAM_SEARCH_MODE) {
+  } else if (bc->get_mode() == TREE_SEARCH_MODE) {
     assert(Future(future).get_untyped_size() == sizeof(TreeSearchBatchConfig));
   } else if (bc->get_mode() == TREE_VERIFY_MODE) {
     assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig));
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index cf8a7aa89..4fcbaa3e3 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -310,7 +310,7 @@ FutureMap InferenceManager::inference(FFModel *model,
   if (bc.get_mode() == INC_DECODING_MODE) {
     BatchConfigFuture bcf = Future::from_value<BatchConfig>(bc);
     return inference(model, index, bcf);
-  } else if (bc.get_mode() == BEAM_SEARCH_MODE) {
+  } else if (bc.get_mode() == TREE_SEARCH_MODE) {
     BatchConfig const *bc_ptr = &bc;
     TreeSearchBatchConfig const *bsbc_ptr =
         static_cast<TreeSearchBatchConfig const *>(bc_ptr);
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index adced990e..4f7f3e82c 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -90,7 +90,7 @@ void RequestManager::load_batch_config_task(
   total_copy_size += sizeof(BatchConfig::requestsInfo);
 
   // load speculative metadata
-  if (batch_config->get_mode() == BEAM_SEARCH_MODE) {
+  if (batch_config->get_mode() == TREE_SEARCH_MODE) {
     TreeSearchBatchConfig const *beam_batch_config =
         static_cast<TreeSearchBatchConfig const *>(batch_config);
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 66305cf69..2f91c89bf 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -96,16 +96,16 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::requestsInfo);
 
-  checkCUDA(cudaMemcpyAsync(
-      static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-      &(batch_config->request_available),
-      sizeof(BatchConfig::request_available),
-      cudaMemcpyHostToDevice,
-      stream));
+  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
+                                total_copy_size,
+                            &(batch_config->request_available),
+                            sizeof(BatchConfig::request_available),
+                            cudaMemcpyHostToDevice,
+                            stream));
   total_copy_size += sizeof(BatchConfig::request_available);
 
   // load speculative metadata
-  if (batch_config->get_mode() == BEAM_SEARCH_MODE) {
+  if (batch_config->get_mode() == TREE_SEARCH_MODE) {
     TreeSearchBatchConfig const *beam_batch_config =
         static_cast<TreeSearchBatchConfig const *>(batch_config);
 

From 8838dc98dc620fb7bdefa8d417d61359a201d9a7 Mon Sep 17 00:00:00 2001
From: Shuhuai Lin <shuhuail@catalyst-cluster.cs.cmu.edu>
Date: Sat, 20 Apr 2024 00:23:27 -0400
Subject: [PATCH 103/667] finish prepare_first_spec_batch_config() and
 prepare_verify_batch_config()

---
 src/runtime/request_manager.cc | 106 +++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1ebe356d8..4cb5852fb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1271,7 +1271,53 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   // Please refer to the implementation of prepare_next_spec_batch_config()
   // for more details.
   TreeSearchBatchConfig new_bc;
+  // Assume that only one small model is in use now -> All the requests in a batch are using the same small model? 
+  new_bc.model_id = 0;
+  new_bc.num_tokens = 0;
+  new_bc.current_depth = 0; // depth of first spec is 0
+  new_bc.num_available_requests = 0;
+  assert(current_speculation_step == 0);
 
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      new_bc.request_available[request_index] = false;
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    new_bc.request_available[request_index] = true;
+    new_bc.num_available_requests++;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch = new_bc.num_tokens;
+    // TODO: check this profiling, what is profiling
+    profiling_requests[request.guid].ssm_decoding_steps += 1;
+
+    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    assert(token_tree.tree_size == 0);
+    assert(token_tree.tree_node_size == 0);
+    // 1. Get committed tokens from committed_tokens
+    std::vector <CommittedToken> &committed_tokens = request.committed_tokens;
+
+    // 2. Maintain all other fields of TreeSearchBatchConfig
+    new_bc.requestsInfo[request_index].first_token_index_in_request = request.tokens.size();
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = committed_tokens.size();
+
+    // 3. Store committed tokens to tokensInfo
+    for (int committed_token_index = 0; committed_token_index < committed_tokens.size(); committed_token_index++) {
+      CommittedToken committed_token = committed_tokens.at(committed_token_index);
+      new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = committed_token.to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
+      new_bc.num_tokens++;
+    }
+    // Copy the causal mask, it should already been updated
+    new_bc.causalMask[request_index] = request.causal_mask;
+  }
+  if (verbose) {
+    std::cout << "prepare_first_spec_batch_config NEW batchconfig:" << std::endl;
+    new_bc.print();
+  }
   return new_bc;
 }
 
@@ -1372,6 +1418,66 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   // TreeSearchBatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config()
   // for more details.
+  TreeVerifyBatchConfig new_bc;
+  new_bc.num_tokens = 0;
+  new_bc.num_available_requests = 0;
+  new_bc.num_tokens_to_commit = 0;
+
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      new_bc.request_available[request_index] = false;
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    new_bc.request_available[request_index] = true;
+    new_bc.num_available_requests++;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch = new_bc.num_tokens;
+    profiling_requests[request.guid].llm_decoding_steps += 1;
+
+    // 1. Maintain requestsInfo.first_token_index_in_request of TreeSearchBatchConfig
+    new_bc.requestsInfo[request_index].first_token_index_in_request = request.tokens.size();
+
+    // 2. Put the information of the committed tokens into TreeVerifyBatchConfig.committed_tokens.
+    std::vector<CommittedToken> committed_tokens = request.committed_tokens;
+    for (int committed_token_index = 0; committed_token_index < committed_tokens.size(); committed_token_index++) {
+      CommittedToken committed_token = committed_tokens.at(committed_token_index);
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = request_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.from_index; // not sure
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.to_index; // not sure
+      new_bc.num_tokens_to_commit++;
+    }
+
+    // 3. Load the tokens on the token tree that are not yet pruned to
+    // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc.
+    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    int token_tree_index = 0;
+    for (std::list<std::shared_ptr<TokenTreeNode>> &tree_layer : token_tree.tree_layers) {
+      for (std::shared_ptr<TokenTreeNode> tree_node : tree_layer) {
+        if (tree_node->pruned == false) {
+          new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = request.tokens.size() + token_tree_index;
+          new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node.id;
+          new_bc.num_tokens++;
+          token_tree_index++;
+        }
+      }
+    }
+
+    // 4. Maintain requestsInfo.num_tokens_in_batch of TreeSearchBatchConfig
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
+
+    // 5. Create the causal mask for the large model based on the small model causal mask.
+    new_bc.causalMask[request_index] = create_llm_bitmask(guid);
+  }
+
+  if (verbose) {
+    std::cout << "prepare_next_batch_verify NEW batchconfig:" << std::endl;
+    new_bc.print();
+  }
+  return new_bc;
 }
 
 void RequestManager::update_llm_verify_results(

From 6c4ca0a854c2d8ace13846431761717bafb8a679 Mon Sep 17 00:00:00 2001
From: Shuhuai Lin <shuhuail@catalyst-cluster.cs.cmu.edu>
Date: Sat, 20 Apr 2024 00:39:10 -0400
Subject: [PATCH 104/667] Remove some useless annotations

---
 src/runtime/request_manager.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4cb5852fb..5dad0f9c6 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1271,10 +1271,10 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   // Please refer to the implementation of prepare_next_spec_batch_config()
   // for more details.
   TreeSearchBatchConfig new_bc;
-  // Assume that only one small model is in use now -> All the requests in a batch are using the same small model? 
+  // Assume that only one small model is in use now
   new_bc.model_id = 0;
   new_bc.num_tokens = 0;
-  new_bc.current_depth = 0; // depth of first spec is 0
+  new_bc.current_depth = 0;
   new_bc.num_available_requests = 0;
   assert(current_speculation_step == 0);
 
@@ -1445,13 +1445,12 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     for (int committed_token_index = 0; committed_token_index < committed_tokens.size(); committed_token_index++) {
       CommittedToken committed_token = committed_tokens.at(committed_token_index);
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = request_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.from_index; // not sure
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.to_index; // not sure
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.from_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.to_index;
       new_bc.num_tokens_to_commit++;
     }
 
-    // 3. Load the tokens on the token tree that are not yet pruned to
-    // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc.
+    // 3. Load the tokens on the token tree that are not yet pruned to TreeVerifyBatchConfig.tokensInfo.
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
     int token_tree_index = 0;
     for (std::list<std::shared_ptr<TokenTreeNode>> &tree_layer : token_tree.tree_layers) {

From 8e0347488c81110fdb5427fa609560b901e8ca21 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 20 Apr 2024 14:01:01 +0800
Subject: [PATCH 105/667] chore: update BatchConfig in ops/arg_topk

---
 src/ops/arg_topk.cpp | 20 ++++----------------
 src/ops/arg_topk.cu  | 26 ++++----------------------
 2 files changed, 8 insertions(+), 38 deletions(-)

diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp
index 665cbe08e..3a60e54fe 100644
--- a/src/ops/arg_topk.cpp
+++ b/src/ops/arg_topk.cpp
@@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
                              int length,
                              int k,
                              bool sorted,
-                             /* Reserved: BatchConfig Updated, leave beamsearch to kill */TreeSearchBatchConfig const *bc,
+                             /* Reserved: BatchConfig Updated */TreeSearchBatchConfig const *bc,
                              hipStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
@@ -398,29 +398,17 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
   size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
   // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
   size_t num_blocks = batch_size;
-  // all requests are in the same beam stages
+  // all requests share the same number of branches
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
 
-    // check
-    int beam_size = -1;
-    for (int i = 1; i < bc->max_requests_per_batch(); i++) {
-      if (!bc->request_available[i]) {
-        continue;
-      } else if (beam_size == -1) {
-        beam_size = bc->beamRequestsInfo[i].beam_size;
-      } else {
-        assert(beam_size == bc->beamRequestsInfo[i].beam_size);
-      }
-    }
-
-    assert(num_shards >= (size_t)beam_size);
+    assert(num_shards >= (size_t)TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
     arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
-        beam_size,
+        TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
         output_ptr,
         indices_ptr,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 8110de0ae..5403494bd 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -379,7 +379,7 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
                              int length,
                              int k,
                              bool sorted,
-                             /* Reserved: BatchConfig Updated, leave beamsearch to kill */TreeSearchBatchConfig const *bc,
+                             /* Reserved: BatchConfig Updated */TreeSearchBatchConfig const *bc,
                              cudaStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
@@ -399,34 +399,16 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
   // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
   size_t num_blocks = batch_size;
 
-  // all requests are in the same beam stages
+  // all requests share the same number of branches
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
-
-    // check
-    // allow last request different with others
-    int beam_size = -1;
-    int num_activate_requests = bc->num_active_requests();
-    int last_request_idx =
-        bc->requestsInfo[num_activate_requests - 1].batch_config_request_id;
-    for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-      if (!bc->request_available[i]) {
-        continue;
-      } else if (beam_size == -1) {
-        beam_size = bc->beamRequestsInfo[i].beam_size;
-
-      } else if (i != last_request_idx) {
-        assert(beam_size == bc->beamRequestsInfo[i].beam_size);
-      } else if (i == last_request_idx) {
-      }
-    }
-    assert(num_shards >= (size_t)beam_size);
+    assert(num_shards >= (size_t)TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
     arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
-        beam_size,
+        TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
         output_ptr,
         indices_ptr,

From 79232a459c822d37c9f4c05e306fee8cceb4caaf Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 20 Apr 2024 11:48:34 -0400
Subject: [PATCH 106/667] Remove unused APIs in TreeSearchBatchConfig.

---
 include/flexflow/batch_config.h         |  2 -
 src/runtime/request_manager.cc          | 64 +++++++++++++++++--------
 src/runtime/tree_search_batch_config.cc |  4 --
 3 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 86c1df872..2bf6863de 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -172,8 +172,6 @@ class TreeSearchBatchConfig : public BatchConfig {
                                   TreeSearchBatchConfig const &bc);
   void print() const;
   void save_to_file(std::string const &filename) const;
-  int current_depth() const;
-  int get_speculative_request_num() const;
 
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
   inline static int const MAX_TREE_DEPTH = 16;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5dad0f9c6..c2f6ca330 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1289,7 +1289,8 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
     new_bc.num_available_requests++;
-    new_bc.requestsInfo[request_index].first_token_offset_in_batch = new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
     // TODO: check this profiling, what is profiling
     profiling_requests[request.guid].ssm_decoding_steps += 1;
 
@@ -1297,17 +1298,23 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     assert(token_tree.tree_size == 0);
     assert(token_tree.tree_node_size == 0);
     // 1. Get committed tokens from committed_tokens
-    std::vector <CommittedToken> &committed_tokens = request.committed_tokens;
+    std::vector<CommittedToken> &committed_tokens = request.committed_tokens;
 
     // 2. Maintain all other fields of TreeSearchBatchConfig
-    new_bc.requestsInfo[request_index].first_token_index_in_request = request.tokens.size();
-    new_bc.requestsInfo[request_index].num_tokens_in_batch = committed_tokens.size();
+    new_bc.requestsInfo[request_index].first_token_index_in_request =
+        request.tokens.size();
+    new_bc.requestsInfo[request_index].num_tokens_in_batch =
+        committed_tokens.size();
 
     // 3. Store committed tokens to tokensInfo
-    for (int committed_token_index = 0; committed_token_index < committed_tokens.size(); committed_token_index++) {
-      CommittedToken committed_token = committed_tokens.at(committed_token_index);
+    for (int committed_token_index = 0;
+         committed_token_index < committed_tokens.size();
+         committed_token_index++) {
+      CommittedToken committed_token =
+          committed_tokens.at(committed_token_index);
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = committed_token.to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+          committed_token.to_index;
       new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
       new_bc.num_tokens++;
     }
@@ -1315,7 +1322,8 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     new_bc.causalMask[request_index] = request.causal_mask;
   }
   if (verbose) {
-    std::cout << "prepare_first_spec_batch_config NEW batchconfig:" << std::endl;
+    std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
+              << std::endl;
     new_bc.print();
   }
   return new_bc;
@@ -1434,30 +1442,43 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
     new_bc.num_available_requests++;
-    new_bc.requestsInfo[request_index].first_token_offset_in_batch = new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
     profiling_requests[request.guid].llm_decoding_steps += 1;
 
-    // 1. Maintain requestsInfo.first_token_index_in_request of TreeSearchBatchConfig
-    new_bc.requestsInfo[request_index].first_token_index_in_request = request.tokens.size();
+    // 1. Maintain requestsInfo.first_token_index_in_request of
+    // TreeSearchBatchConfig
+    new_bc.requestsInfo[request_index].first_token_index_in_request =
+        request.tokens.size();
 
-    // 2. Put the information of the committed tokens into TreeVerifyBatchConfig.committed_tokens.
+    // 2. Put the information of the committed tokens into
+    // TreeVerifyBatchConfig.committed_tokens.
     std::vector<CommittedToken> committed_tokens = request.committed_tokens;
-    for (int committed_token_index = 0; committed_token_index < committed_tokens.size(); committed_token_index++) {
-      CommittedToken committed_token = committed_tokens.at(committed_token_index);
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index = request_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index = committed_token.from_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = committed_token.to_index;
+    for (int committed_token_index = 0;
+         committed_token_index < committed_tokens.size();
+         committed_token_index++) {
+      CommittedToken committed_token =
+          committed_tokens.at(committed_token_index);
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
+          request_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
+          committed_token.from_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+          committed_token.to_index;
       new_bc.num_tokens_to_commit++;
     }
 
-    // 3. Load the tokens on the token tree that are not yet pruned to TreeVerifyBatchConfig.tokensInfo.
+    // 3. Load the tokens on the token tree that are not yet pruned to
+    // TreeVerifyBatchConfig.tokensInfo.
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
     int token_tree_index = 0;
-    for (std::list<std::shared_ptr<TokenTreeNode>> &tree_layer : token_tree.tree_layers) {
+    for (std::list<std::shared_ptr<TokenTreeNode>> &tree_layer :
+         token_tree.tree_layers) {
       for (std::shared_ptr<TokenTreeNode> tree_node : tree_layer) {
         if (tree_node->pruned == false) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = request.tokens.size() + token_tree_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+              request.tokens.size() + token_tree_index;
           new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node.id;
           new_bc.num_tokens++;
           token_tree_index++;
@@ -1468,7 +1489,8 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     // 4. Maintain requestsInfo.num_tokens_in_batch of TreeSearchBatchConfig
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
-    // 5. Create the causal mask for the large model based on the small model causal mask.
+    // 5. Create the causal mask for the large model based on the small model
+    // causal mask.
     new_bc.causalMask[request_index] = create_llm_bitmask(guid);
   }
 
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index ff3308da3..6f78fb71e 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -46,10 +46,6 @@ InferenceMode TreeSearchBatchConfig::get_mode() const {
   return TREE_SEARCH_MODE;
 }
 
-int TreeSearchBatchConfig::get_speculative_request_num() const {
-  return speculative_request_num;
-}
-
 std::ostream &
     operator<<(std::ostream &os,
                TreeSearchBatchConfig const &tree_search_batch_config) {

From a1c93f6d561e8a563f04c42fce0dd36d72313b9b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 20 Apr 2024 12:37:38 -0400
Subject: [PATCH 107/667] Small modifications on prepare_verify_batch_config
 and prepare_first_spec_batch_config.

---
 src/runtime/request_manager.cc | 48 +++++++++++++++-------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c2f6ca330..d6d9ceb39 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1289,20 +1289,17 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
     new_bc.num_available_requests++;
-    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-        new_bc.num_tokens;
     // TODO: check this profiling, what is profiling
     profiling_requests[request.guid].ssm_decoding_steps += 1;
 
-    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
-    assert(token_tree.tree_size == 0);
-    assert(token_tree.tree_node_size == 0);
-    // 1. Get committed tokens from committed_tokens
-    std::vector<CommittedToken> &committed_tokens = request.committed_tokens;
+    std::vector<Request::CommittedToken> &committed_tokens =
+        request.committed_tokens;
 
-    // 2. Maintain all other fields of TreeSearchBatchConfig
+    // 2. Maintain requestsInfo
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
     new_bc.requestsInfo[request_index].first_token_index_in_request =
-        request.tokens.size();
+        request.tokens.size() - committed_tokens.size();
     new_bc.requestsInfo[request_index].num_tokens_in_batch =
         committed_tokens.size();
 
@@ -1310,7 +1307,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     for (int committed_token_index = 0;
          committed_token_index < committed_tokens.size();
          committed_token_index++) {
-      CommittedToken committed_token =
+      Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
       new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
@@ -1318,7 +1315,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
       new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
       new_bc.num_tokens++;
     }
-    // Copy the causal mask, it should already been updated
+    // 4. Copy the causal mask, it should already been updated
     new_bc.causalMask[request_index] = request.causal_mask;
   }
   if (verbose) {
@@ -1442,23 +1439,20 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
     new_bc.num_available_requests++;
-    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-        new_bc.num_tokens;
+    // TODO: check this profiling
     profiling_requests[request.guid].llm_decoding_steps += 1;
 
-    // 1. Maintain requestsInfo.first_token_index_in_request of
-    // TreeSearchBatchConfig
+    // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size();
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
 
     // 2. Put the information of the committed tokens into
     // TreeVerifyBatchConfig.committed_tokens.
-    std::vector<CommittedToken> committed_tokens = request.committed_tokens;
-    for (int committed_token_index = 0;
-         committed_token_index < committed_tokens.size();
-         committed_token_index++) {
-      CommittedToken committed_token =
-          committed_tokens.at(committed_token_index);
+    std::vector<Request::CommittedToken> &committed_tokens =
+        request.committed_tokens;
+    for (auto const &committed_token : committed_tokens) {
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
@@ -1470,16 +1464,15 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 
     // 3. Load the tokens on the token tree that are not yet pruned to
     // TreeVerifyBatchConfig.tokensInfo.
-    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    TokenTree &token_tree = request.speculative_token_trees[0];
     int token_tree_index = 0;
-    for (std::list<std::shared_ptr<TokenTreeNode>> &tree_layer :
-         token_tree.tree_layers) {
-      for (std::shared_ptr<TokenTreeNode> tree_node : tree_layer) {
+    for (auto const &tree_layer : token_tree.tree_layers) {
+      for (auto const &tree_node : tree_layer) {
         if (tree_node->pruned == false) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.tokens.size() + token_tree_index;
-          new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node.id;
+          new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
           new_bc.num_tokens++;
           token_tree_index++;
         }
@@ -1487,7 +1480,8 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     }
 
     // 4. Maintain requestsInfo.num_tokens_in_batch of TreeSearchBatchConfig
-    new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
+    new_bc.requestsInfo[request_index].num_tokens_in_batch =
+        token_tree_index + 1;
 
     // 5. Create the causal mask for the large model based on the small model
     // causal mask.

From 85a0a4ffc311c34c7354469b1e1613d2708125c4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 22 Apr 2024 14:39:25 +0800
Subject: [PATCH 108/667] feat: add current_phase field in BatchConfig, to
 indicate which phase of kernel

---
 include/flexflow/batch_config.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 86c1df872..79fc5bd72 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -63,6 +63,13 @@ class BatchConfig {
   int num_tokens;
   int num_available_requests;
 
+  enum class ExecutionPhase {
+    PROMPT,
+    GENERATION
+  };
+
+  ExecutionPhase current_phase;
+
   struct PerRequestInfo {
     int first_token_index_in_request;
     int first_token_offset_in_batch;

From 857576649db7cbb938b14b9144c34bd2f6c4998d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 22 Apr 2024 14:41:36 +0800
Subject: [PATCH 109/667] update BatchConfig in
 ops/inc_multihead_self_attention

---
 src/ops/inc_multihead_self_attention.cpp |  6 +++---
 src/ops/inc_multihead_self_attention.cu  | 10 ++++------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 8659f3ec3..b74225bee 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -127,7 +127,7 @@ template <typename DT>
 __global__ void apply_rotary_embedding_native(
     DT *input_ptr,
     hipFloatComplex *complex_input,
-    /* Reserved: BatchConfig Updated, leave beamsearch to kill */
+    /* Reserved: BatchConfig Updated */
     BatchConfig::PerTokenInfo const *tokenInfos,
     int qProjSize,
     int kProjSize,
@@ -970,11 +970,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         key_cache_size = num_q_heads * kProjSize *
                          TreeSearchBatchConfig::max_requests_per_batch() *
                          BatchConfig::max_sequence_length() *
-                         TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+                         TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
         value_cache_size = num_q_heads * vProjSize *
                            TreeSearchBatchConfig::max_requests_per_batch() *
                            BatchConfig::max_sequence_length() *
-                           TreeSearchBatchConfig::MAX_BEAM_WIDTH;
+                           TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
         break;
       }
       default:
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 11d2a2543..1af539852 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -837,14 +837,12 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      stream);
   update_kv_cache_kernel<DT>(m, bc, stream);
 
-  if (bc->num_tokens > 0) {
+  if (bc->current_phase == BatchConfig::ExecutionPhase::GENERATION) {
     // phase 3: Compute attention score for generation tokens
     compute_attention_kernel_generation<DT>(
         m, bc, static_cast<DT *>(m->attn_heads), stream);
-  }
-
-  if (bc->num_tokens > bc->num_tokens) {
-    // phase 4: Compute attention score for prompt tokens;
+  } else if (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT) {
+    // phase 3: Compute attention score for prompt tokens;
     compute_attention_kernel_prompt(
         m, bc, shard_id, bias_ptr, weight_ptr, stream);
   }
@@ -941,7 +939,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i] || (!bc->requestsInfo[i].prompt_phase)) {
+    if (!bc->request_available[i]) {
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;

From 99d7dd12a4d54340c55b7068e02859cf70db4f89 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 22 Apr 2024 19:19:59 +0800
Subject: [PATCH 110/667] chore: update BatchConfig in
 ops/spec_inc_multihead_attention

---
 .../ops/spec_inc_multihead_self_attention.h   |   5 +-
 src/ops/spec_inc_multihead_self_attention.cc  |   2 +-
 src/ops/spec_inc_multihead_self_attention.cpp |   2 +-
 src/ops/spec_inc_multihead_self_attention.cu  | 100 +++++++-----------
 4 files changed, 42 insertions(+), 67 deletions(-)

diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index 6ba52fe52..1e046bb26 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -139,10 +139,7 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
   ~SpecIncMultiHeadSelfAttentionMeta(void);
 
 public:
-  Realm::RegionInstance beam_search_reserve_inst;
-  TreeSearchBatchConfig::BeamSearchPerTokenInfo *beam_token_infos;
-  TreeSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos;
-  bool *request_completed;
+  Realm::RegionInstance tree_search_reserve_inst;
   BatchConfig::BitMask *causalMask;
 };
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index dc23f4525..08b0a2ac9 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -673,7 +673,7 @@ void SpecIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap SpecIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 439f60296..c56268b9d 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -36,7 +36,7 @@ __global__ void spec_store_kv_cache(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    BatchConfig::PerTokenInfo *tokenInfos,
+    /* Reserved: BatchConfig, leave HIP code to be updated */BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
     TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
     TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index a880666f6..690ab89bd 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -48,10 +48,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     int const max_seq_length,
     int per_head_size,
     int hidden_size,
-    BatchConfig::PerRequestInfo *request_infos,
-    TreeSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos,
+    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
     BatchConfig::BitMask *causalMask,
-    bool *request_completed) {
+    bool *request_available) {
 
   // q, k
   using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
@@ -74,30 +73,37 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   int const request_idx = blockIdx.y;
 
   // request id in batch config
-  int const batch_config_request_id =
-      request_infos[request_idx].batch_config_request_id;
+  int requext_idx_in_batch = 0;
+  for (int i = 0; i < request_idx; i++) {
+    while (!request_available[requext_idx_in_batch]) {
+      requext_idx_in_batch++;
+    }
+  }
+
+  // threads converge
+  __syncthreads();
 
   // request_idx = re
 
-  BatchConfig::BitMask bitmask = causalMask[batch_config_request_id];
+  BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
 
   int const first_step = 0;
 
   // int const tlength =
-  //     request_infos[batch_config_request_id].first_token_depth_in_request +
-  //     request_infos[batch_config_request_id].num_tokens_in_batch;
+  //     request_infos[requext_idx_in_batch].first_token_depth_in_request +
+  //     request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
   int const totalCacheSize =
       bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1;
 
   int first_token_idx = 0;
-  for (int r = 0; r < batch_config_request_id; r++) {
+  for (int r = 0; r < requext_idx_in_batch; r++) {
     first_token_idx +=
-        request_completed[r] ? 0 : causalMask[r].current_layer_size;
+        request_available[r] ? causalMask[r].current_layer_size : 0;
   }
 
   int const tree_branch_num =
-      beam_request_infos[batch_config_request_id].sub_request_num;
+      beam_request_infos[requext_idx_in_batch].sub_request_num;
 
   // shared memory objects
   extern __shared__ char smem_[];
@@ -128,7 +134,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
 
   DT const *k_cache_batch =
-      key_cache + batch_config_request_id * max_seq_length * hidden_size + ki;
+      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
 
   int ti_end =
       div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step;
@@ -169,7 +175,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 
         // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) {
         //   printf("spec inc attn qkqkqk  request id %d,  %.10f, %d\n",
-        //          batch_config_request_id,
+        //          requext_idx_in_batch,
         //          ti,
         //          qk,
         //          qi);
@@ -252,7 +258,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 
     // The base pointer for the value in the cache buffer.
     DT const *v_cache_batch =
-        value_cache + batch_config_request_id * max_seq_length * hidden_size +
+        value_cache + requext_idx_in_batch * max_seq_length * hidden_size +
         vi;
 
     if (Dh == Dh_MAX || vi < Dh) {
@@ -316,8 +322,6 @@ __global__ void spec_inc_store_kv_cache(
     DT *vCache_ptr,
     BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
-    TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
-    TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
     BatchConfig::BitMask *causalMask,
     int qProjSize,
     int kProjSize,
@@ -360,10 +364,10 @@ __global__ void spec_inc_store_kv_cache(
 
 template <typename DT>
 void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                            BeamSearchBatchConfig const *bc,
+                            TreeSearchBatchConfig const *bc,
                             cudaStream_t stream) {
   int num_tokens = bc->num_active_tokens();
-  int curr_depth = bc->beamRequestsInfo[0].current_depth;
+  int curr_depth = bc->current_depth;
   if (num_tokens > 0) {
     int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens;
     spec_inc_store_kv_cache<<<GET_BLOCKS(parallelism),
@@ -375,8 +379,6 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
         static_cast<DT *>(m->valueCache),
         m->token_infos,
         m->request_infos,
-        m->beam_token_infos,
-        m->beam_request_infos,
         m->causalMask,
         m->qProjSize,
         m->kProjSize,
@@ -413,19 +415,18 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
           m->qProjSize,                                                        \
           m->hidden_size,                                                      \
           m->request_infos,                                                    \
-          m->beam_request_infos,                                               \
           m->causalMask,                                                       \
-          m->request_completed)
+          m->request_available)
 
 template <typename DT>
 void compute_spec_inc_attention_kernel_generation(
     SpecIncMultiHeadSelfAttentionMeta const *m,
-    BeamSearchBatchConfig const *bc,
+    TreeSearchBatchConfig const *bc,
     DT *output_ptr,
     cudaStream_t stream) {
   // one block == one head per request
   // how many generation requests
-  dim3 grid(m->num_q_heads, bc->get_speculative_request_num());
+  dim3 grid(m->num_q_heads, bc->num_active_requests());
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   size_t smem_sz;
@@ -461,7 +462,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix,
 
 template <typename DT>
 void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
-                                     BeamSearchBatchConfig const *bc,
+                                     TreeSearchBatchConfig const *bc,
                                      int shard_id,
                                      DT *output_ptr,
                                      DT const *bias_ptr,
@@ -502,16 +503,13 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) ||
+    if (!bc->request_available[i] ||
         (bc->requestsInfo[i].num_tokens_in_batch == 0)) {
       continue;
-    } else if (tokens_previous_requests < bc->num_generation_tokens) {
-      tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch;
-      continue;
     }
 
     // all requests in prompt phase should only have one sub requests;
-    assert(bc->sub_requests[i] == 1);
+    // assert(bc->sub_requests[i] == 1);
     // int num_new_tokens = bc->num_processing_tokens[i];
     // int total_tokens = bc->token_last_available_idx[i] + 1;
 
@@ -696,13 +694,12 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
     tokens_prev_requests_squares += num_new_tokens * total_tokens;
   }
 
-  if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) {
+  if (tokens_previous_requests != num_tokens) {
     bc->print();
     printf("tokens_previous_requests: %i\n", tokens_previous_requests);
     printf("num_tokens: %i\n", num_tokens);
-    printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens);
   }
-  assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens));
+  assert(tokens_previous_requests == num_tokens);
 }
 
 template <typename DT>
@@ -726,16 +723,17 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                      stream);
   // phase 2: Update key/val cache
   update_kv_cache_kernel<DT>(m, bc, stream);
-  if (bc->num_generation_tokens > 0) {
-    compute_spec_inc_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
-  }
+
   // phase 3: Compute attention score
   // 3 kernels for pahse 3: matmul1 - softmax - matmal2
-  if (bc->num_tokens > bc->num_generation_tokens) {
+  if (bc->current_phase == BatchConfig::ExecutionPhase::GENERATION) {
+    compute_spec_inc_attention_kernel_generation<DT>(
+        m, bc, static_cast<DT *>(m->attn_heads), stream);
+  } else if (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT) {
     compute_attention_kernel_prompt(
         m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
   }
+
   // compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
 
@@ -854,38 +852,18 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    beam_token_infos =
-        reinterpret_cast<BeamSearchBatchConfig::BeamSearchPerTokenInfo *>(
-            reinterpret_cast<char *>(handler.batch_config_metadata) +
-            sizeof(BatchConfig::tokensInfo) +
-            sizeof(BatchConfig::requestsInfo));
-
-    beam_request_infos =
-        reinterpret_cast<BeamSearchBatchConfig::BeamSearchPerRequestInfo *>(
-            reinterpret_cast<char *>(handler.batch_config_metadata) +
-            sizeof(BatchConfig::tokensInfo) +
-            sizeof(BatchConfig::requestsInfo) +
-            sizeof(BeamSearchBatchConfig::beamTokenInfo));
     causalMask = reinterpret_cast<BatchConfig::BitMask *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BeamSearchBatchConfig::beamTokenInfo) +
-        sizeof(BeamSearchBatchConfig::beamRequestsInfo));
-
-    request_completed = reinterpret_cast<bool *>(
-        reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BeamSearchBatchConfig::beamTokenInfo) +
-        sizeof(BeamSearchBatchConfig::beamRequestsInfo) +
-        sizeof(BatchConfig::causalMask));
+        sizeof(BatchConfig::request_available));
   }
 
   cudaStreamSynchronize(stream);
 }
 
 SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) {
-  if (beam_search_reserve_inst != Realm::RegionInstance::NO_INST) {
-    beam_search_reserve_inst.destroy();
+  if (tree_search_reserve_inst != Realm::RegionInstance::NO_INST) {
+    tree_search_reserve_inst.destroy();
   }
 }
 

From 7d3e857f2f3d78e3bf90bbd0b0de0c61bc2e52ba Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 22 Apr 2024 19:56:13 -0400
Subject: [PATCH 111/667] Changed the name of BatchConfig::BitMask.tree_size to
 BatchConfig::BitMask.tree_and_prompt_size and removed
 BatchConfig::BitMask.prompt_size.

---
 include/flexflow/batch_config.h              | 12 +++++------
 src/ops/spec_inc_multihead_self_attention.cu | 21 ++++++++++++++------
 src/runtime/request_manager.cc               | 16 +++++++--------
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 2bf6863de..8a03cddb8 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -111,17 +111,17 @@ class BatchConfig {
     // the number of generated tokens before the speculation tree (excluding the
     // prompt tokens)
     int non_tree_cache_size = 0;
-    // current tree size
-    int tree_size = 0;
+    // Tree size or prompt size. Because the prefilling phase and the decoding
+    // phase are separated, we only need one field to store the size of the tree
+    // or the prompt.
+    int tree_or_prompt_size = 0;
     int current_layer_size = 0;
-    // input length-> prompt/root
-    int prompt_size = 0;
+
     BitMask() = default;
     BitMask(BitMask const &other) {
       non_tree_cache_size = other.non_tree_cache_size;
-      tree_size = other.tree_size;
+      tree_or_prompt_size = other.tree_or_prompt_size;
       current_layer_size = other.current_layer_size;
-      prompt_size = other.prompt_size;
       for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
         bit_mask[i] = other.bit_mask[i];
       }
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index a880666f6..bf373cf78 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -87,8 +87,11 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   //     request_infos[batch_config_request_id].first_token_depth_in_request +
   //     request_infos[batch_config_request_id].num_tokens_in_batch;
 
+  //   int const totalCacheSize = bitmask.non_tree_cache_size +
+  //                              bitmask.tree_or_prompt_size +
+  //                              bitmask.prompt_size - 1;
   int const totalCacheSize =
-      bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1;
+      bitmask.non_tree_cache_size + bitmask.tree_or_prompt_size;
 
   int first_token_idx = 0;
   for (int r = 0; r < batch_config_request_id; r++) {
@@ -141,8 +144,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
           ii * THREADS_PER_KEY * K_VEC_SIZE);
     }
 
-    int const query_token =
-        bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi;
+    // int const query_token = bitmask.prompt_size + bitmask.tree_or_prompt_size
+    // -
+    //                         1 - tree_branch_num + qi;
+    int const query_token = bitmask.tree_or_prompt_size - tree_branch_num + qi;
 
     __syncthreads();
     for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
@@ -347,9 +352,13 @@ __global__ void spec_inc_store_kv_cache(
     // if prompt token -> token id
     // if tree token:
 
-    int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size +
-                          bitmask.tree_size - 1 - bitmask.current_layer_size +
-                          token_idx - request_token_offset;
+    // int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size +
+    //                       bitmask.tree_or_prompt_size - 1 -
+    //                       bitmask.current_layer_size + token_idx -
+    //                       request_token_offset;
+    int const cache_idx =
+        bitmask.non_tree_cache_size + bitmask.tree_or_prompt_size -
+        bitmask.current_layer_size + token_idx - request_token_offset;
 
     kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
                offset] = kVal;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d6d9ceb39..b8b915a37 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1595,7 +1595,7 @@ void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
   // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
   // t4: 0000000..1000
   bitmask.non_tree_cache_size = 0;
-  bitmask.tree_size = 1;
+  bitmask.tree_or_prompt_size = 1;
 
   bitmask.prompt_size = initLength;
   bitmask.layer_size = initLength;
@@ -1620,7 +1620,7 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
   //           << bitmask.non_tree_cache_size << "\n";
 
   bitmask.non_tree_cache_size = non_tree_size + initLength - 1;
-  bitmask.tree_size = 1;
+  bitmask.tree_or_prompt_size = 1;
   bitmask.layer_size = initLength;
   // std::cout << "non_tree_size: " << non_tree_size << "\n";
   bitmask.prompt_size = 1;
@@ -1645,7 +1645,7 @@ void RequestManager::init_bitmask(RequestGuid guid, int prompt_length) {
   // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
-  bitmask.tree_size = 0;
+  bitmask.tree_or_prompt_size = 0;
   bitmask.current_layer_size = 0;
   bitmask.prompt_size = prompt_length;
   bitmask.non_tree_cache_size = 0;
@@ -1661,7 +1661,7 @@ void RequestManager::update_bitmask(RequestGuid guid,
   // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
-  bitmask.tree_size = 0;
+  bitmask.tree_or_prompt_size = 0;
   bitmask.current_layer_size = 0;
   bitmask.non_tree_cache_size += num_committed_tokens;
 }
@@ -1686,11 +1686,11 @@ void RequestManager::append_bitmask(RequestGuid guid) {
           .tree_layers[current_speculation_step - 1];
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
-  int previous_tree_size = bitmask.tree_size;
+  int previous_tree_size = bitmask.tree_or_prompt_size;
   bitmask.current_layer_size = new_layer_size;
-  bitmask.tree_size += new_layer_size;
+  bitmask.tree_or_prompt_size += new_layer_size;
 
-  assert(bitmask.tree_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+  assert(bitmask.tree_or_prompt_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
 
   int parent_offset = previous_tree_size - last_layer_size;
   int child_offset = previous_tree_size;
@@ -1731,7 +1731,7 @@ void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
   // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
   // t4: 0000000..1000
   bitmask.non_tree_cache_size = 0;
-  bitmask.tree_size = 1;
+  bitmask.tree_or_prompt_size = 1;
   bitmask.prompt_size += initLength;
   bitmask.layer_size = initLength;
 

From 414d21f48601b6771a31eae3d44bcc1a2ceb4f04 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 22 Apr 2024 20:00:36 -0400
Subject: [PATCH 112/667] Remove unused code.

---
 src/runtime/request_manager.cc | 727 ---------------------------------
 1 file changed, 727 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b8b915a37..b2b03afc8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -527,733 +527,6 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 }
 /* ----- Speculative Inference Specific functions ----- */
 
-// TO BE REMOVED: START
-TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-  if (verbose) {
-    std::cout << "\n############### prepare_first_spec_batch_config "
-                 "##############\n";
-  }
-  // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do
-  // this request by request. The infomation of the committed tokens are
-  // stored in Request.ssm_committed_tokens. Put the information of the
-  // committed tokens into BatchConfig.TokensInfo.
-  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
-  // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config()
-  // for more details.
-
-  // Step 1: use result to update requests
-  TreeSearchBatchConfig new_bc;
-  new_bc.num_tokens = 0;
-  new_bc.model_id = model_id;
-  int result_index = 0;
-
-  int num_generation_tokens = 0;
-  int num_active_req = -1;
-
-  for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) {
-    if (old_bc.request_available[i]) {
-      continue;
-    }
-    size_t guid = old_bc.requestsInfo[i].request_guid;
-    Request &request = all_requests[guid];
-
-    std::cout << "[ " << guid << " ]" << std::endl;
-
-    // Verify this: get verified tokens from result
-    std::vector<std::pair<BatchConfig::TokenId, int>> tree_outputs =
-        std::vector<std::pair<BatchConfig::TokenId, int>>();
-
-    assert(old_bc.num_tokens > 0);
-
-    // reset committed_tokens
-    if (committed_tokens.count(guid) == 0) {
-      committed_tokens[guid] = {};
-    } else {
-      committed_tokens[guid].clear();
-    }
-
-    // iterate through all the tokens that belong to request i
-    int root_abs_depth = request.tokens.size() - 1;
-
-    while (result_index < old_bc.num_tokens &&
-           old_bc.tokensInfo[result_index].request_index == i) {
-      int abs_depth = old_bc.tokensInfo[result_index].abs_index_in_request;
-      int token_id = result.token_ids[result_index];
-
-      if (request.status == Request::PENDING) {
-        committed_tokens[guid].emplace_back(abs_depth, result_index);
-      } else if (abs_depth >= root_abs_depth) {
-        tree_outputs.emplace_back(token_id, abs_depth + 1);
-        // std::cout << "committred tokens push: " << abs_depth
-        //           << " ,result index: " << result_index << "\n";
-        committed_tokens[guid].emplace_back(abs_depth, result_index);
-
-        if (verbose) {
-          std::cout << "Index within old batch: " << result_index << std::endl;
-          printf("  Input: [%d] %d ---> [%d] %d \n",
-                 abs_depth,
-                 old_bc.tokensInfo[result_index].token_id,
-                 tree_outputs.back().second,
-                 token_id);
-        }
-        // std::cout << "Index within old batch: " << result_index <<
-        // std::endl; printf("  Input: [%d] %d ---> [%d] %d \n",
-        //        abs_depth,
-        //        old_bc.tokensInfo[result_index].token_id,
-        //        tree_outputs.back().second,
-        //        token_id);
-      }
-      result_index++;
-    }
-
-    if (request.status == Request::RUNNING) {
-
-      std::vector<std::pair<BatchConfig::TokenId, int>> verified_tokens =
-          traverse_verify_tree(guid, dfs_tree_inputs.at(guid), tree_outputs);
-
-      log_req_mgr.print("Number of Verified Tokens = %zu",
-                        verified_tokens.size());
-      // check if the request is finished
-      if (verified_tokens.size() + request.tokens.size() >=
-          request.max_sequence_length) {
-        // Append all verified tokens to the request
-        for (auto const &token_pair : verified_tokens) {
-          if (token_pair.second < request.max_sequence_length) {
-            request.tokens.push_back(token_pair.first);
-          }
-        }
-        log_req_mgr.print("[Done] guid(%zu) with final length(%zu)",
-                          request.guid,
-                          request.tokens.size());
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        // Unlike Huggingface, the sentencepiece C++ library automatically
-        // removes the BOS token
-        if (model_type == ModelType::LLAMA &&
-            request.tokens.at(0) == bos_token_id) {
-          output = "<s> " + output;
-        }
-        {
-          // update generation result
-          GenerationResult &gr = request_generation_results[request.guid];
-          assert(gr.guid == request.guid);
-          gr.output_tokens = request.tokens;
-          gr.output_text = output;
-        }
-        request.status = Request::COMPLETED;
-        trigger_request_completion_future(request.guid);
-        log_req_mgr.print("Final output: %s", output.c_str());
-
-        new_bc.request_available[i] = true;
-        new_bc.request_running[i] = false;
-        num_processed_requests++;
-
-        // Log profiling info
-        ProfileInfo profile_info = profiling_requests[request.guid];
-        profile_info.finish_time = Realm::Clock::current_time_in_microseconds();
-        profile_info.ssm_decoding_steps = 0;
-        total_request_run_time +=
-            profile_info.finish_time - profile_info.start_time;
-        profiling_requests[request.guid] = profile_info;
-        log_req_mgr.print(
-            "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) "
-            "finish(%.1lf) latency(%.1lf)",
-            request.guid,
-            profile_info.llm_decoding_steps,
-            profile_info.start_time,
-            profile_info.finish_time,
-            profile_info.finish_time - profile_info.start_time);
-
-        // Write output to file if needed:
-        if (!output_filepath.empty()) {
-          std::ofstream outputFile(output_filepath, std::ios::app);
-          if (outputFile.is_open()) {
-            outputFile << "end-to-end latency: " << std::fixed
-                       << std::setprecision(3) << total_request_run_time
-                       << std::endl;
-            outputFile << "num decoding steps: "
-                       << profile_info.llm_decoding_steps << std::endl;
-            outputFile << "token IDs: ";
-            for (int i = 0; i < request.tokens.size(); i++) {
-              outputFile << request.tokens[i];
-              if (i < request.tokens.size() - 1) {
-                outputFile << ",";
-              }
-            }
-            outputFile << std::endl;
-            outputFile << output;
-
-            outputFile.close();
-          } else {
-            std::cout << "Unable to open the output file: " << output_filepath
-                      << std::endl;
-            assert(false);
-          }
-        }
-
-        // delete the old input tree from cache
-        dfs_tree_inputs.erase(request.guid);
-
-      } else { // Request not finished, pass verified_tokens to next iteration
-
-        new_bc.request_available[i] = false;
-        new_bc.request_running[i] = true;
-        num_active_req++;
-
-        // Normal Request Info
-        new_bc.requestsInfo[i].first_token_index_in_request =
-            verified_tokens.front().second;
-        new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-        new_bc.requestsInfo[i].request_guid =
-            old_bc.requestsInfo[i].request_guid;
-        new_bc.requestsInfo[i].max_sequence_length =
-            old_bc.requestsInfo[i].max_sequence_length;
-        new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size();
-        new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-        // TODO: Beam Request Info, missing from VerifyTreeBatchConfig
-        int new_max_depth =
-            new_bc.requestsInfo[i].max_sequence_length -
-            new_bc.requestsInfo[i].first_token_index_in_request -
-            verified_tokens.size();
-        new_bc.beamRequestsInfo[i].current_depth = 1;
-
-        profiling_requests[request.guid].ssm_decoding_steps = 0;
-        new_bc.requestsInfo[i].prompt_phase = true;
-
-        int ssm_decoding_steps = 0;
-        new_bc.beamRequestsInfo[i].beam_size =
-            spec_infer_tree_width.size() > ssm_decoding_steps
-                ? spec_infer_tree_width[ssm_decoding_steps]
-                : 1;
-        new_bc.beamRequestsInfo[i].max_depth =
-            std::min(new_max_depth, TreeSearchBatchConfig::MAX_BEAM_DEPTH);
-        for (int j = 0;
-             j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-             j++) {
-          new_bc.beamRequestsInfo[i].parent_id[j] = 0;
-          new_bc.beamRequestsInfo[i].probs[j] = 1;
-        }
-
-        new_bc.beamRequestsInfo[i].sub_request_num = 1;
-
-        new_bc.sub_requests[i] = 1;
-
-        update_bitmask(new_bc.causalMask[i],
-                       verified_tokens.size(),
-                       request.tokens.size());
-
-        // Token Info
-        for (int j = 0; j < verified_tokens.size(); j++) {
-          auto token = verified_tokens.at(j);
-
-          // Normal Token Info
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              token.second;
-
-          // Beam Token Info
-          new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0;
-          new_bc.num_tokens++;
-
-          // Add verified token to request's token list
-          request.tokens.push_back(token.first);
-
-          if (new_bc.num_tokens == get_max_tokens_per_batch()) {
-            break;
-          }
-        }
-
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        // Unlike Huggingface, the sentencepiece C++ library automatically
-        // removes the BOS token
-        if (model_type == ModelType::LLAMA &&
-            request.tokens.at(0) == bos_token_id) {
-          output = "<s> " + output;
-        }
-        log_req_mgr.print("Output: %s", output.c_str());
-      }
-
-    } else if (request.status == Request::PENDING) {
-      new_bc.request_available[i] = false;
-      new_bc.request_running[i] = false;
-      num_active_req++;
-
-      std::cout << "ssm_cache_size: " << request.ssm_cache_size << ", "
-                << "initial_len: " << request.initial_len << std::endl;
-      assert(request.ssm_cache_size == request.initial_len);
-
-      // Normal Request Info
-      new_bc.requestsInfo[i].first_token_index_in_request =
-          request.ssm_cache_size;
-      new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-      new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid;
-      new_bc.requestsInfo[i].max_sequence_length =
-          old_bc.requestsInfo[i].max_sequence_length;
-      new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-      new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-      // TODO: Beam Request Info, missing from VerifyTreeBatchConfig
-      new_bc.beamRequestsInfo[i].current_depth = 1;
-      int ssm_decoding_steps =
-          profiling_requests[request.guid].ssm_decoding_steps;
-      new_bc.beamRequestsInfo[i].beam_size =
-          spec_infer_tree_width.size() > ssm_decoding_steps
-              ? spec_infer_tree_width[ssm_decoding_steps]
-              : 1;
-      new_bc.beamRequestsInfo[i].max_depth = 0;
-      for (int j = 0; j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-           j++) {
-        new_bc.beamRequestsInfo[i].parent_id[j] = 0;
-        new_bc.beamRequestsInfo[i].probs[j] = 1;
-      }
-
-      new_bc.beamRequestsInfo[i].sub_request_num = 1;
-
-      new_bc.sub_requests[i] = 1;
-
-      // Token Info
-      std::string output = this->tokenizer_->Decode(request.tokens);
-      // Unlike Huggingface, the sentencepiece C++ library automatically
-      // removes the BOS token
-      if (model_type == ModelType::LLAMA &&
-          request.tokens.at(0) == bos_token_id) {
-        output = "<s> " + output;
-      }
-      log_req_mgr.print("Output: %s", output.c_str());
-    } else {
-      assert(false);
-    }
-  }
-
-  // Step 2: Initialize new request
-  for (int i = 0; i < TreeSearchBatchConfig::max_requests_per_batch(); i++) {
-    if (new_bc.request_available[i]) {
-      if (!pending_request_queue.empty() &&
-          new_bc.num_tokens < get_max_tokens_per_batch()) {
-        Request new_request = pending_request_queue.front();
-        pending_request_queue.pop();
-        // all_requests[new_request.guid] = new_request;
-        num_active_req++;
-        new_bc.requestsInfo[i].first_token_index_in_request = 0;
-        new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-        new_bc.requestsInfo[i].request_guid = new_request.guid;
-        new_bc.requestsInfo[i].num_tokens_in_batch =
-            std::min(get_max_tokens_per_batch() - new_bc.num_tokens,
-                     (int)new_request.tokens.size());
-        new_bc.requestsInfo[i].max_sequence_length =
-            new_request.max_sequence_length;
-        new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-        // add profile_info for the new request
-        ProfileInfo profile_info;
-        profile_info.llm_decoding_steps = 0;
-        profile_info.ssm_decoding_steps = 0;
-        profile_info.start_time = Realm::Clock::current_time_in_microseconds();
-        profiling_requests[new_request.guid] = profile_info;
-        // init the beam search metadata per request
-        int ssm_decoding_steps = profile_info.ssm_decoding_steps;
-
-        new_bc.beamRequestsInfo[i].beam_size =
-            spec_infer_tree_width.size() > ssm_decoding_steps
-                ? spec_infer_tree_width[ssm_decoding_steps]
-                : 1;
-        new_bc.beamRequestsInfo[i].current_depth = 1;
-        new_bc.beamRequestsInfo[i].max_depth =
-            std::min(TreeSearchBatchConfig::MAX_BEAM_DEPTH,
-                     get_max_tokens_per_batch() -
-                         new_bc.requestsInfo[i].num_tokens_in_batch - 1);
-        for (int j = 0;
-             j < TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-             j++) {
-          new_bc.beamRequestsInfo[i].parent_id[j] = 0;
-          new_bc.beamRequestsInfo[i].probs[j] = 1;
-        }
-
-        new_bc.request_available[i] = false;
-        new_bc.requestsInfo[i].prompt_phase = true;
-
-        new_bc.beamRequestsInfo[i].sub_request_num = 1;
-        printf("sub request num == 1, %d \n",
-               new_bc.beamRequestsInfo[i].beam_size);
-
-        new_bc.sub_requests[i] = 1;
-
-        for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          int depth = new_bc.requestsInfo[i].first_token_index_in_request + j;
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request = depth;
-          assert(depth < new_request.tokens.size());
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              new_request.tokens[depth];
-
-          // beam search meta data, indicate which sub request this token
-          // belongs to, init to 0;
-          new_bc.beamTokenInfo[new_bc.num_tokens].sub_request_index = 0;
-          new_bc.num_tokens++;
-        }
-
-        init_bitmask(new_bc.causalMask[i],
-                     new_bc.requestsInfo[i].num_tokens_in_batch);
-
-        // if (new_bc.requestsInfo[i].num_tokens_in_batch <
-        // new_request.initial_len) {
-        //   all_requests[new_request.guid].status = Request::PENDING;
-        //   new_bc.request_running[i] = false;
-        //   std::cout << "Request " << new_request.guid << " is pending"
-        //             << std::endl;
-        // } else {
-        //   all_requests[new_request.guid].status = Request::RUNNING;
-        //   new_bc.request_running[i] = true;
-        //   std::cout << "Request " << new_request.guid << " is running"
-        //             << std::endl;
-        // }
-        all_requests[new_request.guid].status = Request::PENDING;
-        all_requests[new_request.guid].ssm_cache_size =
-            new_bc.requestsInfo[i].num_tokens_in_batch;
-        new_bc.request_running[i] = false;
-        std::cout << "SSM KV Cache Size init: "
-                  << all_requests[new_request.guid].ssm_cache_size << std::endl;
-        std::cout << "LLM KV Cache Size init: "
-                  << all_requests[new_request.guid].llm_cache_size << std::endl;
-
-        std::cout << "load " << new_bc.requestsInfo[i].num_tokens_in_batch
-                  << " tokens for request " << new_request.guid << std::endl;
-        std::cout << "total prompt in request: " << new_request.initial_len
-                  << std::endl;
-
-        if (new_bc.num_tokens == get_max_tokens_per_batch()) {
-          break;
-        }
-      }
-    }
-  }
-  new_bc.num_generation_tokens = num_generation_tokens;
-
-  if (verbose) {
-    std::cout << "prepare_next_batch_init OLD vs NEW batchconfigs below:"
-              << std::endl;
-    old_bc.print();
-    new_bc.print();
-  }
-  return new_bc;
-}
-
-TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-  if (verbose) {
-    std::cout
-        << "\n############### prepare_next_batch_verify ###############\n";
-  }
-  // TODO: Clean up the code, this method does the following:
-  // 1. Commit the verified tokens in the last iteration through the
-  // TreeVerifyBatchConfig . We can do this request by request. The
-  // information of the committed tokens is stored in
-  // Request.llm_committed_tokens. Put the information of the committed tokens
-  // into TreeVerifyBatchConfig::committed_tokens.
-  // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig::tokensInfo. Be careful with the abs_depth etc.
-  // (skip the pruned tokens).
-  // 3. Create the causal mask for the large model based on the small model
-  // causal mask.
-  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
-  // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config()
-  // for more details.
-
-  assert(old_batches.size() > 0);
-
-  TreeVerifyBatchConfig new_bc;
-  new_bc.num_tokens_to_commit = 0;
-  new_bc.num_tokens = 0;
-
-  int max_prompt_load_size = get_max_verify_tokens_per_batch();
-  for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) {
-    if (old_batches.at(0).request_available[i]) {
-      continue;
-    } else if (old_batches.at(0).request_running[i]) {
-      max_prompt_load_size -= (TreeSearchBatchConfig::MAX_BEAM_DEPTH + 1);
-    } else {
-      max_prompt_load_size -= 1;
-    }
-  }
-  int num_active_req = -1;
-  for (int i = 0; i < TreeVerifyBatchConfig::max_requests_per_batch(); i++) {
-    if (old_batches.at(0).request_available[i]) {
-      continue;
-    }
-    num_active_req++;
-    size_t guid = old_batches.at(0).requestsInfo[i].request_guid;
-    Request &request = all_requests[guid];
-
-    // Profiling
-    profiling_requests[request.guid].llm_decoding_steps += 1;
-
-    if (request.status == Request::RUNNING) {
-      new_bc.request_running[i] = true;
-
-      // Get the dfs tree
-      std::vector<std::vector<std::pair<BatchConfig::TokenId, int>>>
-          all_dfs_trees;
-
-      for (int j = 0; j < old_batches.size(); j++) {
-        std::vector<std::pair<BatchConfig::TokenId, int>> new_tree =
-            traverse_beam_tree(old_batches.at(j), i, request.tokens.size() - 1);
-        all_dfs_trees.push_back(new_tree);
-      }
-      assert(all_dfs_trees.size() == old_batches.size());
-      std::vector<std::pair<BatchConfig::TokenId, int>> dfs_tree_inputs =
-          merge_dfs_trees(all_dfs_trees, request.tokens.size() - 1, guid);
-
-      if (verbose) {
-        std::cout << "Request Tokens Size: " << request.tokens.size()
-                  << std::endl;
-        for (int k = 0; k < request.tokens.size(); k++) {
-          std::cout << k << ": " << request.tokens[k] << std::endl;
-        }
-      }
-
-      // Normal Request Info
-      new_bc.requestsInfo[i].first_token_index_in_request =
-          dfs_tree_inputs.front().second;
-      new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-      new_bc.requestsInfo[i].request_guid =
-          old_batches.at(0).requestsInfo[i].request_guid;
-      new_bc.requestsInfo[i].max_sequence_length =
-          old_batches.at(0).requestsInfo[i].max_sequence_length;
-      new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-      // copy bitmask to verify batchconfig
-      memcpy(&(new_bc.causalMask[i]),
-             &(old_batches.at(0).causalMask[i]),
-             sizeof(BatchConfig::BitMask));
-      // TODO: Check this
-      new_bc.requestsInfo[i].num_tokens_in_batch = 0;
-      new_bc.request_available[i] = false;
-
-      // std::cout << "dfs_tree_inputs: " << dfs_tree_inputs.size() << ", "
-      //           << new_bc.causalMask[i].tree_size << ", "
-      //           << new_bc.causalMask[i].non_tree_cache_size << "\n";
-      // std::cout << "mask: " <<
-      // std::bitset<64>(new_bc.causalMask[i].mask[0])
-      //           << "\n";
-
-      // Committed Tokens
-      if (committed_tokens.find(guid) != committed_tokens.end()) {
-        for (int j = 0; j < committed_tokens.at(guid).size(); j++) {
-          // if (j < committed_tokens.at(guid).size()) {
-
-          auto committed_token = committed_tokens.at(guid).at(j);
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
-              committed_token.second;
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
-              i;
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-              committed_token.first;
-          if (verbose) {
-            std::cout << new_bc.num_tokens_to_commit
-                      << "- committed_token.token_depth: "
-                      << committed_token.first
-                      << ", token_index: " << committed_token.second
-                      << std::endl;
-          }
-          new_bc.num_tokens_to_commit++;
-          request.llm_cache_size++;
-          // }
-        }
-      }
-      if (verbose) {
-        std::cout << "new_bc.num_tokens_to_commit: "
-                  << new_bc.num_tokens_to_commit << std::endl;
-      }
-
-      // Incremental phase: only add the last committed token
-      new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-      new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back();
-      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-          request.tokens.size() - 1;
-
-      new_bc.num_tokens++;
-      new_bc.requestsInfo[i].num_tokens_in_batch++;
-
-      if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) {
-        assert(false &&
-               "Exceeding the space available in the TreeVerify batch");
-        break;
-      }
-
-      new_bc.requestsInfo[i].first_token_index_in_request =
-          request.tokens.size() - 1;
-
-      bool cutLayer = false;
-      // Add Tokens from the DFS Tree to the next batch
-      for (int j = 1; j < dfs_tree_inputs.size(); j++) {
-        auto token = dfs_tree_inputs.at(j);
-        if (verbose) {
-          std::cout << "[" << j << "] Token: " << token.first
-                    << ", Depth:" << token.second << std::endl;
-        }
-        // Normal Token Info
-        new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-        new_bc.tokensInfo[new_bc.num_tokens].token_id = token.first;
-        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-            token.second;
-
-        new_bc.num_tokens++;
-        new_bc.requestsInfo[i].num_tokens_in_batch++;
-
-        if (new_bc.num_tokens == get_max_verify_tokens_per_batch() &&
-            (j != dfs_tree_inputs.size() - 1)) {
-          cutLayer = true;
-          break;
-        }
-      }
-
-      // delete the last incomplete layer
-      if (cutLayer) {
-        int total_tokens = new_bc.num_tokens;
-        for (int j = total_tokens - 1; j >= 1; j--) {
-          new_bc.num_tokens--;
-          new_bc.requestsInfo[i].num_tokens_in_batch--;
-          // std::cout << "cut: " << j << "\n";
-          if (new_bc.tokensInfo[j].abs_index_in_request !=
-              new_bc.tokensInfo[j - 1].abs_index_in_request) {
-            break;
-          }
-        }
-      }
-
-    } else if (request.status == Request::PENDING) {
-      new_bc.request_running[i] = false;
-      if (verbose) {
-        std::cout << "[Verify] Request " << request.guid
-                  << " is pending in loading prompt phase" << std::endl;
-        std::cout << "SSM KV Cache Size verify: " << request.ssm_cache_size
-                  << std::endl;
-        std::cout << "LLM KV Cache Size verify: " << request.llm_cache_size
-                  << std::endl;
-      }
-
-      // Commit all tokens from the last loading batch
-      if (committed_tokens.find(guid) != committed_tokens.end()) {
-        for (int j = 0; j < committed_tokens.at(guid).size(); j++) {
-          auto token = committed_tokens.at(guid).at(j);
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
-              token.second;
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
-              i;
-          new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-              token.first;
-
-          new_bc.num_tokens_to_commit++;
-          request.llm_cache_size++;
-        }
-        std::cout << "[Verify] Committed Tokens from last loading batch: "
-                  << new_bc.num_tokens_to_commit << std::endl;
-      }
-
-      memcpy(&(new_bc.causalMask[i]),
-             &(old_batches.at(0).causalMask[i]),
-             sizeof(BatchConfig::BitMask));
-
-      // Normal Request Info
-      new_bc.requestsInfo[i].first_token_index_in_request =
-          request.llm_cache_size;
-      new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens;
-      new_bc.requestsInfo[i].request_guid =
-          old_batches.at(0).requestsInfo[i].request_guid;
-      new_bc.requestsInfo[i].max_sequence_length =
-          old_batches.at(0).requestsInfo[i].max_sequence_length;
-      new_bc.requestsInfo[num_active_req].batch_config_request_id = i;
-
-      new_bc.request_available[i] = false;
-
-      new_bc.requestsInfo[i].num_tokens_in_batch =
-          std::min(max_prompt_load_size,
-                   (int)request.initial_len -
-                       new_bc.requestsInfo[i].first_token_index_in_request);
-      max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch;
-
-      std::cout << "max_prompt_load_size: " << max_prompt_load_size
-                << std::endl;
-
-      if (request.llm_cache_size < request.initial_len) {
-        // std::cout << "Initialization (prompt) phase: "
-        //           << new_bc.requestsInfo[i].num_tokens_in_batch << ", "
-        //           << old_batches.at(0).beamRequestsInfo[i].beam_size <<
-        //           "\n";
-        // Initialization (prompt) phase
-        for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) {
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              request.tokens[request.llm_cache_size + j];
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              request.llm_cache_size + j;
-          new_bc.num_tokens++;
-        }
-
-        if (new_bc.num_tokens > get_max_verify_tokens_per_batch()) {
-          printf("Exceeding (%i) the space available (%i) in the TreeVerify "
-                 "batch\n",
-                 new_bc.num_tokens,
-                 get_max_verify_tokens_per_batch());
-          assert(false);
-        }
-
-        if (new_bc.requestsInfo[i].num_tokens_in_batch +
-                request.llm_cache_size >=
-            request.initial_len) {
-          // launch the request into running phase after loading all prompt
-          request.status = Request::RUNNING;
-          new_bc.request_running[i] = true;
-
-          // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch: "
-          //           << new_bc.requestsInfo[i].num_tokens_in_batch <<
-          //           std::endl;
-          new_bc.requestsInfo[i].prompt_phase = true;
-
-          dfs_tree_inputs[guid] =
-              std::vector<std::pair<BatchConfig::TokenId, int>>{std::make_pair(
-                  request.tokens.back(), request.tokens.size() - 1)};
-        }
-      } else { // launch the request into running phase after loading all
-               // prompt
-        if (get_max_verify_tokens_per_batch() - new_bc.num_tokens > 0) {
-          // std::cout << "Initialization running phase: "
-          //           << new_bc.requestsInfo[i].num_tokens_in_batch << "\n";
-          request.status = Request::RUNNING;
-          new_bc.request_running[i] = true;
-
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = i;
-          new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens.back();
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              request.tokens.size() - 1;
-
-          new_bc.num_tokens++;
-          new_bc.requestsInfo[i].num_tokens_in_batch++;
-          // std::cout << "new_bc.requestsInfo[i].num_tokens_in_batch2: "
-          //           << new_bc.requestsInfo[i].num_tokens_in_batch <<
-          //           std::endl;
-
-          new_bc.requestsInfo[i].prompt_phase = true;
-          dfs_tree_inputs[guid] =
-              std::vector<std::pair<BatchConfig::TokenId, int>>{std::make_pair(
-                  request.tokens.back(), request.tokens.size() - 1)};
-        }
-      }
-
-    } else {
-      assert(false && "Request status is not RUNNING or PENDING");
-    }
-  }
-
-  return new_bc;
-}
-// TO BE REMOVED: END
-
 /***** Request Init Phase *****/
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);

From 3f2ca94244e37a63f10eadedabd6717ccd4fdf9b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 22 Apr 2024 20:45:40 -0400
Subject: [PATCH 113/667] Removed unused method.

---
 include/flexflow/request_manager.h |  1 -
 src/runtime/request_manager.cc     | 19 -------------------
 2 files changed, 20 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7fde35fb0..8226a3088 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -196,7 +196,6 @@ class RequestManager {
                           int eos_token_id,
                           std::string const &path);
   void register_output_filepath(std::string const &);
-  void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength);
   void init_bitmask(RequestGuid guid, int prompt_length);
   void append_bitmask(RequestGuid guid);
   void update_bitmask(RequestGuid guid, int num_committed_tokens);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b2b03afc8..8d65fa8d3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -996,25 +996,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 /* --------- Bitmask Related Functions --------- */
 
-// prompt phase, init task
-void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask,
-                                          int initLength) {
-  assert(initLength > 0);
-  // std::cout << "append pending bit mask: " << initLength << "\n";
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
-  // t4: 0000000..1000
-  bitmask.non_tree_cache_size = 0;
-  bitmask.tree_or_prompt_size = 1;
-  bitmask.prompt_size += initLength;
-  bitmask.layer_size = initLength;
-
-  // for (int i = 0; i < bitmask.prompt_size; i++) {
-  //   for (int j = i; j < bitmask.prompt_size; j++) {
-  //     bitmask.mask[i] |= (1 << j);
-  //   }
-  // }
-}
-
 // TO BE REMOVED: START
 std::vector<std::pair<BatchConfig::TokenId, int>>
     RequestManager::traverse_verify_tree(

From 1833a32fd18aef8ff8c6980d615b66d19fff0ec1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 23 Apr 2024 00:14:38 -0400
Subject: [PATCH 114/667] 1. Added root token to the speculative token tree.
 The root token is the last token in the accepted sequence. 2. Added a API
 add_root_to_spec_token_tree to add the root. 3. Changed the name of
 TokenTreeNode.tree_node_size to tree_size_including_pruned to avoid
 confusion. 4. Other small fixes related to adding root (the tree layer is
 increased by 1).

---
 include/flexflow/request_manager.h |  8 ++-
 src/runtime/request_manager.cc     | 96 +++++++++++++++++-------------
 2 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8226a3088..1ae547cd7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -153,7 +153,7 @@ class TokenTree {
   // The numebr of tokens in the tree that are not pruned
   int tree_size = 0;
   // The numebr of tokens in the tree including the pruned ones
-  int tree_node_size = 0;
+  int tree_size_including_pruned = 0;
   void add_layer() {
     tree_layers.emplace_back();
   }
@@ -350,8 +350,10 @@ class RequestManager {
   /* ---------- New Helper Functions ---------- */
 
   // Helper functions related to token trees
-  void RequestManager::init_token_trees(RequestGuid guid);
-  void add_token_to_spec_token_tree(RequestGuid guid,
+  void init_token_trees(RequestGuid guid);
+  void add_root_to_spec_token_tree(RequestGuid guid,
+                                   BatchConfig::TokenId token_id);
+  bool add_token_to_spec_token_tree(RequestGuid guid,
                                     BatchConfig::TokenId token_id,
                                     int parent_pos,
                                     float joint_prob);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8d65fa8d3..e6fc4af87 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -632,20 +632,20 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
 
     // Fill in the tokens
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
-    if (token_tree.tree_layers.size() < current_speculation_step) {
+    if (token_tree.tree_layers.size() <= current_speculation_step) {
       // This request has no token to decode in this and the following small
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_node_size;
+          request.tokens.size() + token_tree.tree_size_including_pruned;
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
-          token_tree.tree_layers.at(current_speculation_step - 1);
+          token_tree.tree_layers.at(current_speculation_step);
       // Exclude the current layer from the token tree, because we want the
       // start index
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_node_size -
+          request.tokens.size() + token_tree.tree_size_including_pruned -
           current_layer.size();
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
           current_layer.size();
@@ -797,6 +797,7 @@ bool RequestManager::update_ssm_inference_results(
 
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
+  bool token_added_to_spec_tree = false;
 
   // Here we assume that the order of the tokens in the last
   // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to
@@ -812,22 +813,12 @@ bool RequestManager::update_ssm_inference_results(
     Request &request = all_requests[guid];
 
     TokenTree &token_tree = request.speculative_token_trees[0];
-    if (token_tree.tree_layers.size() == 0 && current_speculation_step == 1) {
-      // This is the first layer of the tree
-      for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-        add_token_to_spec_token_tree(
-            guid,
-            ssm_inference_result.token_ids[result_index],
-            ssm_inference_result.probs[result_index],
-            -1);
-        result_index++;
-      }
-    } else if (token_tree.tree_layers.size() < current_speculation_step - 1) {
+    if (token_tree.tree_layers.size() < current_speculation_step) {
       // This means that the parent layer is empty
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-          token_tree.tree_layers[current_speculation_step - 2];
+          token_tree.tree_layers[current_speculation_step - 1];
       int parent_pos = 0;
       for (auto parent_it = parent_tree_layer.begin();
            parent_it != parent_tree_layer.end();
@@ -843,11 +834,13 @@ bool RequestManager::update_ssm_inference_results(
           // Parent token is not pruned
           for (int child_idx = 0; child_idx < num_branches; child_idx++) {
             float parent_prob = (*parent_it)->joint_prob;
-            add_token_to_spec_token_tree(
-                guid,
-                ssm_inference_result.token_ids[result_index],
-                ssm_inference_result.probs[result_index] * parent_prob,
-                parent_pos);
+            token_added_to_spec_tree =
+                token_added_to_spec_tree ||
+                add_token_to_spec_token_tree(
+                    guid,
+                    ssm_inference_result.token_ids[result_index],
+                    ssm_inference_result.probs[result_index] * parent_prob,
+                    parent_pos);
             result_index++;
           }
         }
@@ -856,6 +849,17 @@ bool RequestManager::update_ssm_inference_results(
     }
     append_bitmask(guid);
   }
+  return token_added_to_spec_tree;
+
+  /* Move this to update_inference_results() */
+  // State maintenance
+  current_speculation_step++;
+  if (!token_added_to_spec_tree ||
+      current_speculation_step > TreeSearchBatchConfig::MAX_TREE_DEPTH) {
+    // No token is added to the token tree, which indicates that the ssm
+    // inference phase is done. Proceed to the large model verification phase.
+    request_manager_status = LLM_VERIFY;
+  }
 }
 
 /* --------- Bitmask Related Functions --------- */
@@ -941,7 +945,8 @@ void RequestManager::update_bitmask(RequestGuid guid,
 
 void RequestManager::append_bitmask(RequestGuid guid) {
   // This method changes the bitmask in place
-  // This method is called after the first small model decoding step
+  // This method is called by update_ssm_inference_results(), after the new
+  // tokens are added to the token tree
   assert(current_speculation_step >= 1 &&
          "The current speculation step should be no less than 1");
 
@@ -949,14 +954,13 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   BatchConfig::BitMask &bitmask = request.causal_mask;
   TokenTree &token_tree = request.speculative_token_trees[0];
 
-  if (token_tree.tree_layers.size() < current_speculation_step) {
+  if (token_tree.tree_layers.size() <= current_speculation_step) {
     // This request has no token added in this and the following small model
     // inference steps, skip it
     return;
   }
   std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-      request.speculative_token_trees[0]
-          .tree_layers[current_speculation_step - 1];
+      request.speculative_token_trees[0].tree_layers[current_speculation_step];
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
   int previous_tree_size = bitmask.tree_or_prompt_size;
@@ -971,14 +975,8 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   int child_idx = 0;
   for (auto const &child_ptr : tree_layer) {
     // Each child copy its parent's mask
-    // Here we assume child_ptr->parent_pos denotes the position of the parent
-    // in its corresponding layer, check this
-    if (current_speculation_step > 1) {
-      // When current_speculation_step == 1, the
-      // tokens don't have a parent to attend to
-      bitmask.bit_mask[child_offset + child_idx] =
-          bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
-    }
+    bitmask.bit_mask[child_offset + child_idx] =
+        bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
     // Each child attend to itself
     bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
                                                        child_idx);
@@ -1478,11 +1476,23 @@ void RequestManager::init_token_trees(RequestGuid guid) {
   request.speculative_token_trees.clear();
 }
 
-void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
+void RequestManager::add_root_to_spec_token_tree(
+    RequestGuid guid, BatchConfig::TokenId token_id) {
+  // This method is called by update_llm_verify_results()
+  // The last token in the accepted sequence should be the root of the next
+  // speculation tree. The reason is that the KV cache of this token is not
+  // computed yet, and we need the large model to decode the logit of this token
+  // to verify its childs (the tokens in the first layer).
+  // This method should: construct and add the root token to the empty
+  // speculative token tree, with parent_pos being -1 and joint_prob being 1.0
+}
+
+bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
                                                   BatchConfig::TokenId token_id,
                                                   int parent_pos,
                                                   float joint_prob) {
   // This method assumes only one small model is used for speculation
+  // This method is called by update_ssm_inference_results()
 
   // This is called after the first small model inference
   assert(current_speculation_step >= 1 &&
@@ -1492,17 +1502,15 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
 
-  if (speculative_token_tree.tree_layers.size() ==
-      current_speculation_step - 1) {
+  if (speculative_token_tree.tree_layers.size() == current_speculation_step) {
     // When adding the first token, we need to add a new layer
     speculative_token_tree.add_layer();
   } else {
     // To add a token, the tree depth is either the same as the current
     // speculation step or one more than the current speculation step.
     assert(speculative_token_tree.tree_layers.size() ==
-               current_speculation_step &&
-           "The depth of the token tree should be consistent with the depth of "
-           "the token being added");
+               current_speculation_step + 1 &&
+           "Invalid token tree depth");
   }
 
   bool remove_min_node = false;
@@ -1577,23 +1585,25 @@ void RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
         std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
     request.speculative_token_trees[0]
-        .tree_layers[current_speculation_step - 1]
+        .tree_layers[current_speculation_step]
         .push_back(node_ptr);
     speculative_token_tree.tree_size++;
-    speculative_token_tree.tree_node_size++;
+    speculative_token_tree.tree_size_including_pruned++;
   }
+  return add_new_node;
 }
 
 void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   // This method assumes only one small model is used for speculation
   Request &request = all_requests[guid];
 
-  if (request.speculative_token_trees[0].tree_layers.size() <
+  if (request.speculative_token_trees[0].tree_layers.size() <=
       current_speculation_step) {
     // There are no tokens in the last layer
     return;
   }
-  auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
+  auto &last_layer =
+      request.speculative_token_trees[0].tree_layers[current_speculation_step];
   for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
     if ((*it)->pruned) {
       last_layer.erase(it);

From 9e94f02151e66e49773bd3ceec13d5c8a77522dc Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 23 Apr 2024 14:55:39 +0800
Subject: [PATCH 115/667] chore: revise specinc

---
 src/ops/spec_inc_multihead_self_attention.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 690ab89bd..8456c157b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -102,8 +102,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
         request_available[r] ? causalMask[r].current_layer_size : 0;
   }
 
-  int const tree_branch_num =
-      beam_request_infos[requext_idx_in_batch].sub_request_num;
+  int const tree_branch_num = 
+      request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
   // shared memory objects
   extern __shared__ char smem_[];
@@ -426,7 +426,7 @@ void compute_spec_inc_attention_kernel_generation(
     cudaStream_t stream) {
   // one block == one head per request
   // how many generation requests
-  dim3 grid(m->num_q_heads, bc->num_active_requests());
+  dim3 grid(m->num_q_heads, bc->num_available_requests);
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   size_t smem_sz;

From 8edf26c078202f3c4563e9302baa96474e74d502 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 23 Apr 2024 14:56:36 +0800
Subject: [PATCH 116/667] fix: typo

---
 src/runtime/batch_config.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 22e1d3889..b396319af 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -63,7 +63,7 @@ InferenceMode BatchConfig::get_mode() const {
 int BatchConfig::num_active_requests() const {
   int num_requests = 0;
   for (int i = 0; i < max_requests_per_batch(); i++) {
-    if (!request_available[i]) {
+    if (request_available[i]) {
       num_requests++;
     }
   }

From 2a12c67229aab0092d48cff1375b04dd19acfb47 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 23 Apr 2024 17:29:23 +0800
Subject: [PATCH 117/667] chore: update BatchConfig in
 ops/tree_inc_multihead_self_attention

---
 .../ops/tree_inc_multihead_self_attention.h   |  1 -
 src/ops/tree_inc_multihead_self_attention.cc  |  2 +-
 src/ops/tree_inc_multihead_self_attention.cpp |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 65 ++++++++++---------
 4 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 02df0c013..d160da4a7 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -147,7 +147,6 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
   int num_active_tokens;
   Realm::RegionInstance committed_token_reserve_inst;
   TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos;
-  bool *request_completed;
   BatchConfig::BitMask *causalMask;
 };
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index d0efb01d5..19ee265c5 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -740,7 +740,7 @@ void TreeIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap TreeIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index 9e491127d..2ad253c24 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -36,7 +36,7 @@ __global__ void commit_tokens_kernel(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    /* Reserved: BatchConfig, leave HIP code to be updated */TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int qProjSize,
     int kProjSize,
     int vProjSize,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 2d76fcf07..e8659607a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -50,12 +50,13 @@ __global__ void compute_attention_kernel_fused_kernel(
     int const max_token_per_batch,
     int per_head_size,
     int hidden_size,
-    BatchConfig::PerRequestInfo *request_infos,
+    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
     int num_heads,
     int num_requests,
     BatchConfig::BitMask *causalMask,
-    bool *request_completed,
-    int qk_smem_sz) {
+    bool *request_available,
+    int qk_smem_sz,
+    bool prompt_phase) {
 
   // q, k
   using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
@@ -77,28 +78,35 @@ __global__ void compute_attention_kernel_fused_kernel(
   // request idx
   int const request_idx = blockIdx.y;
 
-  int const batch_config_request_id =
-      request_infos[request_idx].batch_config_request_id;
+  // request id in batch config
+  int requext_idx_in_batch = 0;
+  for (int i = 0; i < request_idx; i++) {
+    while (!request_available[requext_idx_in_batch]) {
+      requext_idx_in_batch++;
+    }
+  }
+
+  // threads converge
+  __syncthreads();
 
   int const first_step = 0;
 
   int const tlength =
-      request_infos[batch_config_request_id].first_token_index_in_request +
-      request_infos[batch_config_request_id].num_tokens_in_batch;
+      request_infos[requext_idx_in_batch].first_token_index_in_request +
+      request_infos[requext_idx_in_batch].num_tokens_in_batch;
   int const qlength =
-      request_infos[batch_config_request_id].num_tokens_in_batch;
+      request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
-  BatchConfig::BitMask bitmask = causalMask[batch_config_request_id];
+  BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
 
   int first_token_idx = 0;
-  for (int r = 0; r < batch_config_request_id; r++) {
+  for (int r = 0; r < requext_idx_in_batch; r++) {
     first_token_idx +=
-        request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch;
+        request_available[r] ? request_infos[r].num_tokens_in_batch : 0;
   }
 
-  bool prompt_phase = request_infos[batch_config_request_id].prompt_phase;
   int q_start =
-      request_infos[batch_config_request_id].first_token_index_in_request;
+      request_infos[requext_idx_in_batch].first_token_index_in_request;
 
   // shared memory objects
   extern __shared__ char smem_[];
@@ -129,7 +137,7 @@ __global__ void compute_attention_kernel_fused_kernel(
   constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
 
   DT const *k_cache_batch =
-      key_cache + batch_config_request_id * max_seq_length * hidden_size + ki;
+      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
 
   int ti_end =
       div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
@@ -260,7 +268,7 @@ __global__ void compute_attention_kernel_fused_kernel(
 
     // The base pointer for the value in the cache buffer.
     DT const *v_cache_batch =
-        value_cache + batch_config_request_id * max_seq_length * hidden_size +
+        value_cache + requext_idx_in_batch * max_seq_length * hidden_size +
         vi;
 
     if (Dh == Dh_MAX || vi < Dh) {
@@ -536,7 +544,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_available[i]) {
+    if (!bc->request_available[i]) {
       continue;
     }
     assert(processed_tokens_in_batch ==
@@ -792,7 +800,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
 }
 
 #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(                             \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream)      \
+    DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream, prompt_phase)      \
   smem_size_in_bytes_tree<DT>(m->qProjSize,                                    \
                               BatchConfig::max_sequence_length() +             \
                                   BatchConfig::max_spec_tree_token_num(),      \
@@ -813,7 +821,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
           output_ptr,                                                          \
           scale,                                                               \
           BatchConfig::max_sequence_length() +                                 \
-              BatchConfig::BatchConfig::max_spec_tree_token_num(),             \
+              BatchConfig::max_spec_tree_token_num(),                          \
           BatchConfig::max_tokens_per_batch(),                                 \
           m->qProjSize,                                                        \
           m->hidden_size,                                                      \
@@ -821,8 +829,9 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
           m->num_q_heads,                                                      \
           bc->num_active_requests(),                                           \
           m->causalMask,                                                       \
-          m->request_completed,                                                \
-          smem_sz[0])
+          m->request_available,                                                \
+          smem_sz[0],                                                          \
+          prompt_phase)
 
 template <typename DT>
 void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
@@ -852,6 +861,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
       m->hidden_size);
 
   dim3 grid(m->num_q_heads, bc->num_active_requests());
+  bool const prompt_phase = (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT);
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   // 0->qk production size, 1->total shared size
@@ -859,11 +869,11 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   if (per_head_size == 64) {
     constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
     LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream);
+        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, prompt_phase);
   } else if (per_head_size == 128) {
     constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
     LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream);
+        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, prompt_phase);
   } else {
     assert(false && "a unsupported head size");
   }
@@ -1059,22 +1069,17 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-
     causalMask = reinterpret_cast<BatchConfig::BitMask *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo));
+        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
+        sizeof(BatchConfig::request_available));
     committed_token_infos =
         reinterpret_cast<TreeVerifyBatchConfig::CommittedTokensInfo *>(
             reinterpret_cast<char *>(handler.batch_config_metadata) +
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
+            sizeof(BatchConfig::request_available)) +
             sizeof(BatchConfig::causalMask));
-
-    request_completed = reinterpret_cast<bool *>(
-        reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::causalMask) +
-        sizeof(TreeVerifyBatchConfig::committed_tokens));
   }
 
   cudaStreamSynchronize(stream);

From cce98a61da166f7f96e43bcd9ce855706f62fa0b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 23 Apr 2024 17:31:44 +0800
Subject: [PATCH 118/667] chore: unify reserverd info

---
 src/ops/spec_inc_multihead_self_attention.cpp | 2 +-
 src/ops/tree_inc_multihead_self_attention.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index c56268b9d..cf605f8fd 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -36,7 +36,7 @@ __global__ void spec_store_kv_cache(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    /* Reserved: BatchConfig, leave HIP code to be updated */BatchConfig::PerTokenInfo *tokenInfos,
+    /* Reserved: BatchConfig Updated, leave HIP code to be updated */BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
     TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
     TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index 2ad253c24..e1fc4d73a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -36,7 +36,7 @@ __global__ void commit_tokens_kernel(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    /* Reserved: BatchConfig, leave HIP code to be updated */TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    /* Reserved: BatchConfig Updated, leave HIP code to be updated */TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int qProjSize,
     int kProjSize,
     int vProjSize,

From 5c4df2ee13c2971e94a32681205f6061cfa52f3c Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 23 Apr 2024 11:40:39 -0400
Subject: [PATCH 119/667] Fix name.

---
 inference/models/llama.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/models/llama.h b/inference/models/llama.h
index 3d8a5d1df..1a1481d38 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -51,8 +51,8 @@ class LLAMA {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
+      max_beam_width = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      max_beam_depth = TreeSearchBatchConfig::MAX_TREE_DEPTH;
     }
 
     void print() const {

From c9004454bb4417577cff9a663ead904755872d48 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 24 Apr 2024 02:01:13 +0800
Subject: [PATCH 120/667] chore: update cacl of first_token_idx

---
 src/ops/spec_inc_multihead_self_attention.cu | 6 +-----
 src/ops/tree_inc_multihead_self_attention.cu | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 6c82d7a5e..15b3f0a57 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -99,11 +99,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   int const totalCacheSize =
       bitmask.non_tree_cache_size + bitmask.tree_or_prompt_size;
 
-  int first_token_idx = 0;
-  for (int r = 0; r < requext_idx_in_batch; r++) {
-    first_token_idx +=
-        request_available[r] ? causalMask[r].current_layer_size : 0;
-  }
+  int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
   int const tree_branch_num = 
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index e8659607a..357bf7aa5 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -99,11 +99,7 @@ __global__ void compute_attention_kernel_fused_kernel(
 
   BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
 
-  int first_token_idx = 0;
-  for (int r = 0; r < requext_idx_in_batch; r++) {
-    first_token_idx +=
-        request_available[r] ? request_infos[r].num_tokens_in_batch : 0;
-  }
+  int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
   int q_start =
       request_infos[requext_idx_in_batch].first_token_index_in_request;

From 49cad9a3fb5ed0198336935cd82c2d78c7a9b5af Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 23 Apr 2024 19:48:55 -0400
Subject: [PATCH 121/667] 1. Add clear_bitmask() function to
 BatchConfig::BitMask to clear all masks. 2. Changed the name of init_bitmask
 to init_bitmask_prompt and update_bitmask to update_bitmask_prompt. 3. Added
 init_bitmask_spec() to initialize the mask for the root.

---
 include/flexflow/batch_config.h    | 17 ++++++++---
 include/flexflow/request_manager.h |  5 ++--
 src/runtime/request_manager.cc     | 48 +++++++++++++++++++++---------
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index eadca9b2e..501fbf887 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -63,10 +63,7 @@ class BatchConfig {
   int num_tokens;
   int num_available_requests;
 
-  enum class ExecutionPhase {
-    PROMPT,
-    GENERATION
-  };
+  enum class ExecutionPhase { PROMPT, GENERATION };
 
   ExecutionPhase current_phase;
 
@@ -109,6 +106,10 @@ class BatchConfig {
         return (bits[idx] & (1ULL << bit)) != 0;
       }
 
+      void clear() {
+        std::memset(bits, 0, sizeof(bits));
+      }
+
     private:
       uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 8]; // Array to hold 256 bits
     };
@@ -125,6 +126,7 @@ class BatchConfig {
     int current_layer_size = 0;
 
     BitMask() = default;
+
     BitMask(BitMask const &other) {
       non_tree_cache_size = other.non_tree_cache_size;
       tree_or_prompt_size = other.tree_or_prompt_size;
@@ -133,6 +135,13 @@ class BatchConfig {
         bit_mask[i] = other.bit_mask[i];
       }
     }
+
+    void clear_bitmask() {
+      // Clear bit_mask but keep the other fields
+      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
+        bit_mask[i].clear();
+      }
+    }
   };
 
   BitMask causalMask[MAX_NUM_REQUESTS];
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1ae547cd7..797b44ef9 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -196,9 +196,10 @@ class RequestManager {
                           int eos_token_id,
                           std::string const &path);
   void register_output_filepath(std::string const &);
-  void init_bitmask(RequestGuid guid, int prompt_length);
+  void init_bitmask_prompt(RequestGuid guid, int prompt_length);
   void append_bitmask(RequestGuid guid);
-  void update_bitmask(RequestGuid guid, int num_committed_tokens);
+  void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens);
+  void init_bitmask_spec(RequestGuid guid, int num_committed_tokens);
   BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
   FFModel *get_ssm_model(int model_id);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e6fc4af87..3c320ae4a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -913,34 +913,54 @@ void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
 }
 // TO BE REMOVED: END
 
-void RequestManager::init_bitmask(RequestGuid guid, int prompt_length) {
-  // This method modifies the bitmask in place
-  // This method is called by update_llm_verify_results
-  // TODO: implement this function
+void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
+  // This method is called by update_llm_verify_results when there are new
+  // request to load into the batch
   // 1. Clear the causal mask because our current speculative token tree is
   // empty.
   // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
-  bitmask.tree_or_prompt_size = 0;
-  bitmask.current_layer_size = 0;
-  bitmask.prompt_size = prompt_length;
+
+  bitmask.clear_bitmask();
+  bitmask.tree_or_prompt_size = prompt_length;
+  bitmask.current_layer_size = prompt_length;
   bitmask.non_tree_cache_size = 0;
 }
 
-void RequestManager::update_bitmask(RequestGuid guid,
-                                    int num_committed_tokens) {
+void RequestManager::update_bitmask_prompt(RequestGuid guid,
+                                           int num_committed_tokens) {
   // This method modifies the bitmask in place
   // This method is called by update_llm_verify_results
-  // TODO: implement this function
-  // 1. Clear the causal mask because our current speculative token tree is
-  // empty.
+  // 1. Clear the causal mask because the first SSM inference uses the prompt
+  // kernel and it doesn't use mask.
   // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
-  bitmask.tree_or_prompt_size = 0;
-  bitmask.current_layer_size = 0;
+  bitmask.clear_bitmask();
+  bitmask.tree_or_prompt_size = num_committed_tokens;
+  bitmask.current_layer_size = num_committed_tokens;
+}
+
+void RequestManager::init_bitmask_spec(RequestGuid guid,
+                                       int num_committed_tokens) {
+  // This method modifies the bitmask in place
+  // This method is called by the first call of update_ssm_verify_results in a
+  // speculative iteration
+  // CAUTION: You should still call append_bitmask() after this method
+  // 1. Clear the causal mask and add a root into it, because the tree is
+  // currently empty but we have a root.
+  // 2. Maintain all other fields.
+  assert(current_speculation_step == 1 &&
+         "The current speculation step should be 1");
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+  bitmask.clear_bitmask();
+  // Set the mask for the root
+  bitmask.bit_mask[0].set_bit(0);
+  bitmask.tree_or_prompt_size = 1;
   bitmask.non_tree_cache_size += num_committed_tokens;
+  bitmask.current_layer_size = 1;
 }
 
 void RequestManager::append_bitmask(RequestGuid guid) {

From 93871aba3df6d2d6e62fd81ab077e62b723b08c9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 14:39:42 -0400
Subject: [PATCH 122/667] 1. Make update_llm_verify_results return bool. 2.
 Renamed get_verify_results to get_verify_results_greedy.

---
 include/flexflow/request_manager.h |  4 ++--
 src/runtime/request_manager.cc     | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 797b44ef9..6023832e3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -344,10 +344,10 @@ class RequestManager {
   // cache of the small model.
   TreeSearchBatchConfig prepare_first_spec_batch_config();
   TreeVerifyBatchConfig prepare_verify_batch_config();
-  void update_llm_verify_results(InferenceResult const &llm_verify_result);
+  bool update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
-  void get_verify_results(InferenceResult const &llm_verify_result);
+  void get_verify_results_greedy(InferenceResult const &llm_verify_result);
   /* ---------- New Helper Functions ---------- */
 
   // Helper functions related to token trees
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3c320ae4a..88a4453f4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -768,7 +768,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   return new_bc;
 }
 
-void RequestManager::update_llm_verify_results(
+bool RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
   // TODO: Implement this function
   // We may have two types of InferenceResults, one is the results from
@@ -778,14 +778,14 @@ void RequestManager::update_llm_verify_results(
   // 1. Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
-  // token, while for the sampling construction of the speculative token tree,
-  // we need to implement a CPU based verify function.
-  // 2. Store the committed tokens to Request.llm_committed_tokens and
-  // Request.ssm_committed_tokens.
+  // token, this is implemented in get_verify_results_greedy(). For the
+  // sampling construction of the speculative token tree, we need to implement a
+  // CPU based verify function.
+  // 2. Store the committed tokens to Request.committed_tokens.
   // 3. Store the verified tokens to Request.tokens.
-  // 4. Some requests may be completed after appending the verified tokens,
-  // maintain the complete requests, and start a prefilling iteration.
-  // 5. For requests not completed, update their causal mask.
+  // 4. For requests not completed, update their causal mask.
+  // 5. Some requests may be completed after appending the verified tokens. If
+  // there is a request completed, return true.
 }
 
 bool RequestManager::update_ssm_inference_results(
@@ -1007,7 +1007,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
 BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // This method creates a new bitmask for LLM verification model's bitmask,
   // it does not modify the small model's bitmask This method is called by
-  // prepare_verify_batch_config
+  // prepare_verify_batch_config()
   // TODO: implement this function
   // 1. Create the bitmask based on the pruned request token tree
   // 2. Maintain all other fields
@@ -1178,7 +1178,7 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 }
 // TO BE REMOVED: END
 
-void RequestManager::get_verify_results(
+void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
   // This function maintain the generated token list of the request and the
   // committed tokens.

From d3a1aba1beae9a18205826c0d53f8c0189acd7b2 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 24 Apr 2024 15:30:15 -0400
Subject: [PATCH 123/667] fix typos

---
 include/flexflow/request_manager.h | 2 ++
 src/runtime/request_manager.cc     | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 9e28c0ac1..671763ef2 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -278,6 +278,8 @@ class RequestManager {
   BatchConfig get_next_batch_config(InferenceResult const &result);
   void update_inference_results(InferenceResult const &result);
   BatchConfig prepare_next_batch();
+  BatchConfig prepare_prefilling_batch();
+  BatchConfig prepare_decoding_batch();
 
   int get_num_active_requests();
   int get_empty_request_index();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b734ec5da..e790e4900 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -387,7 +387,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
   for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-    if guid_of_requests[i] == INVALID_GUID {
+    if (guid_of_requests[i] == INVALID_GUID) {
       continue;
     }
     Request &request = all_requests[guid_of_requests[i]];
@@ -417,7 +417,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
-  swicth (request_manager_status) {
+  switch (request_manager_status) {
     case PREFILLING:
       return prepare_prefilling_batch();
     case DECODING:

From 88586e4babcf51a73999e63b26a6f596054b5441 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyu.wang@yahooinc.com>
Date: Wed, 24 Apr 2024 15:36:13 -0400
Subject: [PATCH 124/667] fix

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e790e4900..3a54dc3c4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -439,7 +439,7 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   BatchConfig bc;
   bc.num_tokens = BatchConfig::MAX_NUM_TOKENS;
 
-  request_index = get_empty_request_index();
+  int request_index = get_empty_request_index();
   assert(request_index != -1);
 
   Request new_request = pending_request_queue.front();
@@ -508,7 +508,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 
     // Per Token Info
     bc.tokensInfo[bc.num_tokens].request_index = i;
-    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
     request.llm_cache_size++;

From 03dfa8d9914eb683af2a5d5b8e581101f139ddd9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 16:54:04 -0400
Subject: [PATCH 125/667] Re-implemented the function
 get_verify_results_greedy().

---
 src/runtime/request_manager.cc | 98 +++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 26 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index fc2d2d9bc..67f14b763 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -778,14 +778,14 @@ bool RequestManager::update_llm_verify_results(
   // 1. Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
-  // token, this is implemented in get_verify_results_greedy(). For the
-  // sampling construction of the speculative token tree, we need to implement a
-  // CPU based verify function.
-  // 2. Store the committed tokens to Request.committed_tokens.
-  // 3. Store the verified tokens to Request.tokens.
-  // 4. For requests not completed, update their causal mask.
-  // 5. Some requests may be completed after appending the verified tokens. If
+  // token, this is implemented in get_verify_results_greedy(). This function
+  // stores the commmitted tokens into the corresponding fields in the Request.
+  // For the sampling construction of the speculative token tree, we need to
+  // implement a CPU based verify function.
+  // 2. For requests not completed, update their causal mask.
+  // 3. Some requests may be completed after appending the verified tokens. If
   // there is a request completed, return true.
+  get_verify_results_greedy(llm_verify_result);
 }
 
 bool RequestManager::update_ssm_inference_results(
@@ -1180,6 +1180,7 @@ std::vector<std::pair<BatchConfig::TokenId, int>>
 
 void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
+  int llm_result_offset = 0;
   // This function maintain the generated token list of the request and the
   // committed tokens.
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
@@ -1192,41 +1193,86 @@ void RequestManager::get_verify_results_greedy(
     assert(request.status == Request::RUNNING);
     request.committed_tokens.clear();
 
-    // Traverse the speculative token tree and it with the LLM's sampling output
-    int llm_result_index = 0;
-    int verified_parent_pos = -1;
-    int committed_token_index = 0;
+    int committed_token_index = request.tokens.size();
+
     TokenTree &token_tree = request.speculative_token_trees[0];
-    for (auto const &tree_layer : token_tree.tree_layers) {
+    // First add the root to the committed tokens
+    request.committed_tokens.push_back(Request::CommittedToken(
+        llm_result_offset,
+        committed_token_index,
+        llm_verify_result.token_ids[llm_result_offset]));
+    committed_token_index++;
+    // The position of the last accepted token in its tree layer
+    int last_accepted_token_layer_index = 0;
+    // The index of the last accepted token in the entire tree (excluding the
+    // pruned tokens)
+    int last_accepted_token_index = 0;
+
+    int current_token_index = 1; // Because we skip the root
+    int num_layers = token_tree.tree_layers.size();
+    for (int layer_index = 1; layer_index < num_layers; layer_index++) {
+      // We skip the first layer
+      std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
+          token_tree.tree_layers.at(layer_index);
+
       bool token_accepted_this_layer = false;
-      int current_layer_index = 0;
+      int current_token_layer_index = 0;
+
       for (auto const &node_ptr : tree_layer) {
         if (node_ptr->pruned) {
           continue;
         }
-        if (node_ptr->parent_pos != verified_parent_pos) {
-          llm_result_index++;
-          current_layer_index++;
-          continue;
-        } else if (token_accepted_this_layer) {
-          // A token is already accepted in the current layer
-          llm_result_index++;
-          current_layer_index++;
+        if ((node_ptr->parent_pos != last_accepted_token_layer_index) ||
+            token_accepted_this_layer) {
+          // The token's parent is not accepted, or there is already another
+          // token accepted in this layer
+          current_token_index++;
+          current_token_layer_index++;
           continue;
         } else {
-          if (node_ptr->id == llm_verify_result.token_ids[llm_result_index]) {
+          // The token's parent is accepted, and no token has been accepted in
+          // this layer yet
+          if (node_ptr->id ==
+              llm_verify_result
+                  .token_ids[llm_result_offset + last_accepted_token_index]) {
+            // The token's parent is accepted, and this token's id equals the
+            // llm's sample at its parent's position. We accept this token.
+
+            // from_index: the index of the token in the tree
+            // to_index: the committed token index in the request
             request.committed_tokens.push_back(Request::CommittedToken(
-                llm_result_index, committed_token_index, node_ptr->id));
+                current_token_index, committed_token_index, node_ptr->id));
             request.tokens.push_back(node_ptr->id);
+
             token_accepted_this_layer = true;
-            verified_parent_pos = current_layer_index;
+            last_accepted_token_index = current_token_index;
+            last_accepted_token_layer_index = current_token_layer_index;
             committed_token_index++;
+            current_token_index++;
+            current_token_layer_index++;
           }
-          llm_result_index++;
-          current_layer_index++;
         }
       }
+      if (!token_accepted_this_layer) {
+        // No token is accepted in this layer, we should stop the traversal
+        // However, we have to add the last sampled token as a correction from
+        // the LLM
+
+        // from_index: since this token is not in the token tree, neither the
+        // ssm nor the llm have its KV cache, so the from_index should be a
+        // place holder, which is -1
+        request.committed_tokens.push_back(Request::CommittedToken(
+            -1,
+            committed_token_index,
+            llm_verify_result
+                .token_ids[llm_result_offset + last_accepted_token_index]));
+        request.tokens.push_back(
+            llm_verify_result
+                .token_ids[llm_result_offset + last_accepted_token_index]);
+        break;
+      }
     }
+    llm_result_offset += token_tree.tree_size;
   }
 }
 

From 7e8e99f3286a57fca7c3ab2beb8783f32e6e58cc Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 17:20:35 -0400
Subject: [PATCH 126/667] Fix a bug: when adding the committed_tokens into the
 TreeVerifyBatchConfig, we don't need to add the last committed token in the
 request.

---
 src/runtime/request_manager.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 67f14b763..8350facd1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -723,9 +723,15 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 
     // 2. Put the information of the committed tokens into
     // TreeVerifyBatchConfig.committed_tokens.
+    // Note here, we shouldn't put the last token in request.committed_tokens
+    // into new_bc. Because the LLM don't have that token's KV cache.
     std::vector<Request::CommittedToken> &committed_tokens =
         request.committed_tokens;
-    for (auto const &committed_token : committed_tokens) {
+    for (int committed_token_index = 0;
+         committed_token_index < committed_tokens.size() - 1;
+         committed_token_index++) {
+      Request::CommittedToken &committed_token =
+          committed_tokens.at(committed_token_index);
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =

From 7a40b1e6d3595da55cff84562963128e62d966db Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 21:00:44 -0400
Subject: [PATCH 127/667] Modified some comments for
 update_llm_verify_results()

---
 src/runtime/request_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8350facd1..87e33e40a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -788,8 +788,11 @@ bool RequestManager::update_llm_verify_results(
   // stores the commmitted tokens into the corresponding fields in the Request.
   // For the sampling construction of the speculative token tree, we need to
   // implement a CPU based verify function.
-  // 2. For requests not completed, update their causal mask.
-  // 3. Some requests may be completed after appending the verified tokens. If
+  // 2. Call add_root_token_to_spec_token_tree() to add the root token to the
+  // requests' speculative token tree. The root token is the last committed
+  // token.
+  // 3. For requests not completed, update their causal mask.
+  // 4. Some requests may be completed after appending the verified tokens. If
   // there is a request completed, return true.
   get_verify_results_greedy(llm_verify_result);
 }

From 1432ff052eafb0dcc4a870081e147c77a95d559b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 21:12:01 -0400
Subject: [PATCH 128/667] 1. Added TokenTree::clear(), and a constructor. 2.
 Changed the name of init_token_trees to init_token_tree. 3. Re-implemented
 init_token_tree and add_root_to_spec_token_tree.

---
 include/flexflow/request_manager.h | 11 ++++++++++-
 src/runtime/request_manager.cc     | 15 +++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f76f9e8b6..3a2e76b7d 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -154,9 +154,18 @@ class TokenTree {
   int tree_size = 0;
   // The numebr of tokens in the tree including the pruned ones
   int tree_size_including_pruned = 0;
+
   void add_layer() {
     tree_layers.emplace_back();
   }
+
+  void clear() {
+    tree_layers.clear();
+    tree_size = 0;
+    tree_size_including_pruned = 0;
+  }
+
+  TokenTree() : tree_size(0), tree_size_including_pruned(0) {}
 };
 
 class RequestManager {
@@ -353,7 +362,7 @@ class RequestManager {
   /* ---------- New Helper Functions ---------- */
 
   // Helper functions related to token trees
-  void init_token_trees(RequestGuid guid);
+  void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
                                    BatchConfig::TokenId token_id);
   bool add_token_to_spec_token_tree(RequestGuid guid,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 87e33e40a..2bdc3d69e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -788,9 +788,9 @@ bool RequestManager::update_llm_verify_results(
   // stores the commmitted tokens into the corresponding fields in the Request.
   // For the sampling construction of the speculative token tree, we need to
   // implement a CPU based verify function.
-  // 2. Call add_root_token_to_spec_token_tree() to add the root token to the
-  // requests' speculative token tree. The root token is the last committed
-  // token.
+  // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
+  // root token to the requests' speculative token tree. The root token is the
+  // last committed token.
   // 3. For requests not completed, update their causal mask.
   // 4. Some requests may be completed after appending the verified tokens. If
   // there is a request completed, return true.
@@ -1546,9 +1546,11 @@ RequestManager *RequestManager::get_request_manager() {
 }
 
 /* --------- Request Token Tree Related Functions --------- */
-void RequestManager::init_token_trees(RequestGuid guid) {
+void RequestManager::init_token_tree(RequestGuid guid) {
   Request &request = all_requests[guid];
   request.speculative_token_trees.clear();
+  // Assume we only use one small model for speculation
+  request.speculative_token_trees.emplace_back();
 }
 
 void RequestManager::add_root_to_spec_token_tree(
@@ -1560,6 +1562,11 @@ void RequestManager::add_root_to_spec_token_tree(
   // to verify its childs (the tokens in the first layer).
   // This method should: construct and add the root token to the empty
   // speculative token tree, with parent_pos being -1 and joint_prob being 1.0
+  Request &request = all_requests[guid];
+  TokenTree &speculative_token_tree = request.speculative_token_trees[0];
+  speculative_token_tree.add_layer();
+  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, -1, 1.0);
+  speculative_token_tree.tree_layers[0].push_back(node_ptr);
 }
 
 bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,

From 400fe1d3a3f957336e8405022bb9da407f8502cd Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 24 Apr 2024 21:41:11 -0400
Subject: [PATCH 129/667] Add comments on the prefilling and decoding side.

---
 src/runtime/request_manager.cc | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2bdc3d69e..26176816d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -435,6 +435,14 @@ BatchConfig RequestManager::prepare_next_batch() {
 }
 
 BatchConfig RequestManager::prepare_prefilling_batch() {
+  // This function is called when the request_manager_status is PREFILLING,
+  // which means that there is at least one empty slot in the currnet decoding
+  // batch, and there is at least one pending request in the request queue.
+  // This function takes the pending request, and load its prefilling tokens,
+  // constructing a BatchConfig with only one request.
+
+  // The following part should be moved to the update_inference_results()
+  // function
   if (pending_request_queue.empty()) {
     if (get_num_active_requests() == 0) {
       return BatchConfig();
@@ -455,12 +463,12 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   guid_of_requests[request_index] = new_request.guid;
 
   // Per Request Info
-  bc.requestsInfo[request_index].first_token_depth_in_request = 0;
+  bc.requestsInfo[request_index].first_token_index_in_request = 0;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   bc.requestsInfo[request_index].num_tokens_in_batch =
       std::min(bc.num_tokens, (int)new_request.tokens.size());
 
-  bc.request_completed[request_index] = false;
+  bc.request_available[request_index] = true;
 
   new_request.first_token_offset_in_batch = 0;
   new_request.num_tokens_in_batch = 0;
@@ -488,6 +496,10 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
 }
 
 BatchConfig RequestManager::prepare_decoding_batch() {
+  // This function is called when the request_manager_status is DECODING. It
+  // fills the last token of each request in the current batch to the
+  // BatchConfig for the LLM to decode.
+
   BatchConfig bc;
   bc.num_tokens = 0;
 

From 1433c9d07851f5f78fa952802c9ec02299cd3c1e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 25 Apr 2024 12:03:19 -0400
Subject: [PATCH 130/667] 1. Removed BatchConfig::ExecutionPhase and
 BatchConfig::current_phase. 2. Added BatchConfig::prompt_phase to indicate
 whether the batch is in prompt phase. 3. Re-implemented the initialization of
 BatchConfig.

---
 include/flexflow/batch_config.h | 18 ++++++++----------
 src/runtime/batch_config.cc     | 17 +++++------------
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 501fbf887..24122f394 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -62,20 +62,18 @@ class BatchConfig {
 
   int num_tokens;
   int num_available_requests;
-
-  enum class ExecutionPhase { PROMPT, GENERATION };
-
-  ExecutionPhase current_phase;
+  bool prompt_phase;
 
   struct PerRequestInfo {
-    int first_token_index_in_request;
-    int first_token_offset_in_batch;
-    int num_tokens_in_batch;
+    int first_token_index_in_request = 0;
+    int first_token_offset_in_batch = 0;
+    int num_tokens_in_batch = 0;
   };
+
   struct PerTokenInfo {
-    TokenId token_id;
-    int abs_index_in_request;
-    int request_index;
+    TokenId token_id = 0;
+    int abs_index_in_request = 0;
+    int request_index = 0;
   };
 
   class BitMask {
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index b396319af..1b5215975 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -25,18 +25,11 @@ LegionRuntime::Logger::Category log_bc("BatchConfig");
 using Legion::Future;
 using Legion::Memory;
 
-BatchConfig::BatchConfig() : num_tokens(0) {
-  for (int i = 0; i < MAX_NUM_REQUESTS; i++) {
-    requestsInfo[i].first_token_index_in_request = 0;
-    requestsInfo[i].first_token_offset_in_batch = 0;
-    requestsInfo[i].num_tokens_in_batch = 0;
-    request_available[i] = true;
-  }
-  for (int i = 0; i < MAX_NUM_TOKENS; i++) {
-    tokensInfo[i].abs_index_in_request = 0;
-    tokensInfo[i].request_index = 0;
-    tokensInfo[i].token_id = 0;
-  }
+BatchConfig::BatchConfig()
+    : num_tokens(0), num_available_requests(0), prompt_phase(false) {
+  memset(request_available, false, sizeof(bool) * MAX_NUM_REQUESTS);
+  // Don't need to initialize requestInfo ,tokensInfo, and causalMask here
+  // because they initialize themselves.
 }
 
 /*static*/

From aa3d598538be55fa07c6d1c77fae61509aaf3e9f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 25 Apr 2024 12:05:42 -0400
Subject: [PATCH 131/667] Maintail BatchConfig::prompt_phase in
 prepare_next_batch()

---
 src/runtime/request_manager.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 26176816d..27a38f883 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -441,6 +441,11 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   // This function takes the pending request, and load its prefilling tokens,
   // constructing a BatchConfig with only one request.
 
+  // TODO:
+  // 1. Adept this function to the new design
+  // 2. Move the following part to the update_inference_results() function
+  // 3. Change the BatchConfig.prompt_phase
+
   // The following part should be moved to the update_inference_results()
   // function
   if (pending_request_queue.empty()) {
@@ -500,6 +505,11 @@ BatchConfig RequestManager::prepare_decoding_batch() {
   // fills the last token of each request in the current batch to the
   // BatchConfig for the LLM to decode.
 
+  // TODO:
+  // 1. Adept this function to the new design
+  // 2. Move the following part to the update_inference_results() function
+  // 3. Change the BatchConfig::prompt_phase
+
   BatchConfig bc;
   bc.num_tokens = 0;
 
@@ -561,6 +571,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   new_bc.num_tokens = 0;
   new_bc.current_depth = 0;
   new_bc.num_available_requests = 0;
+  new_bc.prompt_phase = true;
   assert(current_speculation_step == 0);
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
@@ -625,6 +636,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   new_bc.num_tokens = 0;
   new_bc.current_depth = current_speculation_step;
   new_bc.num_available_requests = 0;
+  new_bc.prompt_phase = false;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -712,6 +724,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   new_bc.num_tokens = 0;
   new_bc.num_available_requests = 0;
   new_bc.num_tokens_to_commit = 0;
+  new_bc.prompt_phase = false;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {

From 5ad499cae16bd314c427f9774e75635d21ff3bcf Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 25 Apr 2024 12:11:13 -0400
Subject: [PATCH 132/667] Removed unused helper functions and rearranged the
 header file.

---
 include/flexflow/request_manager.h |  33 ++---
 src/runtime/request_manager.cc     | 213 -----------------------------
 2 files changed, 15 insertions(+), 231 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 3a2e76b7d..07800e2aa 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -205,11 +205,6 @@ class RequestManager {
                           int eos_token_id,
                           std::string const &path);
   void register_output_filepath(std::string const &);
-  void init_bitmask_prompt(RequestGuid guid, int prompt_length);
-  void append_bitmask(RequestGuid guid);
-  void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens);
-  void init_bitmask_spec(RequestGuid guid, int num_committed_tokens);
-  BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
   FFModel *get_ssm_model(int model_id);
 
@@ -228,12 +223,6 @@ class RequestManager {
   // Methods to check and mark request completion
   bool is_request_completed(RequestGuid const &guid);
   void trigger_request_completion_future(RequestGuid const &guid);
-  std::vector<std::pair<BatchConfig::TokenId, int>> traverse_verify_tree(
-      size_t guid,
-      std::vector<std::pair<BatchConfig::TokenId, int>> const
-          &inputSerializedTree,
-      std::vector<std::pair<BatchConfig::TokenId, int>> const
-          &outputSerializedTree);
   static void background_serving_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
@@ -260,7 +249,6 @@ class RequestManager {
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
       Legion::Runtime *runtime);
-
   // API for rm state machine
   BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
                                           Legion::Context ctx,
@@ -344,7 +332,10 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
 
-  /* ---------- New Helper Functions ---------- */
+  /* ---------- Spec Decoding Helper Functions ---------- */
+  bool update_llm_verify_results(InferenceResult const &llm_verify_result);
+  bool update_ssm_inference_results(
+      SsmInferenceResult const &ssm_inference_result);
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
   TreeSearchBatchConfig prepare_next_spec_batch_config();
@@ -355,13 +346,18 @@ class RequestManager {
   // cache of the small model.
   TreeSearchBatchConfig prepare_first_spec_batch_config();
   TreeVerifyBatchConfig prepare_verify_batch_config();
-  bool update_llm_verify_results(InferenceResult const &llm_verify_result);
-  bool update_ssm_inference_results(
-      SsmInferenceResult const &ssm_inference_result);
+
+  // LLM result verification
   void get_verify_results_greedy(InferenceResult const &llm_verify_result);
-  /* ---------- New Helper Functions ---------- */
 
-  // Helper functions related to token trees
+  // Bitmask related
+  void init_bitmask_prompt(RequestGuid guid, int prompt_length);
+  void append_bitmask(RequestGuid guid);
+  void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens);
+  void init_bitmask_spec(RequestGuid guid, int num_committed_tokens);
+  BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
+
+  // Token tree related
   void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
                                    BatchConfig::TokenId token_id);
@@ -370,6 +366,7 @@ class RequestManager {
                                     int parent_pos,
                                     float joint_prob);
   void prune_last_layer_of_spec_token_tree(RequestGuid guid);
+  /* ---------- Spec Decoding Helper Functions ---------- */
 };
 
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 27a38f883..a07c2edbc 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -898,55 +898,6 @@ bool RequestManager::update_ssm_inference_results(
 
 /* --------- Bitmask Related Functions --------- */
 
-// TO BE REMOVED: START
-// prompt phase, init task
-void RequestManager::init_bitmask(BatchConfig::BitMask &bitmask,
-                                  int initLength) {
-  assert(initLength > 0);
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
-  // t4: 0000000..1000
-  bitmask.non_tree_cache_size = 0;
-  bitmask.tree_or_prompt_size = 1;
-
-  bitmask.prompt_size = initLength;
-  bitmask.layer_size = initLength;
-  // std::cout << "see bit mask" << bitmask.prompt_size << "\n";
-  // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[0]) << "\n";
-  // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[1]) << "\n";
-  // std::cout << "see bit mask" << std::bitset<64>(bitmask.mask[2]) << "\n";
-}
-
-// prepare next init
-void RequestManager::update_bitmask(BatchConfig::BitMask &bitmask,
-                                    int initLength,
-                                    int non_tree_size) {
-  // assert(initLength == 1);
-  // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100,
-  // t4: 0000000..1000
-  assert(initLength <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM &&
-         "do not support tree size > 64");
-  assert(initLength >= 1 && "verified token num should >= 1");
-
-  // std::cout << "non tree size: " << non_tree_size << ", "
-  //           << bitmask.non_tree_cache_size << "\n";
-
-  bitmask.non_tree_cache_size = non_tree_size + initLength - 1;
-  bitmask.tree_or_prompt_size = 1;
-  bitmask.layer_size = initLength;
-  // std::cout << "non_tree_size: " << non_tree_size << "\n";
-  bitmask.prompt_size = 1;
-  for (int i = 0; i < bitmask.prompt_size; i++) {
-    for (int j = i; j < bitmask.prompt_size; j++) {
-      bitmask.mask[i] |= (1 << j);
-    }
-  }
-
-  // std::cout << "see bit mask update" << bitmask.prompt_size << "\n";
-  // std::cout << "see bit mask update" << std::bitset<64>(bitmask.mask[0])
-  //           << "\n";
-}
-// TO BE REMOVED: END
-
 void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
   // This method is called by update_llm_verify_results when there are new
   // request to load into the batch
@@ -1048,170 +999,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 /* --------- Bitmask Related Functions --------- */
 
-// TO BE REMOVED: START
-std::vector<std::pair<BatchConfig::TokenId, int>>
-    RequestManager::traverse_verify_tree(
-        size_t guid,
-        std::vector<std::pair<BatchConfig::TokenId, int>> const
-            &inputSerializedTree,
-        std::vector<std::pair<BatchConfig::TokenId, int>> const
-            &outputSerializedTree) {
-  std::vector<std::pair<TreeSearchBatchConfig::TokenId, int>> verifiedTree;
-  // verifiedTree.push_back(inputSerializedTree.at(0));
-  std::vector<std::pair<int, int>> new_committed_tokens =
-      std::vector<std::pair<int, int>>();
-
-  log_req_mgr.print("Input tree size (%zu) Output tree size (%zu)",
-                    inputSerializedTree.size(),
-                    outputSerializedTree.size());
-  { // Input tree
-    std::ostringstream oss;
-    // inputSerializedTree is the dfs_tree_inputs_map[guid] array og (token
-    // id, depth) pairs
-    for (auto const &pair : inputSerializedTree) {
-      oss << " " << pair.second << ":" << pair.first;
-      // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
-    }
-    log_req_mgr.print("Input tree:%s", oss.str().c_str());
-  }
-  { // Output tree
-    // log_req_mgr.print("========Output============");
-    // outputSerializedTree is an array of (token id, depth + 1) pairs
-    std::ostringstream oss;
-    for (auto const &pair : outputSerializedTree) {
-      // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
-      oss << " " << pair.second << ":" << pair.first;
-    }
-    log_req_mgr.print("Output tree:%s", oss.str().c_str());
-  }
-  {
-    // log_req_mgr.print("========Committed============");
-    //  committed_tokens[guid] is an array of (depth, result_index) pairs for
-    //  the given request
-    std::ostringstream oss;
-    for (auto const &pair : committed_tokens.at(guid)) {
-      // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
-      oss << " " << pair.second << ":" << pair.first;
-    }
-    log_req_mgr.print("Committed tokens:%s", oss.str().c_str());
-  }
-
-  // It's safe to have inputSerializedTree.size() >
-  // outputSerializedTree.size() In this case the inputSeriedTree ends with
-  // padding 0s
-  assert(inputSerializedTree.size() >= outputSerializedTree.size());
-
-  int *treeLayers = new int[inputSerializedTree.size()];
-  int node_num = 1;
-  int layer_num = 0;
-  for (int token_id = 0; token_id < inputSerializedTree.size(); token_id++) {
-    if (token_id == (inputSerializedTree.size() - 1) ||
-        inputSerializedTree.at(token_id + 1).second !=
-            inputSerializedTree.at(token_id).second) {
-      treeLayers[layer_num] = node_num;
-      layer_num += 1;
-      node_num = 1;
-    } else {
-      node_num++;
-    }
-  }
-
-  // to avoid branch switch when same tokens in input tree.
-  // todo, only checked for N->1->1->1 cases
-
-  bool findFirst = false;
-  layer_num = -1;
-  int first_layer_slot = 0;
-  int first_layer_slot_total = 0;
-  int processed_whole_layer_tokens = 0;
-
-  for (int i = 0; i < outputSerializedTree.size(); i++) {
-    auto input = inputSerializedTree.at(i);
-    auto output = outputSerializedTree.at(i);
-
-    if (i == 0 || inputSerializedTree.at(i - 1).second !=
-                      inputSerializedTree.at(i).second) {
-      layer_num += 1;
-      processed_whole_layer_tokens += i == 0 ? 0 : treeLayers[layer_num - 1];
-    }
-
-    if (i == 0) {
-      verifiedTree.push_back(output);
-
-      new_committed_tokens.push_back(std::make_pair(
-          input.second,
-          committed_tokens.at(guid).at(i).second)); // <input_abs_depth,
-                                                    // input_index_in_batch>
-      // std::cout << committed_tokens.at(guid).at(i).first << ", "
-      //           << committed_tokens.at(guid).at(i).second << std::endl;
-      // std::cout << input.first << ", " << input.second << std::endl;
-
-      assert(committed_tokens.at(guid).at(i).first == input.second);
-      continue;
-    }
-
-    if (input.first == verifiedTree.back().first &&
-        input.second == verifiedTree.back().second) {
-      if (findFirst) {
-        // must in this branch.
-        int layer_slot = i - processed_whole_layer_tokens;
-        int layer_slot_total = treeLayers[layer_num];
-        if ((first_layer_slot == layer_slot)) {
-          verifiedTree.push_back(output);
-          new_committed_tokens.push_back(std::make_pair(
-              input.second, committed_tokens.at(guid).at(i).second));
-          // at this point, you'll not go other branches
-          // std::cout << "verify tree push back: " << output.first
-          //           << ", tree size is: " << verifiedTree.size()
-          //           << ", ??: " << input.first << ", " << input.second <<
-          //           "\n";
-
-        } else {
-          printf("not correct slot\n");
-        }
-      } else {
-        verifiedTree.push_back(output);
-        first_layer_slot = i - processed_whole_layer_tokens;
-        first_layer_slot_total = treeLayers[layer_num];
-        findFirst = true;
-        new_committed_tokens.push_back(std::make_pair(
-            input.second,
-            committed_tokens.at(guid).at(i).second)); // <input_abs_depth,
-                                                      // input_index_in_batch>
-        // at this point, you'll not go other branches
-        // std::cout << "verify tree push back: " << output.first
-        //           << ", tree size is: " << verifiedTree.size()
-        //           << ", ??: " << input.first << ", " << input.second <<
-        //           "\n";
-      }
-
-      assert(committed_tokens.at(guid).at(i).first == input.second);
-    }
-  }
-  committed_tokens[guid] = new_committed_tokens;
-  {
-    // log_req_mgr.print("========Verified============");
-    std::ostringstream oss;
-    for (auto const &pair : verifiedTree) {
-      // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
-      oss << " " << pair.second << ":" << pair.first;
-    }
-    log_req_mgr.print("Verified:%s", oss.str().c_str());
-  }
-  {
-    // log_req_mgr.print("========New Committed============");
-    std::ostringstream oss;
-    for (auto const &pair : committed_tokens.at(guid)) {
-      // log_req_mgr.print("(%d, %d)", pair.first, pair.second);
-      oss << " " << pair.second << ":" << pair.first;
-    }
-    log_req_mgr.print("New committed:%s", oss.str().c_str());
-  }
-
-  return verifiedTree;
-}
-// TO BE REMOVED: END
-
 void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
   int llm_result_offset = 0;

From 9e55600a352c7771f73b5df9a5e708e37e8243b3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 26 Apr 2024 00:27:33 +0800
Subject: [PATCH 133/667] chore: replace BatchConfig::current_phase with
 prompt_phase (bool); remove unused current_depth

---
 src/ops/beam_topk.cpp                        |  3 +--
 src/ops/beam_topk.cu                         |  3 +--
 src/ops/inc_multihead_self_attention.cu      | 10 +++++-----
 src/ops/spec_inc_multihead_self_attention.cu | 11 ++++-------
 src/ops/tree_inc_multihead_self_attention.cu |  5 ++---
 5 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp
index 948fdd110..5ff3c29ea 100644
--- a/src/ops/beam_topk.cpp
+++ b/src/ops/beam_topk.cpp
@@ -600,8 +600,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
                       tokens_per_request.data(),
                       sizeof(int) * beam_num_blocks,
                       hipMemcpyHostToDevice));
-  // int depth =
-  //     bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth;
+
   beam_topk_forward_kernel<<<beam_num_blocks, num_shards, 0, stream>>>(
       input_ptr,
       shared_memory_size,
diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu
index 09c09c2e3..606f08997 100644
--- a/src/ops/beam_topk.cu
+++ b/src/ops/beam_topk.cu
@@ -641,8 +641,7 @@ void BeamTopK::forward_kernel(BeamTopKMeta const *m,
                             sizeof(int) * beam_num_blocks,
                             cudaMemcpyHostToDevice,
                             stream));
-  // int depth =
-  //     bc->beamRequestsInfo[bc->tokensInfo[0].request_index].current_depth;
+
   beam_num_blocks = bc->num_active_tokens();
   beam_topk_forward_kernel<<<beam_num_blocks, num_shards, 0, stream>>>(
       input_ptr,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 1af539852..50d8579e1 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -837,14 +837,14 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      stream);
   update_kv_cache_kernel<DT>(m, bc, stream);
 
-  if (bc->current_phase == BatchConfig::ExecutionPhase::GENERATION) {
-    // phase 3: Compute attention score for generation tokens
-    compute_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
-  } else if (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT) {
+  if (bc->prompt_phase) {
     // phase 3: Compute attention score for prompt tokens;
     compute_attention_kernel_prompt(
         m, bc, shard_id, bias_ptr, weight_ptr, stream);
+  } else {
+    // phase 3: Compute attention score for generation tokens
+    compute_attention_kernel_generation<DT>(
+        m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
   // compute output production and bias together for all tokens
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 15b3f0a57..5c047de79 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -329,7 +329,6 @@ __global__ void spec_inc_store_kv_cache(
     int vProjSize,
     int num_tokens,
     int max_seq_len,
-    bool is_root,
     int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
     int token_idx = i / (hidden_size);
@@ -372,7 +371,6 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                             TreeSearchBatchConfig const *bc,
                             cudaStream_t stream) {
   int num_tokens = bc->num_active_tokens();
-  int curr_depth = bc->current_depth;
   if (num_tokens > 0) {
     int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens;
     spec_inc_store_kv_cache<<<GET_BLOCKS(parallelism),
@@ -391,7 +389,6 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
         num_tokens,
         BatchConfig::max_sequence_length() +
             BatchConfig::max_spec_tree_token_num(),
-        /*root*/ curr_depth == 0,
         m->hidden_size);
   }
 }
@@ -731,12 +728,12 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 
   // phase 3: Compute attention score
   // 3 kernels for pahse 3: matmul1 - softmax - matmal2
-  if (bc->current_phase == BatchConfig::ExecutionPhase::GENERATION) {
-    compute_spec_inc_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
-  } else if (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT) {
+  if (bc->prompt_phase) {
     compute_attention_kernel_prompt(
         m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
+  } else {
+    compute_spec_inc_attention_kernel_generation<DT>(
+        m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
   // compute output production and bias together for all tokens
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 357bf7aa5..ab36c15bc 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -857,7 +857,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
       m->hidden_size);
 
   dim3 grid(m->num_q_heads, bc->num_active_requests());
-  bool const prompt_phase = (bc->current_phase == BatchConfig::ExecutionPhase::PROMPT);
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   // 0->qk production size, 1->total shared size
@@ -865,11 +864,11 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   if (per_head_size == 64) {
     constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
     LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, prompt_phase);
+        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, bc->prompt_phase);
   } else if (per_head_size == 128) {
     constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
     LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, prompt_phase);
+        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, bc->prompt_phase);
   } else {
     assert(false && "a unsupported head size");
   }

From c2cc2d9b3c717ebd100630a568a42106f479b763 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyuwang@andrew.cmu.edu>
Date: Thu, 25 Apr 2024 22:21:21 -0400
Subject: [PATCH 134/667] fix style

---
 deps/legion                        |  2 +-
 include/flexflow/request_manager.h |  2 +-
 src/runtime/model.cc               | 12 +++++++-----
 src/runtime/request_manager.cc     | 23 ++++++++++++++---------
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/deps/legion b/deps/legion
index 626b55689..24e8c4523 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c
+Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 671763ef2..9577081b7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -311,7 +311,7 @@ class RequestManager {
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
 
-  // rm state 
+  // rm state
   std::mutex rm_state_mutex;
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
 
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index b3f499684..bc1e29fb2 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4459,16 +4459,18 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<BatchConfig,
-                                        RequestManager::get_next_batch_config_task>(
+      Runtime::preregister_task_variant<
+          BatchConfig,
+          RequestManager::get_next_batch_config_task>(
           registrar, "RequestManager Get Next Batch Config Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<BatchConfig,
-                                     RequestManager::get_next_batch_config_task>(
-          registrar);
+      runtime
+          ->register_task_variant<BatchConfig,
+                                  RequestManager::get_next_batch_config_task>(
+              registrar);
     }
   }
   // RequestManager prepare_next_batch
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3a54dc3c4..64795e2c5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -394,14 +394,18 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
     switch (request_manager_status) {
       case PREFILLING:
-        if (request.initial_len == request.llm_cache_size) { // all prompt tokens are prefilled
-          request.tokens.push_back(result.token_ids[request.num_tokens_in_batch]);
+        if (request.initial_len ==
+            request.llm_cache_size) { // all prompt tokens are prefilled
+          request.tokens.push_back(
+              result.token_ids[request.num_tokens_in_batch]);
           request_manager_status = DECODING;
         }
         break;
-      case DECODING: 
-        request.tokens.push_back(result.token_ids[request.first_token_offset_in_batch]);
-        if (request.tokens.size() == request.max_sequence_length) { // request is completed
+      case DECODING:
+        request.tokens.push_back(
+            result.token_ids[request.first_token_offset_in_batch]);
+        if (request.tokens.size() ==
+            request.max_sequence_length) { // request is completed
           request.status = Request::COMPLETED;
           trigger_request_completion_future(request.guid);
           guid_of_requests[i] = INVALID_GUID;
@@ -450,7 +454,8 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   // Per Request Info
   bc.requestsInfo[request_index].first_token_depth_in_request = 0;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(bc.num_tokens, (int)new_request.tokens.size());
+  bc.requestsInfo[request_index].num_tokens_in_batch =
+      std::min(bc.num_tokens, (int)new_request.tokens.size());
 
   bc.request_completed[request_index] = false;
 
@@ -458,12 +463,12 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   new_request.num_tokens_in_batch = 0;
 
   // Delete those after update BatchConfig
-  bc.requestsInfo[request_index].max_sequence_length = new_request.max_sequence_length;
+  bc.requestsInfo[request_index].max_sequence_length =
+      new_request.max_sequence_length;
   bc.requestsInfo[request_index].request_guid = new_request.guid;
   bc.requestsInfo[request_index].prompt_phase = true;
   bc.requestsInfo[request_index].batch_config_request_id = request_index;
 
-
   // Per Token Info
   for (int j = 0; j < bc.requestsInfo[request_index].num_tokens_in_batch; j++) {
     int depth = bc.requestsInfo[request_index].first_token_depth_in_request + j;
@@ -475,7 +480,7 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
     new_request.llm_cache_size++;
     new_request.num_tokens_in_batch++;
   }
-  
+
   return bc;
 }
 

From 29f48fd734fc226fcff48d16c4f744d8bdd199e0 Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyuwang@andrew.cmu.edu>
Date: Thu, 25 Apr 2024 23:09:13 -0400
Subject: [PATCH 135/667] Using new serve decoding task.

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 58 ++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 9577081b7..56f8d530f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -151,6 +151,7 @@ class RequestManager {
 
   void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
+  void serve_decoding(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
   RequestGuid register_new_request(std::string const &prompt,
                                    int max_sequence_length);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 64795e2c5..892510f57 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -46,7 +46,7 @@ std::string LoadBytesFromFile(std::string const &path) {
 RequestManager::RequestManager()
     : background_server_status(INITIALIZED), verbose(false),
       next_available_guid(1000000), num_processed_requests(0),
-      total_request_run_time(0.0f) {
+      total_request_run_time(0.0f), request_manager_status(PREFILLING) {
   // The following config parameters are set
   // during ffmodel.compile()
   // Initialize them to -1 to make sure no one
@@ -427,6 +427,8 @@ BatchConfig RequestManager::prepare_next_batch() {
     case DECODING:
       return prepare_decoding_batch();
     default:
+      std::cout << "Invalid request manager status: " << request_manager_status
+                << std::endl;
       assert(false);
   }
 }
@@ -2545,7 +2547,8 @@ void RequestManager::background_serving_task(
   }
   if (rm->get_num_ssms() == 0) {
     // No SSMs: perform incremental decoding
-    rm->serve_incr_decoding(llm);
+    // rm->serve_incr_decoding(llm);
+    rm->serve_decoding(llm);
   } else {
     // Registered SSMs: perform speculative inference
     rm->serve_spec_infer(llm);
@@ -2553,6 +2556,57 @@ void RequestManager::background_serving_task(
 }
 
 /*static*/
+void RequestManager::serve_decoding(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  // Compile the llm
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  im->compile_model_and_allocate_buffer(llm);
+  assert(im->model_weights_loaders.find(llm) !=
+         im->model_weights_loaders.end());
+  // Load model weights
+  im->model_weights_loaders[llm]->load_weights(llm);
+  // init operators
+  im->init_operators_inference(llm);
+  // Legion futures for inc_decoding and spec_infer
+  InferenceResultFuture last_irf;
+  {
+    // Initialize futures for incr decoding
+    InferenceResult ir;
+    last_irf = Future::from_value<InferenceResult>(ir);
+  }
+
+  std::queue<InferenceResultFuture> batch_pipeline;
+  { batch_pipeline.push(last_irf); }
+
+  while (!is_background_server_terminated()) {
+
+    if (batch_pipeline.size() >= 4) {
+      // Block here to avoid launching too many batches
+      auto const &ir = batch_pipeline.front();
+      ir.get_void_result();
+    }
+    // deque finished batches
+    while (batch_pipeline.size() > 1) {
+      auto const &ir = batch_pipeline.front();
+      if (ir.is_ready()) {
+        batch_pipeline.pop();
+      } else {
+        break;
+      }
+    }
+    runtime->begin_trace(ctx, 12346 /*trace_id*/);
+    InferenceResultFuture next_ir = batch_pipeline.back();
+    BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
+    FutureMap fm = im->inference(llm, 0, bcf);
+    assert(fm.get_future_map_domain().get_volume() == 1);
+    InferenceResultFuture irf = fm.get_future(0);
+    batch_pipeline.push(irf);
+    last_irf = irf;
+    runtime->end_trace(ctx, 12346 /*trace_id*/);
+  }
+}
+
 void RequestManager::serve_incr_decoding(FFModel *llm) {
   Context ctx = llm->config.lg_ctx;
   Runtime *runtime = llm->config.lg_hlr;

From 1a1e069019ac2311b530be83e8df8b1839104de6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 26 Apr 2024 14:36:39 -0400
Subject: [PATCH 136/667] Modified the serve_spec_infer function based on the
 new APIs.

---
 src/runtime/request_manager.cc | 67 +++++++++++-----------------------
 1 file changed, 22 insertions(+), 45 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index dd4139cfb..f90b1a7fc 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1307,67 +1307,44 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     im->init_operators_inference(ssm);
   }
 
-  std::queue<std::pair<TreeVerifyBatchConfigFuture, InferenceResultFuture>>
-      batch_pipeline;
-  // Legion futures for inc_decoding and spec_infer
-  TreeVerifyBatchConfigFuture last_tree_bcf;
-  InferenceResultFuture last_tree_irf;
+  InferenceResultFuture last_irf;
   {
-    // Initialize futures for spec infer
-    TreeVerifyBatchConfig tree_bc;
-    InferenceResult tree_ir;
-    last_tree_bcf = Future::from_value<TreeVerifyBatchConfig>(tree_bc);
-    last_tree_irf = Future::from_value<InferenceResult>(tree_ir);
+    // Initialize futures for incr decoding
+    InferenceResult ir;
+    last_irf = Future::from_value<InferenceResult>(ir);
   }
-  batch_pipeline.push(std::make_pair(last_tree_bcf, last_tree_irf));
+  std::queue<InferenceResultFuture> batch_pipeline;
+  { batch_pipeline.push(last_irf); }
 
   while (!is_background_server_terminated()) {
 
     if (batch_pipeline.size() >= 4) {
       // Block here to avoid launching too many batches
-      auto const &batch = batch_pipeline.front();
-      batch.second.get_void_result();
+      auto const &ir = batch_pipeline.front();
+      ir.get_void_result();
     }
     // deque finished batches
     while (batch_pipeline.size() > 1) {
-      auto const &batch = batch_pipeline.front();
-      if (batch.second.is_ready()) {
+      auto const &ir = batch_pipeline.front();
+      if (ir.is_ready()) {
         batch_pipeline.pop();
       } else {
         break;
       }
     }
-    auto const &next_batch = batch_pipeline.back();
-    TreeSearchBatchConfigFuture beam_bcf = prepare_first_spec_batch_config(
-        next_batch.first, next_batch.second, 0, ctx, runtime);
-    std::vector<TreeSearchBatchConfigFuture> beam_bcf_vec(get_num_ssms());
-    for (size_t ssm_id = 0; ssm_id < get_num_ssms(); ssm_id++) {
-      beam_bcf_vec[ssm_id] = beam_bcf;
-    }
-    runtime->begin_trace(ctx, 12345 /*trace_id*/);
-
-    for (size_t i = 0; i < get_num_ssms(); i++) {
-      for (int depth = 0; depth < TreeSearchBatchConfig::MAX_BEAM_DEPTH;
-           depth++) {
-        beam_bcf = beam_bcf_vec[i];
-
-        FutureMap fm = im->inference(get_ssm_model(i), 0, beam_bcf_vec[i]);
-        assert(fm.get_future_map_domain().get_volume() == 1);
-        SsmInferenceResultFuture beam_irf = fm.get_future(0);
-        beam_bcf_vec[i] =
-            prepare_next_batch_beam(beam_bcf_vec[i], beam_irf, ctx, runtime);
-      }
-    }
-    // Token Tree Verification
-    {
-      TreeVerifyBatchConfigFuture tree_bcf =
-          prepare_next_batch_verify(beam_bcf_vec, ctx, runtime);
-      FutureMap fm = im->inference(llm, 0, tree_bcf);
+    runtime->begin_trace(ctx, 12346 /*trace_id*/);
+    InferenceResultFuture next_ir = batch_pipeline.back();
+    BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
+    if (request_manager_status == LLM_VERIFY) {
+      FutureMap fm = im->inference(llm, 0, bcf);
+      assert(fm.get_future_map_domain().get_volume() == 1);
+      InferenceResultFuture irf = fm.get_future(0);
+      batch_pipeline.push(irf);
+    } else {
+      FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
-      InferenceResultFuture tree_irf = fm.get_future(0);
-      batch_pipeline.push(std::make_pair(tree_bcf, tree_irf));
-      last_tree_bcf = tree_bcf;
-      last_tree_irf = tree_irf;
+      InferenceResultFuture irf = fm.get_future(0);
+      batch_pipeline.push(irf);
     }
     runtime->end_trace(ctx, 12345 /*trace_id*/);
   }

From 04967095263100b518388247ea5ab44d1c70076e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 26 Apr 2024 14:53:36 -0400
Subject: [PATCH 137/667] Fix bugs in pruning token tree.

---
 src/runtime/request_manager.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f90b1a7fc..cb5542764 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1410,6 +1410,8 @@ void RequestManager::add_root_to_spec_token_tree(
   speculative_token_tree.add_layer();
   auto node_ptr = std::make_shared<TokenTreeNode>(token_id, -1, 1.0);
   speculative_token_tree.tree_layers[0].push_back(node_ptr);
+  speculative_token_tree.tree_size++;
+  speculative_token_tree.tree_size_including_pruned++;
 }
 
 bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
@@ -1532,6 +1534,7 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
   for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
     if ((*it)->pruned) {
       last_layer.erase(it);
+      request.speculative_token_trees[0].tree_size--;
     }
   }
 }

From 15616c9acdb5d72f06b68bcaa172be5c3205679d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 29 Apr 2024 16:25:35 -0400
Subject: [PATCH 138/667] 1. Re-implemented LoadBytesFromFile. 2. Adjusted
 prepare_prefilling_batch and prepare_decoding_batch to the current design.

---
 src/runtime/request_manager.cc | 57 ++++++++++++++--------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cb5542764..60211236c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -33,13 +33,13 @@ LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
 std::string LoadBytesFromFile(std::string const &path) {
   std::ifstream fs(path, std::ios::in | std::ios::binary);
-  assert(!fs.fail() && "no such file");
-  std::string data;
+  assert(fs.is_open() && "Failed to open file for reading.");
   fs.seekg(0, std::ios::end);
-  size_t size = static_cast<size_t>(fs.tellg());
+  size_t size = fs.tellg();
   fs.seekg(0, std::ios::beg);
-  data.resize(size);
-  fs.read(data.data(), size);
+  std::string data(size, '\0');
+  fs.read(&data[0], size);
+  assert(!fs.fail() && "Failed to read data from file.");
   return data;
 }
 
@@ -464,12 +464,15 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   int request_index = get_empty_request_index();
   assert(request_index != -1);
 
+  // The following should be moved to update_inference_results()
   Request new_request = pending_request_queue.front();
   pending_request_queue.pop();
   all_requests[new_request.guid] = new_request;
   guid_of_requests[request_index] = new_request.guid;
+  request_available[request_index] = true;
 
   // Per Request Info
+  // TODO: what if the prompt phase needs multiple runs to finish?
   bc.requestsInfo[request_index].first_token_index_in_request = 0;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   bc.requestsInfo[request_index].num_tokens_in_batch =
@@ -480,20 +483,16 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   new_request.first_token_offset_in_batch = 0;
   new_request.num_tokens_in_batch = 0;
 
-  // Delete those after update BatchConfig
-  bc.requestsInfo[request_index].max_sequence_length =
-      new_request.max_sequence_length;
-  bc.requestsInfo[request_index].request_guid = new_request.guid;
-  bc.requestsInfo[request_index].prompt_phase = true;
-  bc.requestsInfo[request_index].batch_config_request_id = request_index;
-
   // Per Token Info
-  for (int j = 0; j < bc.requestsInfo[request_index].num_tokens_in_batch; j++) {
-    int depth = bc.requestsInfo[request_index].first_token_depth_in_request + j;
-    bc.tokensInfo[j].request_index = request_index;
-    bc.tokensInfo[j].abs_depth_in_request = depth;
+  for (int token_idx = 0;
+       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
+       token_idx++) {
+    int depth =
+        bc.requestsInfo[request_index].first_token_index_in_request + token_idx;
     assert(depth < new_request.tokens.size());
-    bc.tokensInfo[j].token_id = new_request.tokens[depth];
+    bc.tokensInfo[token_idx].request_index = request_index;
+    bc.tokensInfo[token_idx].abs_index_in_request = depth;
+    bc.tokensInfo[token_idx].token_id = new_request.tokens[depth];
 
     new_request.llm_cache_size++;
     new_request.num_tokens_in_batch++;
@@ -507,42 +506,32 @@ BatchConfig RequestManager::prepare_decoding_batch() {
   // fills the last token of each request in the current batch to the
   // BatchConfig for the LLM to decode.
 
-  // TODO:
-  // 1. Adept this function to the new design
-  // 2. Move the following part to the update_inference_results() function
-  // 3. Change the BatchConfig::prompt_phase
-
   BatchConfig bc;
-  bc.num_tokens = 0;
+  bc.prompt_phase = false;
 
   for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-    if (guid_of_requests[i] == INVALID_GUID) {
+    if (!request_available[i]) {
       continue;
     }
+    bc.request_available[i] = true;
+    bc.num_available_requests++;
 
     Request &request = all_requests[guid_of_requests[i]];
 
     // Per Request Info
-    bc.requestsInfo[i].first_token_depth_in_request = request.llm_cache_size;
+    bc.requestsInfo[i].first_token_index_in_request = request.llm_cache_size;
     bc.requestsInfo[i].first_token_offset_in_batch = bc.num_tokens;
     bc.requestsInfo[i].num_tokens_in_batch = 1;
 
-    bc.request_completed[i] = false;
-
     request.first_token_offset_in_batch = bc.num_tokens;
     request.num_tokens_in_batch = 1;
 
-    // Delete those after update BatchConfig
-    bc.requestsInfo[i].max_sequence_length = request.max_sequence_length;
-    bc.requestsInfo[i].request_guid = request.guid;
-    bc.requestsInfo[i].prompt_phase = false;
-    bc.requestsInfo[i].batch_config_request_id = i;
-
     // Per Token Info
     bc.tokensInfo[bc.num_tokens].request_index = i;
-    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
+    // TODO: this should be updated in the update_inference_results() function
     request.llm_cache_size++;
     bc.num_tokens++;
   }

From bfb1da72b661198b1efe7e2269f5eca2a05803b9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 29 Apr 2024 23:00:07 -0400
Subject: [PATCH 139/667] Fix some compilation errors.

---
 include/flexflow/batch_config.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 24122f394..4b9b115d4 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -83,7 +83,8 @@ class BatchConfig {
 
       Bitset(Bitset const &other) {
         // Copy the entire array of bits from 'other' to this object
-        std::memcpy(bits, other.bits, sizeof(bits));
+        std::copy(
+            std::begin(other.bits), std::end(other.bits), std::begin(bits));
       }
 
       void set_bit(size_t pos) {
@@ -105,11 +106,11 @@ class BatchConfig {
       }
 
       void clear() {
-        std::memset(bits, 0, sizeof(bits));
+        std::fill(std::begin(bits), std::end(bits), 0);
       }
 
     private:
-      uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 8]; // Array to hold 256 bits
+      uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 64];
     };
 
   public:

From 67ec56810e2279011e02f2854de86c6d74fce668 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 16:00:48 -0400
Subject: [PATCH 140/667] Modified register_new_request.

---
 src/runtime/request_manager.cc | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 60211236c..93641a43d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -205,10 +205,8 @@ RequestManager::RequestGuid
               << std::endl;
   } else {
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
-    for (int i = 0; i < get_num_ssms(); i++) {
-      BeamTree beam_tree = BeamTree{};
-      request.beam_trees.push_back(beam_tree);
-    }
+    assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
+    init_token_tree(request.guid);
   }
 
   pending_request_queue.push(request);
@@ -269,10 +267,8 @@ RequestManager::RequestGuid
               << std::endl;
   } else {
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
-    for (int i = 0; i < get_num_ssms(); i++) {
-      BeamTree beam_tree = BeamTree{};
-      request.beam_trees.push_back(beam_tree);
-    }
+    assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
+    init_token_tree(request.guid);
   }
 
   pending_request_queue.push(request);

From 01cf4850ba2456ff2d1137b8495901703107f272 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 16:17:21 -0400
Subject: [PATCH 141/667] Disabled the serve_incr_decoding method, it will be
 removed.

---
 include/flexflow/request_manager.h |   2 +-
 src/runtime/request_manager.cc     | 115 +++++++++++++++--------------
 2 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5fca893cf..33699460f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -26,7 +26,7 @@
 namespace FlexFlow {
 
 class FFModel;
-class BeamTree;
+class TokenTree;
 class RequestManager;
 using tokenizers::Tokenizer;
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 93641a43d..efde88917 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1208,62 +1208,65 @@ void RequestManager::serve_decoding(FFModel *llm) {
   }
 }
 
-void RequestManager::serve_incr_decoding(FFModel *llm) {
-  Context ctx = llm->config.lg_ctx;
-  Runtime *runtime = llm->config.lg_hlr;
-  // Compile the llm
-  InferenceManager *im = InferenceManager::get_inference_manager();
-  im->compile_model_and_allocate_buffer(llm);
-  assert(im->model_weights_loaders.find(llm) !=
-         im->model_weights_loaders.end());
-  // Load model weights
-  im->model_weights_loaders[llm]->load_weights(llm);
-  // init operators
-  im->init_operators_inference(llm);
-  // Legion futures for inc_decoding and spec_infer
-  BatchConfigFuture last_bcf;
-  InferenceResultFuture last_irf;
-  {
-    // Initialize futures for incr decoding
-    BatchConfig bc;
-    InferenceResult ir;
-    last_bcf = Future::from_value<BatchConfig>(bc);
-    last_irf = Future::from_value<InferenceResult>(ir);
-  }
-
-  std::queue<std::pair<BatchConfigFuture, InferenceResultFuture>>
-      batch_pipeline;
-  { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); }
-
-  while (!is_background_server_terminated()) {
-
-    if (batch_pipeline.size() >= 4) {
-      // Block here to avoid launching too many batches
-      auto const &batch = batch_pipeline.front();
-      batch.second.get_void_result();
-    }
-    // deque finished batches
-    while (batch_pipeline.size() > 1) {
-      auto const &batch = batch_pipeline.front();
-      if (batch.second.is_ready()) {
-        batch_pipeline.pop();
-      } else {
-        break;
-      }
-    }
-    runtime->begin_trace(ctx, 12346 /*trace_id*/);
-    auto const &next_batch = batch_pipeline.back();
-    BatchConfigFuture bcf =
-        prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime);
-    FutureMap fm = im->inference(llm, 0, bcf);
-    assert(fm.get_future_map_domain().get_volume() == 1);
-    InferenceResultFuture irf = fm.get_future(0);
-    batch_pipeline.push(std::make_pair(bcf, irf));
-    last_bcf = bcf;
-    last_irf = irf;
-    runtime->end_trace(ctx, 12346 /*trace_id*/);
-  }
-}
+// TO BE REMOVED: START
+// void RequestManager::serve_incr_decoding(FFModel *llm) {
+//   Context ctx = llm->config.lg_ctx;
+//   Runtime *runtime = llm->config.lg_hlr;
+//   // Compile the llm
+//   InferenceManager *im = InferenceManager::get_inference_manager();
+//   im->compile_model_and_allocate_buffer(llm);
+//   assert(im->model_weights_loaders.find(llm) !=
+//          im->model_weights_loaders.end());
+//   // Load model weights
+//   im->model_weights_loaders[llm]->load_weights(llm);
+//   // init operators
+//   im->init_operators_inference(llm);
+//   // Legion futures for inc_decoding and spec_infer
+//   BatchConfigFuture last_bcf;
+//   InferenceResultFuture last_irf;
+//   {
+//     // Initialize futures for incr decoding
+//     BatchConfig bc;
+//     InferenceResult ir;
+//     last_bcf = Future::from_value<BatchConfig>(bc);
+//     last_irf = Future::from_value<InferenceResult>(ir);
+//   }
+
+//   std::queue<std::pair<BatchConfigFuture, InferenceResultFuture>>
+//       batch_pipeline;
+//   { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); }
+
+//   while (!is_background_server_terminated()) {
+
+//     if (batch_pipeline.size() >= 4) {
+//       // Block here to avoid launching too many batches
+//       auto const &batch = batch_pipeline.front();
+//       batch.second.get_void_result();
+//     }
+//     // deque finished batches
+//     while (batch_pipeline.size() > 1) {
+//       auto const &batch = batch_pipeline.front();
+//       if (batch.second.is_ready()) {
+//         batch_pipeline.pop();
+//       } else {
+//         break;
+//       }
+//     }
+//     runtime->begin_trace(ctx, 12346 /*trace_id*/);
+//     auto const &next_batch = batch_pipeline.back();
+//     BatchConfigFuture bcf =
+//         prepare_next_batch(next_batch.first, next_batch.second, ctx,
+//         runtime);
+//     FutureMap fm = im->inference(llm, 0, bcf);
+//     assert(fm.get_future_map_domain().get_volume() == 1);
+//     InferenceResultFuture irf = fm.get_future(0);
+//     batch_pipeline.push(std::make_pair(bcf, irf));
+//     last_bcf = bcf;
+//     last_irf = irf;
+//     runtime->end_trace(ctx, 12346 /*trace_id*/);
+//   }
+// }
+// TO BE REMOVED: END
 
 /*static*/
 void RequestManager::serve_spec_infer(FFModel *llm) {

From 93522a3f90829de6c12d52004eb5bbf09d6ddfc8 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 16:51:59 -0400
Subject: [PATCH 142/667] Add a virtual descructor to the class InferenceResult
 to make it a polymorphic class.

---
 include/flexflow/batch_config.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 4b9b115d4..08c0fb326 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -172,6 +172,7 @@ class TreeVerifyBatchConfig : public BatchConfig {
 struct InferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
+  virtual ~InferenceResult() = default;
 };
 
 class TreeSearchBatchConfig : public BatchConfig {

From e2be0768865ca255858b07306df4b38da8c47b93 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 16:53:24 -0400
Subject: [PATCH 143/667] 1. Added a field in request manager with type
 InferenceMode to indicate whether the request manager is performing
 incremental decoding or speculative decoding. 2. Re-arranged the APIs in
 request_manager, make some of them private.

---
 include/flexflow/request_manager.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 33699460f..d391805e9 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -181,6 +181,11 @@ class RequestManager {
     SERVING = 2002,
     TERMINATED = 2003,
   };
+  enum InferenceMode {
+    INCREMENTAL_DECODING = 3001,
+    SPECULATIVE_DECODING = 3002,
+  };
+
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
 
@@ -262,8 +267,6 @@ class RequestManager {
   BatchConfig get_next_batch_config(InferenceResult const &result);
   void update_inference_results(InferenceResult const &result);
   BatchConfig prepare_next_batch();
-  BatchConfig prepare_prefilling_batch();
-  BatchConfig prepare_decoding_batch();
 
   int get_num_active_requests();
   int get_empty_request_index();
@@ -276,6 +279,7 @@ class RequestManager {
   int max_sequence_length;
   State request_manager_status;
   BackgroundServerStatus background_server_status;
+  InferenceMode inference_mode;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -332,11 +336,18 @@ class RequestManager {
   };
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
+  /* ---------- Incremental Decoding Helper Functions ---------- */
+  bool update_llm_prefill_results(InferenceResult const &result);
+  void update_llm_decode_results(InferenceResult const &result);
+  BatchConfig prepare_prefilling_batch();
+  BatchConfig prepare_decoding_batch();
+  /* ---------- Incremental Decoding Helper Functions ---------- */
 
   /* ---------- Spec Decoding Helper Functions ---------- */
   bool update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
+  void update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
   TreeSearchBatchConfig prepare_next_spec_batch_config();

From 175fbcd4467cae6a49bfa06b656cf1eb8cb4105b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 16:55:46 -0400
Subject: [PATCH 144/667] Re-implemented the function update_inference_results.

---
 src/runtime/request_manager.cc | 48 ++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index efde88917..e95642bf8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -374,6 +374,49 @@ BatchConfig
   return prepare_next_batch();
 }
 
+void RequestManager::update_inference_results(InferenceResult const &result) {
+  // Update the inference results
+  std::lock_guard<std::mutex> const lock(rm_state_mutex);
+  switch (request_manager_status) {
+    case PREFILLING:
+      if (update_llm_prefill_results(result)) {
+        // This indicates that the prefilling phase finishes
+        if (inference_mode == INCREMENTAL_DECODING) {
+          request_manager_status = DECODING;
+        } else if (inference_mode == SPECULATIVE_DECODING) {
+          request_manager_status = SSM_SPEC;
+        } else {
+          assert(false && "Invalid inference mode.");
+        }
+      }
+      // else, continue the unfinished prefilling
+      break;
+    case DECODING:
+      update_llm_decode_results(result);
+      break;
+    case LLM_VERIFY:
+      if (update_llm_verify_results(result)) {
+        // A request completed after the verification
+        if (pending_request_queue.empty()) {
+          // No pending request to process, continue the speculation
+          request_manager_status = SSM_SPEC;
+        } else {
+          request_manager_status = PREFILLING;
+        }
+      }
+      break;
+    case SSM_SPEC:
+      SsmInferenceResult const &ssm_result =
+          dynamic_cast<SsmInferenceResult const &>(result);
+      if (update_ssm_inference_results(ssm_result)) {
+        // Stop condition for the speculation phase has been reached
+        request_manager_status = LLM_VERIFY;
+      }
+      // else, keep the current status
+      break;
+  }
+}
+
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
@@ -409,6 +452,11 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   }
 }
 
+bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
+}
+
+void RequestManager::update_llm_decode_results(InferenceResult const &result) {}
+
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 

From 85e0fe3aa9c3119b77944778335e32323acac1a7 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 17:21:30 -0400
Subject: [PATCH 145/667] Rename InferenceMode and inference_mode to
 DecodingMode and decoding_mode, respectively, to avoid name conflicts.

---
 include/flexflow/request_manager.h | 4 ++--
 src/runtime/request_manager.cc     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d391805e9..8f5a4684c 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -181,7 +181,7 @@ class RequestManager {
     SERVING = 2002,
     TERMINATED = 2003,
   };
-  enum InferenceMode {
+  enum DecodingMode {
     INCREMENTAL_DECODING = 3001,
     SPECULATIVE_DECODING = 3002,
   };
@@ -279,7 +279,7 @@ class RequestManager {
   int max_sequence_length;
   State request_manager_status;
   BackgroundServerStatus background_server_status;
-  InferenceMode inference_mode;
+  DecodingMode decoding_mode;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e95642bf8..91e4e45cc 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -381,9 +381,9 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     case PREFILLING:
       if (update_llm_prefill_results(result)) {
         // This indicates that the prefilling phase finishes
-        if (inference_mode == INCREMENTAL_DECODING) {
+        if (decoding_mode == INCREMENTAL_DECODING) {
           request_manager_status = DECODING;
-        } else if (inference_mode == SPECULATIVE_DECODING) {
+        } else if (decoding_mode == SPECULATIVE_DECODING) {
           request_manager_status = SSM_SPEC;
         } else {
           assert(false && "Invalid inference mode.");

From 41abe077577c27c3a8017ac61c6c9417df0c98e7 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 17:30:35 -0400
Subject: [PATCH 146/667] Modified load_batch_config_task according to the new
 BatchConfig classes.

---
 src/runtime/request_manager.cpp | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index 4f7f3e82c..2dc74b018 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -91,29 +91,12 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
-    TreeSearchBatchConfig const *beam_batch_config =
+    TreeSearchBatchConfig const *tree_search_batch_config =
         static_cast<TreeSearchBatchConfig const *>(batch_config);
 
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
-                             &(beam_batch_config->beamTokenInfo),
-                             sizeof(TreeSearchBatchConfig::beamTokenInfo),
-                             hipMemcpyHostToDevice,
-                             stream));
-
-    total_copy_size += sizeof(TreeSearchBatchConfig::beamTokenInfo);
-
-    checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
-                                 total_copy_size,
-                             &(beam_batch_config->beamRequestsInfo),
-                             sizeof(TreeSearchBatchConfig::beamRequestsInfo),
-                             hipMemcpyHostToDevice,
-                             stream));
-    total_copy_size += sizeof(TreeSearchBatchConfig::beamRequestsInfo);
-
-    checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
-                                 total_copy_size,
-                             &(beam_batch_config->causalMask),
+                             &(tree_search_batch_config->causalMask),
                              sizeof(BatchConfig::causalMask),
                              hipMemcpyHostToDevice,
                              stream));

From 795b7d60196d94da58faee6a3222f4f04389021b Mon Sep 17 00:00:00 2001
From: Linshuhuai <shuhuail@andrew.cmu.edu>
Date: Tue, 30 Apr 2024 18:14:48 -0400
Subject: [PATCH 147/667] implement create_llm_bitmask

---
 src/runtime/request_manager.cc | 35 ++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 60211236c..c09898687 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -987,6 +987,41 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // TODO: implement this function
   // 1. Create the bitmask based on the pruned request token tree
   // 2. Maintain all other fields
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &ssm_bitmask = request.causal_mask;
+  BatchConfig::BitMask &llm_bitmask = new BatchConfig::BitMask();
+  llm_bitmask.clear_bitmask(); // is it necessary?
+
+  // Set the mask for the root
+  bitmask.bit_mask[0].set_bit(0);
+  int parent_offset = 0;
+  int child_offset = 1;
+
+  // Traversal of pruned token tree
+  TokenTree &token_tree = request.speculative_token_trees[0];
+  for (auto const &tree_layer : token_tree.tree_layers) {
+    int child_idx = 0;
+    for (auto const &tree_node : tree_layer) {
+      if (tree_node->pruned == false) {
+        // Each child copy its parent's mask
+        llm_bitmask.bit_mask[child_offset + child_idx] =
+            llm_bitmask.bit_mask[parent_offset + tree_node->parent_pos];
+        // Each child attend to itself
+        bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
+                                                          child_idx);
+        child_idx++;
+      }
+    }
+    if (child_idx == 0) break;
+    parent_offset = child_offset;
+    child_offset += child_idx;
+  }
+
+  // Maintain other fields of llm_bitmask
+  llm_bitmask.non_tree_cache_size = ssm_bitmask.non_tree_cache_size; // not sure
+  llm_bitmask.current_layer_size = child_offset - parent_offset; // needed for llm_bitmask?
+  llm_bitmask.tree_or_prompt_size = child_offset;
+  return llm_bitmask;
 }
 /* --------- Bitmask Related Functions --------- */
 

From 9e1d153620e01f2c506dfb3875a4ad18cddaf9b3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 20:11:28 -0400
Subject: [PATCH 148/667] Modified the implementation of create_llm_bitmask.

---
 src/runtime/request_manager.cc | 51 ++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c520d2b7d..f4570a6e2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1031,40 +1031,43 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // TODO: implement this function
   // 1. Create the bitmask based on the pruned request token tree
   // 2. Maintain all other fields
-  Request &request = all_requests[guid];
-  BatchConfig::BitMask &ssm_bitmask = request.causal_mask;
-  BatchConfig::BitMask &llm_bitmask = new BatchConfig::BitMask();
-  llm_bitmask.clear_bitmask(); // is it necessary?
 
-  // Set the mask for the root
-  bitmask.bit_mask[0].set_bit(0);
-  int parent_offset = 0;
-  int child_offset = 1;
-
-  // Traversal of pruned token tree
+  Request &request = all_requests[guid];
   TokenTree &token_tree = request.speculative_token_trees[0];
+  BatchConfig::BitMask llm_bitmask = BatchConfig::BitMask();
+
+  int abs_index_in_tree = 0;
+  std::vector<int> parent_pos_2_abs_index;
+  std::vector<int> current_layer_abs_index;
   for (auto const &tree_layer : token_tree.tree_layers) {
-    int child_idx = 0;
     for (auto const &tree_node : tree_layer) {
+      current_layer_abs_index.push_back(abs_index_in_tree);
       if (tree_node->pruned == false) {
-        // Each child copy its parent's mask
-        llm_bitmask.bit_mask[child_offset + child_idx] =
-            llm_bitmask.bit_mask[parent_offset + tree_node->parent_pos];
-        // Each child attend to itself
-        bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
-                                                          child_idx);
-        child_idx++;
+        if (abs_index_in_tree == 0) {
+          // The root token, set itself
+          llm_bitmask.bit_mask[0].set_bit(0);
+        } else {
+          // Copy from the parent, and set itself
+          int parent_abs_index = parent_pos_2_abs_index[tree_node->parent_pos];
+          llm_bitmask.bit_mask[abs_index_in_tree] =
+              llm_bitmask.bit_mask[parent_abs_index];
+          llm_bitmask.bit_mask[abs_index_in_tree].set_bit(abs_index_in_tree);
+        }
+        abs_index_in_tree++;
       }
     }
-    if (child_idx == 0) break;
-    parent_offset = child_offset;
-    child_offset += child_idx;
+    parent_pos_2_abs_index.clear();
+    parent_pos_2_abs_index.swap(current_layer_abs_index);
   }
 
+  // A sanity check
+  assert(abs_index_in_tree == token_tree.tree_size);
+
   // Maintain other fields of llm_bitmask
-  llm_bitmask.non_tree_cache_size = ssm_bitmask.non_tree_cache_size; // not sure
-  llm_bitmask.current_layer_size = child_offset - parent_offset; // needed for llm_bitmask?
-  llm_bitmask.tree_or_prompt_size = child_offset;
+  llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size;
+  // We don't need to set llm_bitmask.current_layer_size and
+  // llm_bitmask.tree_or_prompt_size here because they are not used in LLM
+  // verification.
   return llm_bitmask;
 }
 /* --------- Bitmask Related Functions --------- */

From 6c54da223c91d97b55ef9bdb1276292dcebdcc4f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 21:02:49 -0400
Subject: [PATCH 149/667] 1. Made update_llm_decode_results return bool. 2.
 Update some comments.

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8f5a4684c..4f2ed6c7a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -338,7 +338,7 @@ class RequestManager {
   double total_request_run_time;
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
-  void update_llm_decode_results(InferenceResult const &result);
+  bool update_llm_decode_results(InferenceResult const &result);
   BatchConfig prepare_prefilling_batch();
   BatchConfig prepare_decoding_batch();
   /* ---------- Incremental Decoding Helper Functions ---------- */
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f4570a6e2..d060938f3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -392,7 +392,15 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       // else, continue the unfinished prefilling
       break;
     case DECODING:
-      update_llm_decode_results(result);
+      if (update_llm_decode_results(result)) {
+        // A request completed after the decode
+        if (pending_request_queue.empty()) {
+          // No pending request to process, continue the speculation
+          request_manager_status = DECODING;
+        } else {
+          request_manager_status = PREFILLING;
+        }
+      }
       break;
     case LLM_VERIFY:
       if (update_llm_verify_results(result)) {
@@ -453,9 +461,20 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 }
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
+  // TODO:
+  // 1. Iterate over all requests, find the prefilling request:
+  // request.num_tokens_in_batch != 0
+  // 2. Check if the prefilling is finished
+  // 3. If the prefilling is finished, return true
 }
 
-void RequestManager::update_llm_decode_results(InferenceResult const &result) {}
+bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
+  // TODO:
+  // 1. Iterate over all requests
+  // request.num_tokens_in_batch != 0
+  // 2. Check if the prefilling is finished
+  // 3. If at least one request is completed, return true
+}
 
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);

From 472ed82f2ee4c987152c49f42529f3b0ab1b8170 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 23:58:37 -0400
Subject: [PATCH 150/667] Modified the constructor of BatchConfig to use std
 functions.

---
 src/runtime/batch_config.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 1b5215975..1acf6ae95 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -27,9 +27,9 @@ using Legion::Memory;
 
 BatchConfig::BatchConfig()
     : num_tokens(0), num_available_requests(0), prompt_phase(false) {
-  memset(request_available, false, sizeof(bool) * MAX_NUM_REQUESTS);
-  // Don't need to initialize requestInfo ,tokensInfo, and causalMask here
-  // because they initialize themselves.
+  std::fill(std::begin(request_available), std::end(request_available), 0);
+  // Don't need to initialize requestInfo ,tokensInfo, and causalMask
+  // here because they initialize themselves.
 }
 
 /*static*/

From 866a8cde4d83a12a8e7350fedd9ad31d861a32d9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 30 Apr 2024 23:59:57 -0400
Subject: [PATCH 151/667] 1. Removed max_sequence_length and initial_len from
 Request. 2. Removed unused prepare_next_batch_task function. 3. Added a field
 in RequestManager to keep track of the prefilling request. 4. Added a
 function in RequestManager to load pending request from pending_request_queue
 to batch. 5. Rewrote prepare_prefilling_batch due to the changes.

---
 include/flexflow/request_manager.h | 10 ++--
 src/runtime/request_manager.cc     | 86 ++++++++++++++++--------------
 2 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4f2ed6c7a..e4bb741ac 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -65,8 +65,7 @@ struct Request {
     FINISHING = 104, // finishing request, but not yet verified
   };
   BatchConfig::RequestGuid guid;
-  int max_sequence_length;
-  int initial_len;
+  int batch_index = -1;
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
 
@@ -250,11 +249,6 @@ class RequestManager {
                              std::vector<Legion::PhysicalRegion> const &regions,
                              Legion::Context ctx,
                              Legion::Runtime *runtime);
-  static BatchConfig prepare_next_batch_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
   // API for rm state machine
   BatchConfigFuture get_next_batch_config(InferenceResultFuture const &result,
                                           Legion::Context ctx,
@@ -294,6 +288,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
+  std::shared_ptr<Request> prefill_request = nullptr;
 
   // Added to make the request manager stateful. During the processing of the
   // first small model inference results, the step equals to 1. That is, every
@@ -336,6 +331,7 @@ class RequestManager {
   };
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
+  void load_pending_reqeust_to_batch();
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d060938f3..893d12f6e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -185,7 +185,6 @@ RequestManager::RequestGuid
   Request request;
   request.status = Request::PENDING;
   request.guid = next_available_guid++;
-  request.max_sequence_length = max_sequence_length;
 
   if (prompt.size() >= get_max_sequence_length()) {
     std::cout << "Warning: too many tokens in prompt, only load up to "
@@ -195,7 +194,6 @@ RequestManager::RequestGuid
     printf("tokens size: %zu\n", request.tokens.size());
     return INVALID_GUID;
   } else {
-    request.initial_len = prompt.size();
     request.tokens = prompt;
   }
 
@@ -242,7 +240,6 @@ RequestManager::RequestGuid
   Request request;
   request.status = Request::PENDING;
   request.guid = next_available_guid++;
-  request.max_sequence_length = max_sequence_length;
   if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
     request.tokens.push_back(bos_token_id);
   }
@@ -259,7 +256,6 @@ RequestManager::RequestGuid
     std::cout << "[" << i << "]" << tokens.at(i) << "\n";
   }
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
-  request.initial_len = request.tokens.size();
 
   if (get_num_ssms() == 0) {
     std::cout << "No small speculative model registered, using incremental "
@@ -373,6 +369,23 @@ BatchConfig
   update_inference_results(result);
   return prepare_next_batch();
 }
+void RequestManager::load_pending_reqeust_to_batch() {
+  assert(!pending_request_queue.empty() && "No pending request to process.");
+  Request &new_request = pending_request_queue.front();
+  all_requests[new_request.guid] = new_request;
+  BatchConfig::RequestGuid guid = new_request.guid;
+  pending_request_queue.pop();
+  prefill_request = std::make_shared<Request>(all_requests[guid]);
+
+  // Find an empty slot
+  int request_index = get_empty_request_index();
+  assert(request_index != -1 && "No empty request slot to load the request.");
+  prefill_request->batch_index = request_index;
+  guid_of_requests[request_index] = guid;
+  request_available[request_index] = true;
+  num_available_requests++;
+  request_available[request_index] = true;
+}
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
@@ -399,6 +412,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = DECODING;
         } else {
           request_manager_status = PREFILLING;
+          load_pending_reqeust_to_batch();
         }
       }
       break;
@@ -410,6 +424,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = SSM_SPEC;
         } else {
           request_manager_status = PREFILLING;
+          load_pending_reqeust_to_batch();
         }
       }
       break;
@@ -511,56 +526,49 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   // 2. Move the following part to the update_inference_results() function
   // 3. Change the BatchConfig.prompt_phase
 
-  // The following part should be moved to the update_inference_results()
-  // function
-  if (pending_request_queue.empty()) {
-    if (get_num_active_requests() == 0) {
-      return BatchConfig();
-    } else {
-      return prepare_decoding_batch();
-    }
-  }
-
   BatchConfig bc;
-  bc.num_tokens = BatchConfig::MAX_NUM_TOKENS;
+  bc.prompt_phase = true;
 
-  int request_index = get_empty_request_index();
-  assert(request_index != -1);
+  assert(prefill_request != nullptr &&
+         "No prefilling request to process in the prefilling phase.");
+  int request_index = prefill_request->batch_index;
 
-  // The following should be moved to update_inference_results()
-  Request new_request = pending_request_queue.front();
-  pending_request_queue.pop();
-  all_requests[new_request.guid] = new_request;
-  guid_of_requests[request_index] = new_request.guid;
-  request_available[request_index] = true;
-
-  // Per Request Info
-  // TODO: what if the prompt phase needs multiple runs to finish?
-  bc.requestsInfo[request_index].first_token_index_in_request = 0;
+  // Request Info
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->llm_cache_size;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].num_tokens_in_batch =
-      std::min(bc.num_tokens, (int)new_request.tokens.size());
+  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+      BatchConfig::MAX_NUM_TOKENS,
+      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
 
   bc.request_available[request_index] = true;
 
-  new_request.first_token_offset_in_batch = 0;
-  new_request.num_tokens_in_batch = 0;
+  prefill_request->first_token_offset_in_batch = 0;
+  prefill_request->num_tokens_in_batch =
+      bc.requestsInfo[request_index].num_tokens_in_batch;
 
-  // Per Token Info
+  // Token Info
   for (int token_idx = 0;
        token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
        token_idx++) {
-    int depth =
-        bc.requestsInfo[request_index].first_token_index_in_request + token_idx;
-    assert(depth < new_request.tokens.size());
+    int abs_idx = prefill_request->llm_cache_size + token_idx;
+    assert(abs_idx < prefill_request->tokens.size());
     bc.tokensInfo[token_idx].request_index = request_index;
-    bc.tokensInfo[token_idx].abs_index_in_request = depth;
-    bc.tokensInfo[token_idx].token_id = new_request.tokens[depth];
+    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
 
-    new_request.llm_cache_size++;
-    new_request.num_tokens_in_batch++;
+    bc.num_tokens++;
+    prefill_request->llm_cache_size++;
+    prefill_request->num_tokens_in_batch++;
   }
 
+  // Other metadata
+  bc.num_available_requests = num_available_requests;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.prompt_phase = true;
+
   return bc;
 }
 

From 026017fda5007b7f3db0f68d7823cb0df6b32e9c Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 00:08:02 -0400
Subject: [PATCH 152/667] Changed some comments on update_llm_prefill_results.

---
 src/runtime/request_manager.cc | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 893d12f6e..4eecd3fbf 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -477,10 +477,13 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   // TODO:
-  // 1. Iterate over all requests, find the prefilling request:
-  // request.num_tokens_in_batch != 0
-  // 2. Check if the prefilling is finished
-  // 3. If the prefilling is finished, return true
+  // The pending request can be found at Request_manager.prefill_request
+  // 1. Check if the prefilling is finished (request.tokens.size() ==
+  // request.llm_cache_size)
+  // 2. If the prefilling is finished, push the last token in result to
+  // request.tokens
+  // 3. Otherwise, no need to push
+  // 4. Return true if the prefilling is finished
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {

From 5c3532718a0fc897c370aaf2905c59e2eb4a6e9e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 14:25:25 -0400
Subject: [PATCH 153/667] Added field prefill_model to distinguish ssm prefill
 and llm prefill.

---
 include/flexflow/request_manager.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e4bb741ac..4053d0167 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -184,6 +184,10 @@ class RequestManager {
     INCREMENTAL_DECODING = 3001,
     SPECULATIVE_DECODING = 3002,
   };
+  enum PrefillModel {
+    LLM = 4001,
+    SSM = 4002,
+  };
 
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
@@ -274,6 +278,7 @@ class RequestManager {
   State request_manager_status;
   BackgroundServerStatus background_server_status;
   DecodingMode decoding_mode;
+  PrefillModel prefill_model;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;

From a231773b92e051b8635dea4e0ebb12cea39a75ad Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 14:29:22 -0400
Subject: [PATCH 154/667] Added prefill to current speculative inference logic.

---
 src/runtime/request_manager.cc | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4eecd3fbf..c15c034c9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1439,16 +1439,32 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     runtime->begin_trace(ctx, 12346 /*trace_id*/);
     InferenceResultFuture next_ir = batch_pipeline.back();
     BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
-    if (request_manager_status == LLM_VERIFY) {
+    if (request_manager_status == PREFILLING) {
+      if (prefill_model == LLM) {
+        FutureMap fm = im->inference(llm, 0, bcf);
+        assert(fm.get_future_map_domain().get_volume() == 1);
+        InferenceResultFuture irf = fm.get_future(0);
+        batch_pipeline.push(irf);
+      } else if (prefill_model == SSM) {
+        FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
+        assert(fm.get_future_map_domain().get_volume() == 1);
+        InferenceResultFuture irf = fm.get_future(0);
+        batch_pipeline.push(irf);
+      } else {
+        assert(false && "Invalid prefill model");
+      }
+    } else if (request_manager_status == LLM_VERIFY) {
       FutureMap fm = im->inference(llm, 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
       InferenceResultFuture irf = fm.get_future(0);
       batch_pipeline.push(irf);
-    } else {
+    } else if (request_manager_status == SSM_SPEC) {
       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
       InferenceResultFuture irf = fm.get_future(0);
       batch_pipeline.push(irf);
+    } else {
+      assert(false && "Invalid request manager status");
     }
     runtime->end_trace(ctx, 12345 /*trace_id*/);
   }

From 8a614a76a2dd1ac846f46897d6772ca4165f3b05 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 14:54:44 -0400
Subject: [PATCH 155/667] 1. Changed the return type of
 update_ssm_prefill_results to bool. 2. Updated update_inference_results due
 to the recent added prefill_model. 3. Re-implemented
 prepare_prefilling_batch.

---
 include/flexflow/request_manager.h |   2 +-
 src/runtime/request_manager.cc     | 172 +++++++++++++++++------------
 2 files changed, 100 insertions(+), 74 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4053d0167..665da7a05 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -348,7 +348,7 @@ class RequestManager {
   bool update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
-  void update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
+  bool update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
   TreeSearchBatchConfig prepare_next_spec_batch_config();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c15c034c9..94d9bfc90 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -392,17 +392,29 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
   switch (request_manager_status) {
     case PREFILLING:
-      if (update_llm_prefill_results(result)) {
-        // This indicates that the prefilling phase finishes
-        if (decoding_mode == INCREMENTAL_DECODING) {
+      if (decoding_mode == INCREMENTAL_DECODING) {
+        if (update_llm_prefill_results(result)) {
+          // This indicates that the prefilling phase finishes
           request_manager_status = DECODING;
-        } else if (decoding_mode == SPECULATIVE_DECODING) {
-          request_manager_status = SSM_SPEC;
+        }
+      } else if (decoding_mode == SPECULATIVE_DECODING) {
+        if (prefill_model == SSM) {
+          if (update_ssm_prefill_results(result)) {
+            // This indicates that the prefilling phase for SSM finishes
+            // We need to start the LLM prefilling
+            prefill_model = LLM;
+          }
+        } else if (prefill_model == LLM) {
+          if (update_llm_prefill_results(result)) {
+            // This indicates that the prefilling phase finishes
+            request_manager_status = SSM_SPEC;
+          }
         } else {
           assert(false && "Invalid inference mode.");
         }
+      } else {
+        assert(false && "Invalid inference mode.");
       }
-      // else, continue the unfinished prefilling
       break;
     case DECODING:
       if (update_llm_decode_results(result)) {
@@ -425,6 +437,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         } else {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
+          prefill_model = SSM;
         }
       }
       break;
@@ -440,40 +453,43 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   }
 }
 
-void RequestManager::update_inference_results(InferenceResult const &result) {
-  // Update the inference results
-  std::lock_guard<std::mutex> const lock(rm_state_mutex);
-  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-    if (guid_of_requests[i] == INVALID_GUID) {
-      continue;
-    }
-    Request &request = all_requests[guid_of_requests[i]];
-
-    switch (request_manager_status) {
-      case PREFILLING:
-        if (request.initial_len ==
-            request.llm_cache_size) { // all prompt tokens are prefilled
-          request.tokens.push_back(
-              result.token_ids[request.num_tokens_in_batch]);
-          request_manager_status = DECODING;
-        }
-        break;
-      case DECODING:
-        request.tokens.push_back(
-            result.token_ids[request.first_token_offset_in_batch]);
-        if (request.tokens.size() ==
-            request.max_sequence_length) { // request is completed
-          request.status = Request::COMPLETED;
-          trigger_request_completion_future(request.guid);
-          guid_of_requests[i] = INVALID_GUID;
-          request_manager_status = PREFILLING;
-        }
-        break;
-      default:
-        assert(false);
-    }
-  }
-}
+// TO BE REMOVED: START
+// void RequestManager::update_inference_results(InferenceResult const &result)
+// {
+//   // Update the inference results
+//   std::lock_guard<std::mutex> const lock(rm_state_mutex);
+//   for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+//     if (guid_of_requests[i] == INVALID_GUID) {
+//       continue;
+//     }
+//     Request &request = all_requests[guid_of_requests[i]];
+
+//     switch (request_manager_status) {
+//       case PREFILLING:
+//         if (request.initial_len ==
+//             request.llm_cache_size) { // all prompt tokens are prefilled
+//           request.tokens.push_back(
+//               result.token_ids[request.num_tokens_in_batch]);
+//           request_manager_status = DECODING;
+//         }
+//         break;
+//       case DECODING:
+//         request.tokens.push_back(
+//             result.token_ids[request.first_token_offset_in_batch]);
+//         if (request.tokens.size() ==
+//             request.max_sequence_length) { // request is completed
+//           request.status = Request::COMPLETED;
+//           trigger_request_completion_future(request.guid);
+//           guid_of_requests[i] = INVALID_GUID;
+//           request_manager_status = PREFILLING;
+//         }
+//         break;
+//       default:
+//         assert(false);
+//     }
+//   }
+// }
+// TO BE REMOVED: END
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   // TODO:
@@ -519,15 +535,9 @@ BatchConfig RequestManager::prepare_next_batch() {
 
 BatchConfig RequestManager::prepare_prefilling_batch() {
   // This function is called when the request_manager_status is PREFILLING,
-  // which means that there is at least one empty slot in the currnet decoding
-  // batch, and there is at least one pending request in the request queue.
-  // This function takes the pending request, and load its prefilling tokens,
-  // constructing a BatchConfig with only one request.
-
-  // TODO:
-  // 1. Adept this function to the new design
-  // 2. Move the following part to the update_inference_results() function
-  // 3. Change the BatchConfig.prompt_phase
+  // which means that there is a request in the prefilling phase.
+  // This function load its prefilling tokens, constructing a BatchConfig with
+  // only one request.
 
   BatchConfig bc;
   bc.prompt_phase = true;
@@ -536,15 +546,29 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
          "No prefilling request to process in the prefilling phase.");
   int request_index = prefill_request->batch_index;
 
-  // Request Info
-  bc.requestsInfo[request_index].first_token_index_in_request =
-      prefill_request->llm_cache_size;
-  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      BatchConfig::MAX_NUM_TOKENS,
-      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
-
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
   bc.request_available[request_index] = true;
+  bc.num_available_requests = num_available_requests;
+
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  if (prefill_model == SSM) {
+    // Request Info
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        prefill_request->ssm_cache_size;
+    bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+        BatchConfig::MAX_NUM_TOKENS,
+        (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
+
+  } else if (prefill_model == LLM) {
+    // Request Info
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        prefill_request->llm_cache_size;
+    bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+        BatchConfig::MAX_NUM_TOKENS,
+        (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
+  }
 
   prefill_request->first_token_offset_in_batch = 0;
   prefill_request->num_tokens_in_batch =
@@ -554,24 +578,26 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   for (int token_idx = 0;
        token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
        token_idx++) {
-    int abs_idx = prefill_request->llm_cache_size + token_idx;
+    int abs_idx = -1;
+    if (prefill_model == SSM) {
+      abs_idx = prefill_request->ssm_cache_size + token_idx;
+    } else if (prefill_model == LLM) {
+      abs_idx = prefill_request->llm_cache_size + token_idx;
+    } else {
+      assert(false && "Invalid prefill model.");
+    }
     assert(abs_idx < prefill_request->tokens.size());
+
     bc.tokensInfo[token_idx].request_index = request_index;
     bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
     bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
 
     bc.num_tokens++;
-    prefill_request->llm_cache_size++;
     prefill_request->num_tokens_in_batch++;
+    // TODO: move the following line to update_inference_results
+    // prefill_request->llm_cache_size++;
   }
 
-  // Other metadata
-  bc.num_available_requests = num_available_requests;
-  std::copy(std::begin(request_available),
-            std::end(request_available),
-            std::begin(bc.request_available));
-  bc.prompt_phase = true;
-
   return bc;
 }
 
@@ -875,9 +901,9 @@ bool RequestManager::update_llm_verify_results(
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
   // token, this is implemented in get_verify_results_greedy(). This function
-  // stores the commmitted tokens into the corresponding fields in the Request.
-  // For the sampling construction of the speculative token tree, we need to
-  // implement a CPU based verify function.
+  // stores the commmitted tokens into the corresponding fields in the
+  // Request. For the sampling construction of the speculative token tree, we
+  // need to implement a CPU based verify function.
   // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
   // root token to the requests' speculative token tree. The root token is the
   // last committed token.
@@ -1521,10 +1547,10 @@ void RequestManager::add_root_to_spec_token_tree(
   // This method is called by update_llm_verify_results()
   // The last token in the accepted sequence should be the root of the next
   // speculation tree. The reason is that the KV cache of this token is not
-  // computed yet, and we need the large model to decode the logit of this token
-  // to verify its childs (the tokens in the first layer).
-  // This method should: construct and add the root token to the empty
-  // speculative token tree, with parent_pos being -1 and joint_prob being 1.0
+  // computed yet, and we need the large model to decode the logit of this
+  // token to verify its childs (the tokens in the first layer). This method
+  // should: construct and add the root token to the empty speculative token
+  // tree, with parent_pos being -1 and joint_prob being 1.0
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();

From d092bdc78e15ffd366fa2ad2c866e4460acf2cbc Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 15:03:22 -0400
Subject: [PATCH 156/667] Implemented update_ssm_prefill_results. Small fix on
 update_inference_results.

---
 src/runtime/request_manager.cc | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 94d9bfc90..3a87a91eb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -408,9 +408,11 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           if (update_llm_prefill_results(result)) {
             // This indicates that the prefilling phase finishes
             request_manager_status = SSM_SPEC;
+            // Reset the prefill_request
+            prefill_request = nullptr;
           }
         } else {
-          assert(false && "Invalid inference mode.");
+          assert(false && "Invalid prefill model.");
         }
       } else {
         assert(false && "Invalid inference mode.");
@@ -450,6 +452,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       }
       // else, keep the current status
       break;
+    default:
+      assert(false && "Invalid request manager status.");
   }
 }
 
@@ -510,6 +514,19 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   // 3. If at least one request is completed, return true
 }
 
+bool RequestManager::update_ssm_prefill_results(
+    InferenceResult const &ssm_prefill_result) {
+  // This function is called by update_inference_results when the
+  // request_manager_status is PREFILLING and the prefill_model is SSM.
+  // There's no results to update, but we should update some SSM related states
+  // related to SSM.
+  prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
+  if (prefill_request->ssm_cache_size == prefill_request->tokens.size()) {
+    return true;
+  }
+  return false;
+}
+
 BatchConfig RequestManager::prepare_next_batch() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 

From 9edcdeab66c4ab9e09223eeca8cfd4d5bb173c2e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 15:04:39 -0400
Subject: [PATCH 157/667] Added some comments in update_llm_prefill_results.

---
 src/runtime/request_manager.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3a87a91eb..0a2fc9d10 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -498,12 +498,13 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   // TODO:
   // The pending request can be found at Request_manager.prefill_request
-  // 1. Check if the prefilling is finished (request.tokens.size() ==
+  // 1. Update request.llm_cache_size
+  // 2. Check if the prefilling is finished (request.tokens.size() ==
   // request.llm_cache_size)
-  // 2. If the prefilling is finished, push the last token in result to
+  // 3. If the prefilling is finished, push the last token in result to
   // request.tokens
-  // 3. Otherwise, no need to push
-  // 4. Return true if the prefilling is finished
+  // 4. Otherwise, no need to push
+  // 5. Return true if the prefilling is finished
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {

From 323528c7b2da07abcf6a0f3f4c8a8fd13ed8a598 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 15:06:45 -0400
Subject: [PATCH 158/667] Updated comments on update_llm_decode_results.

---
 src/runtime/request_manager.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0a2fc9d10..8fded0056 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -509,8 +509,9 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   // TODO:
-  // 1. Iterate over all requests
-  // request.num_tokens_in_batch != 0
+  // 1. Iterate over all requests, update the llm_cache_size and push token to
+  // request.tokens (find the token index in result by
+  // first_token_offset_in_batch)
   // 2. Check if the prefilling is finished
   // 3. If at least one request is completed, return true
 }

From 9f26bfbee2c0ca8c9576fc98be324f60a4a1fe2d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 15:30:10 -0400
Subject: [PATCH 159/667] Removed everything related to beam topk.

---
 include/flexflow/ffconst.h              |   2 +-
 include/flexflow/model.h                |   8 +-
 include/flexflow/operator_params.h      |   4 +-
 include/flexflow/ops/beam_topk.h        | 112 ----
 include/flexflow/ops/beam_topk_params.h |  28 -
 src/ops/beam_topk.cc                    | 476 ---------------
 src/ops/beam_topk.cpp                   | 723 ----------------------
 src/ops/beam_topk.cu                    | 765 ------------------------
 src/runtime/ffconst_utils.cc            |   4 +-
 src/runtime/graph.cc                    |  40 +-
 src/runtime/model.cc                    |  78 +--
 src/runtime/operator_params.cc          |   6 +-
 12 files changed, 72 insertions(+), 2174 deletions(-)
 delete mode 100644 include/flexflow/ops/beam_topk.h
 delete mode 100644 include/flexflow/ops/beam_topk_params.h
 delete mode 100644 src/ops/beam_topk.cc
 delete mode 100644 src/ops/beam_topk.cpp
 delete mode 100644 src/ops/beam_topk.cu

diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 296f80311..b24adc080 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -166,7 +166,7 @@ enum OperatorType {
   OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html
   OP_RMS_NORM,
   OP_RESIDUAL_RMS_NORM,
-  OP_BEAM_TOPK,
+  //   OP_BEAM_TOPK,
   OP_ARGMAX,
   OP_INC_MULTIHEAD_SELF_ATTENTION,
   OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 7edb8579e..0f1d08ced 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -164,8 +164,8 @@ enum TaskIDs {
   RMSNORM_INF_TASK_ID,
   RESIDUAL_RMSNORM_INIT_TASK_ID,
   RESIDUAL_RMSNORM_INF_TASK_ID,
-  BEAM_TOPK_INIT_TASK_ID,
-  BEAM_TOPK_INF_TASK_ID,
+  //   BEAM_TOPK_INIT_TASK_ID,
+  //   BEAM_TOPK_INF_TASK_ID,
   INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
   INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
   INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
@@ -1204,8 +1204,8 @@ class FFModel {
       std::unordered_map<
           std::pair<ParallelTensorShape, IncMultiHeadSelfAttentionParams>,
           IncMultiHeadSelfAttention *>,
-      std::unordered_map<std::pair<ParallelTensorShape, BeamTopKParams>,
-                         BeamTopK *>,
+      //   std::unordered_map<std::pair<ParallelTensorShape, BeamTopKParams>,
+      //                      BeamTopK *>,
       std::unordered_map<std::pair<ParallelTensorShape, SamplingParams>,
                          Sampling *>,
       std::unordered_map<std::pair<ParallelTensorShape, ArgMaxParams>,
diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h
index 5b187839e..e87408438 100644
--- a/include/flexflow/operator_params.h
+++ b/include/flexflow/operator_params.h
@@ -8,7 +8,7 @@
 #include "flexflow/ops/argmax_params.h"
 #include "flexflow/ops/attention_params.h"
 #include "flexflow/ops/batch_matmul_params.h"
-#include "flexflow/ops/beam_topk_params.h"
+// #include "flexflow/ops/beam_topk_params.h"
 #include "flexflow/ops/cast_params.h"
 #include "flexflow/ops/concat_params.h"
 #include "flexflow/ops/conv_2d_params.h"
@@ -69,7 +69,7 @@ using OperatorParameters = mp::variant<AggregateParams,
                                        LinearParams,
                                        MultiHeadAttentionParams,
                                        IncMultiHeadSelfAttentionParams,
-                                       BeamTopKParams,
+                                       //    BeamTopKParams,
                                        SpecIncMultiHeadSelfAttentionParams,
                                        TreeIncMultiHeadSelfAttentionParams,
                                        RMSNormParams,
diff --git a/include/flexflow/ops/beam_topk.h b/include/flexflow/ops/beam_topk.h
deleted file mode 100644
index 32b103b5c..000000000
--- a/include/flexflow/ops/beam_topk.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef _FLEXFLOW_BEAM_TOPK_H_
-#define _FLEXFLOW_BEAM_TOPK_H_
-
-#include "flexflow/inference.h"
-#include "flexflow/model.h"
-#include "flexflow/node.h"
-#include "flexflow/ops/beam_topk_params.h"
-#include "flexflow/utils/memory_allocator.h"
-
-namespace FlexFlow {
-
-class BeamTopKMeta : public OpMeta {
-public:
-  BeamTopKMeta(FFHandler handle,
-               Op const *op,
-               MemoryAllocator &gpu_mem_allocator);
-  ~BeamTopKMeta(void);
-  bool sorted;
-  int max_beam_width;
-  int *parent_ids;
-  void *acc_probs;
-  int *block_start_index;
-  int *request_id;
-  int *tokens_per_request;
-  Realm::RegionInstance reserveInst;
-};
-
-class BeamTopK : public Op {
-public:
-  using Params = BeamTopKParams;
-  using Input = ParallelTensor;
-  BeamTopK(FFModel &model,
-           ParallelTensor const input,
-           LayerID const &_layer_guid,
-           int max_beam_width,
-           bool sorted,
-           char const *name);
-  BeamTopK(FFModel &model, BeamTopK const &other, ParallelTensor const input);
-  BeamTopK(FFModel &model,
-           Params const &params,
-           Input const input,
-           char const *name = nullptr);
-  void init(FFModel const &) override;
-  void init_inference(FFModel const &,
-                      std::vector<ParallelTensor> const &,
-                      std::vector<ParallelTensor> const &,
-                      MachineView const *mv = nullptr) override;
-  void forward(FFModel const &) override;
-  void backward(FFModel const &) override;
-  Legion::FutureMap inference(FFModel const &,
-                              BatchConfigFuture const &,
-                              std::vector<ParallelTensor> const &,
-                              std::vector<ParallelTensor> const &,
-                              MachineView const *mv = nullptr) override;
-  void print_layer(FFModel const &model) override {
-    assert(0);
-  }
-  static Op *
-      create_operator_from_layer(FFModel &model,
-                                 Layer const *layer,
-                                 std::vector<ParallelTensor> const &inputs);
-
-  static OpMeta *init_task(Legion::Task const *task,
-                           std::vector<Legion::PhysicalRegion> const &regions,
-                           Legion::Context ctx,
-                           Legion::Runtime *runtime);
-  static SsmInferenceResult
-      inference_task(Legion::Task const *task,
-                     std::vector<Legion::PhysicalRegion> const &regions,
-                     Legion::Context ctx,
-                     Legion::Runtime *runtime);
-  void serialize(Legion::Serializer &s) const override;
-  static PCG::Node deserialize(FFModel &ff,
-                               Legion::Deserializer &d,
-                               ParallelTensor inputs[],
-                               int num_inputs);
-  Op *materialize(FFModel &ff,
-                  ParallelTensor inputs[],
-                  int num_inputs) const override;
-  bool measure_operator_cost(Simulator *sim,
-                             MachineView const &pc,
-                             CostMetrics &cost_metrics) const override;
-  template <typename DT>
-  static void forward_kernel(BeamTopKMeta const *m,
-                             TreeSearchBatchConfig const *bc,
-                             DT const *input_ptr,
-                             float *output_ptr,
-                             int *indices_ptr,
-                             int *parent_ptr,
-                             int batch_size,
-                             int length,
-                             bool sorted,
-                             ffStream_t stream);
-  static void forward_kernel_wrapper(BeamTopKMeta const *m,
-                                     TreeSearchBatchConfig const *bc,
-                                     GenericTensorAccessorR const &input,
-                                     float *output_ptr,
-                                     int *indices_ptr,
-                                     int *parent_ptr,
-                                     int batch_size,
-                                     int length,
-                                     bool sorted);
-  Params get_params() const;
-
-public:
-  bool sorted;
-  int max_beam_width;
-};
-
-}; // namespace FlexFlow
-
-#endif
diff --git a/include/flexflow/ops/beam_topk_params.h b/include/flexflow/ops/beam_topk_params.h
deleted file mode 100644
index 3e09848c9..000000000
--- a/include/flexflow/ops/beam_topk_params.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _FLEXFLOW_BEAM_TOPK_PARAMS_H
-#define _FLEXFLOW_BEAM_TOPK_PARAMS_H
-
-#include "flexflow/ffconst.h"
-#include "flexflow/fftype.h"
-#include "flexflow/parallel_tensor.h"
-
-namespace FlexFlow {
-
-struct BeamTopKParams {
-  LayerID layer_guid;
-  bool sorted;
-  int max_beam_width;
-  char name[MAX_OPNAME];
-  bool is_valid(ParallelTensorShape const &) const;
-};
-bool operator==(BeamTopKParams const &, BeamTopKParams const &);
-
-} // namespace FlexFlow
-
-namespace std {
-template <>
-struct hash<FlexFlow::BeamTopKParams> {
-  size_t operator()(FlexFlow::BeamTopKParams const &) const;
-};
-} // namespace std
-
-#endif // _FLEXFLOW_BEAM_TOPK_PARAMS_H
diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc
deleted file mode 100644
index 72fc5e96b..000000000
--- a/src/ops/beam_topk.cc
+++ /dev/null
@@ -1,476 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/ops/beam_topk.h"
-#include "flexflow/model.h"
-#include "flexflow/utils/hash_utils.h"
-#include "legion/legion_utilities.h"
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-#include "flexflow/utils/cuda_helper.h"
-#else
-#include "flexflow/utils/hip_helper.h"
-#endif
-
-namespace FlexFlow {
-// declare Legion names
-using Legion::ArgumentMap;
-using Legion::Context;
-using Legion::coord_t;
-using Legion::Domain;
-using Legion::Future;
-using Legion::FutureMap;
-using Legion::IndexLauncher;
-using Legion::InlineLauncher;
-using Legion::Machine;
-using Legion::Memory;
-using Legion::PhysicalRegion;
-using Legion::Predicate;
-using Legion::Rect;
-using Legion::RegionRequirement;
-using Legion::Runtime;
-using Legion::Task;
-using Legion::TaskArgument;
-using Legion::TaskLauncher;
-using PCG::Node;
-
-// For an input tensor, computes the top k entries in each row
-// (resp. vector along the last dimension). Thus,
-// values.shape = indices.shape = input.shape[:-1] + [k]
-Tensor FFModel::beam_top_k(Tensor const input,
-                           int max_beam_width,
-                           bool sorted,
-                           char const *name) {
-  Layer *li = new Layer(this,
-                        OP_BEAM_TOPK,
-                        input->data_type,
-                        name,
-                        1 /*inputs*/,
-                        0 /*weights*/,
-                        3 /*outputs*/,
-                        input);
-  {
-    int numdims = input->num_dims;
-
-    int dims[MAX_TENSOR_DIM];
-    for (int i = 0; i < numdims; i++) {
-      dims[i] = input->dims[i];
-    }
-    dims[0] = max_beam_width;
-
-    std::cout << "beam input dimen:" << numdims << "\n";
-    for (int i = 0; i < numdims; i++) {
-      std::cout << input->dims[i] << ", ";
-    }
-
-    // beam width is dynamic
-    li->outputs[0] = create_tensor_legion_ordering(
-        numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
-    li->outputs[1] = create_tensor_legion_ordering(
-        numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
-    li->outputs[2] = create_tensor_legion_ordering(
-        numdims, dims, DT_INT32, li, 1, false /*create_grad*/);
-  }
-  li->add_int_property("sorted", sorted);
-  li->add_int_property("max_beam_width", max_beam_width);
-  layers.push_back(li);
-  // outputs[0] = li->outputs[0];
-  // outputs[1] = li->outputs[1];
-  return li->outputs[1];
-}
-
-Op *BeamTopK::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector<ParallelTensor> const &inputs) {
-  long long value;
-  layer->get_int_property("sorted", value);
-  bool sorted = (bool)value;
-  layer->get_int_property("max_beam_width", value);
-  int max_beam_width = value;
-  return new BeamTopK(
-      model, inputs[0], layer->layer_guid, max_beam_width, sorted, layer->name);
-}
-
-BeamTopKParams BeamTopK::get_params() const {
-  BeamTopKParams params;
-  params.layer_guid = this->layer_guid;
-  params.sorted = this->sorted;
-  params.max_beam_width = this->max_beam_width;
-  return params;
-}
-
-bool BeamTopKParams::is_valid(ParallelTensorShape const &) const {
-  // topk is always valid
-  return true;
-}
-
-bool operator==(BeamTopKParams const &lhs, BeamTopKParams const &rhs) {
-  return lhs.layer_guid == rhs.layer_guid && lhs.sorted == rhs.sorted &&
-         lhs.max_beam_width == rhs.max_beam_width;
-}
-
-BeamTopK::BeamTopK(FFModel &model,
-                   ParallelTensor const _input,
-                   LayerID const &_layer_guid,
-                   int _max_beam_width,
-                   bool _sorted,
-                   char const *name)
-    : Op(model,
-         OP_BEAM_TOPK,
-         _input->data_type,
-         name,
-         1 /*inputs*/,
-         0 /*weights*/,
-         3 /*outputs*/,
-         _input) {
-  sorted = _sorted;
-  max_beam_width = _max_beam_width;
-  layer_guid = _layer_guid;
-  int numdim = inputs[0]->num_dims;
-  assert(inputs[0]->dims[0].degree == 1);
-  assert(inputs[0]->dims[0].parallel_idx == -1);
-  //   outputs[0] = model.create_parallel_tensor_legion_ordering(
-  //       numdim, dims, _input->data_type, this, 0 /*owner_idx*/);
-  outputs[0] = model.create_parallel_tensor_legion_ordering(
-      numdim, inputs[0]->dims, DT_INT32, this, 0 /*owner_idx*/);
-  outputs[1] = model.create_parallel_tensor_legion_ordering(
-      numdim, inputs[0]->dims, DT_FLOAT, this, 1 /*owner_idx*/);
-  outputs[2] = model.create_parallel_tensor_legion_ordering(
-      numdim, inputs[0]->dims, DT_INT32, this, 2 /*owner_idx*/);
-}
-
-BeamTopK::BeamTopK(FFModel &model,
-                   BeamTopK const &other,
-                   ParallelTensor const input)
-    : BeamTopK(model,
-               input,
-               other.layer_guid,
-               other.max_beam_width,
-               other.sorted,
-               other.name) {}
-
-BeamTopK::BeamTopK(FFModel &model,
-                   BeamTopKParams const &params,
-                   ParallelTensor const input,
-                   char const *name)
-    : BeamTopK(model,
-               input,
-               params.layer_guid,
-               params.max_beam_width,
-               params.sorted,
-               params.name) {}
-
-void BeamTopK::init_inference(FFModel const &ff,
-                              std::vector<ParallelTensor> const &batch_inputs,
-                              std::vector<ParallelTensor> const &batch_outputs,
-                              MachineView const *mv) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = batch_outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
-  size_t machine_view_hash = view->hash();
-  set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]);
-  IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(BeamTopK)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         machine_view_hash);
-  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[1]->region));
-  launcher.add_field(2, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[2]->region));
-  launcher.add_field(3, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
-}
-
-void BeamTopK::init(FFModel const &ff) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(BEAM_TOPK_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(BeamTopK)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[1]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[1]->region));
-  launcher.add_field(2, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[2]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[2]->region));
-  launcher.add_field(3, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap(ff, fm);
-}
-
-OpMeta *BeamTopK::init_task(Task const *task,
-                            std::vector<PhysicalRegion> const &regions,
-                            Context ctx,
-                            Runtime *runtime) {
-  BeamTopK *topk = (BeamTopK *)task->args;
-  FFHandler handle = *((FFHandler *)task->local_args);
-  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
-                       .only_kind(Memory::GPU_FB_MEM)
-                       .best_affinity_to(task->target_proc)
-                       .first();
-  MemoryAllocator gpu_mem_allocator(gpu_mem);
-  BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator);
-  m->profiling = topk->profiling;
-  m->inference_debugging = topk->inference_debugging;
-  std::strcpy(m->op_name, topk->name);
-  m->layer_guid = topk->layer_guid;
-  m->sorted = topk->sorted;
-  m->max_beam_width = topk->max_beam_width;
-  m->input_type[0] = topk->inputs[0]->data_type;
-  return m;
-}
-
-void BeamTopK::forward(FFModel const &ff) {
-  assert(false);
-}
-
-FutureMap BeamTopK::inference(FFModel const &ff,
-                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                              std::vector<ParallelTensor> const &batch_inputs,
-                              std::vector<ParallelTensor> const &batch_outputs,
-                              MachineView const *mv) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  parallel_is = batch_outputs[0]->parallel_is;
-  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
-  set_argumentmap_for_inference(ff, argmap, batch_outputs[0]);
-  size_t machine_view_hash = view->hash();
-
-  IndexLauncher launcher(BEAM_TOPK_INF_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         machine_view_hash);
-  launcher.add_future(bc);
-  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[1]->region));
-  launcher.add_field(2, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[2]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[2]->region));
-  launcher.add_field(3, FID_DATA);
-
-  return runtime->execute_index_space(ctx, launcher);
-}
-
-SsmInferenceResult
-    BeamTopK::inference_task(Task const *task,
-                             std::vector<PhysicalRegion> const &regions,
-                             Context ctx,
-                             Runtime *runtime) {
-
-  assert(regions.size() == 4);
-  assert(task->regions.size() == 4);
-
-  BeamTopKMeta *m = *((BeamTopKMeta **)task->local_args);
-  TreeSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
-
-  if (bc.num_active_tokens() == 0) {
-    SsmInferenceResult ir;
-    return ir;
-  }
-
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW index = helperGetGenericTensorAccessorWO(
-      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW value = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO(
-      DT_INT32, regions[3], task->regions[3], FID_DATA, ctx, runtime);
-
-  Domain input_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-
-  int *index_ptr = index.get_int32_ptr();
-  float *value_ptr = value.get_float_ptr();
-  int *parent_ptr = parent.get_int32_ptr();
-
-  // embedding size: eg. 4096
-  int length = input_domain.hi()[0] - input_domain.lo()[0] + 1;
-  // total token nums
-  size_t batch_size = bc.num_active_tokens();
-
-  // need meta for: how many sub requests in a main request
-  BeamTopK::forward_kernel_wrapper(m,
-                                   &bc,
-                                   input,
-                                   value_ptr,
-                                   index_ptr,
-                                   parent_ptr,
-                                   batch_size,
-                                   length,
-                                   m->sorted);
-
-  SsmInferenceResult ir;
-
-  download_tensor<int>(index_ptr, ir.token_ids, batch_size * m->max_beam_width);
-  download_tensor<float>(value_ptr, ir.probs, batch_size * m->max_beam_width);
-  download_tensor<int>(
-      parent_ptr, ir.parent_id, batch_size * m->max_beam_width);
-
-  if (m->inference_debugging) {
-    assert(task->index_point.get_dim() == 1);
-    int shard_id = task->index_point.point_data[0];
-    BeamTopK::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, {}, {index, value, parent});
-  }
-
-  return ir;
-}
-
-void BeamTopK::backward(FFModel const &ff) {
-  assert(false);
-}
-
-void BeamTopK::serialize(Legion::Serializer &sez) const {
-  sez.serialize(this->layer_guid.id);
-  sez.serialize(this->layer_guid.transformer_layer_id);
-  sez.serialize(this->layer_guid.model_id);
-  sez.serialize(this->sorted);
-  sez.serialize(this->max_beam_width);
-  sez.serialize(strlen(this->name));
-  sez.serialize(this->name, strlen(this->name));
-}
-
-Node BeamTopK::deserialize(FFModel &ff,
-                           Legion::Deserializer &dez,
-                           ParallelTensor inputs[],
-                           int num_inputs) {
-  assert(num_inputs == 1);
-  bool sorted;
-  size_t id, transformer_layer_id, deserialized_model_id;
-  int max_beam_width;
-  dez.deserialize(id);
-  dez.deserialize(transformer_layer_id);
-  dez.deserialize(deserialized_model_id);
-  LayerID layer_guid(id, transformer_layer_id, deserialized_model_id);
-  dez.deserialize(sorted);
-  dez.deserialize(max_beam_width);
-  size_t name_len;
-  char name[MAX_OPNAME] = {0};
-  dez.deserialize(name_len);
-  dez.deserialize(name, name_len);
-
-  BeamTopKParams params;
-  params.layer_guid = layer_guid;
-  params.sorted = sorted;
-  params.max_beam_width = max_beam_width;
-  strcpy(params.name, name);
-  return ff.get_or_create_node<BeamTopK>(inputs[0], params);
-}
-
-Op *BeamTopK::materialize(FFModel &ff,
-                          ParallelTensor inputs[],
-                          int num_inputs) const {
-  BeamTopKParams params = get_params();
-  return new BeamTopK(ff, params, inputs[0], this->name);
-}
-
-bool BeamTopK::measure_operator_cost(Simulator *sim,
-                                     MachineView const &mv,
-                                     CostMetrics &cost_metrics) const {
-  return false;
-}
-
-}; // namespace FlexFlow
-
-namespace std {
-size_t hash<FlexFlow::BeamTopKParams>::operator()(
-    FlexFlow::BeamTopKParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.layer_guid.id);
-  hash_combine(key, params.sorted);
-  hash_combine(key, params.max_beam_width);
-  return key;
-}
-}; // namespace std
diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp
deleted file mode 100644
index 5ff3c29ea..000000000
--- a/src/ops/beam_topk.cpp
+++ /dev/null
@@ -1,723 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/ops/beam_topk.h"
-#include "flexflow/ffconst_utils.h"
-#include "flexflow/utils/hip_helper.h"
-#include <hip/hip_runtime.h>
-
-namespace FlexFlow {
-// declare Legion names
-using Legion::coord_t;
-
-enum class HeapType { kMinHeap, kMaxHeap };
-enum class PreferIndices { kLower, kHigher };
-
-LegionRuntime::Logger::Category log_beam_topk("BeamTopK");
-
-template <typename T>
-struct Entry {
-  int index;
-  T value;
-};
-
-template <typename T>
-struct LinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return data[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-};
-
-template <typename T>
-struct IndirectLinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return backing_data[data[i].index].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-  Entry *const backing_data;
-};
-
-template <typename T>
-struct StridedData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index * blockDim.x + threadIdx.x];
-  }
-
-  __device__ int get_index(int i) const {
-    return (*this)[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return (*this)[i].value;
-  }
-
-  Entry *const data;
-};
-
-// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-struct IndexedHeap {
-  typedef typename Data<T>::Entry Entry;
-  Data<T> const data;
-  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
-
-  __device__ bool is_above(int left, int right) {
-    T left_value = data.get_value(left);
-    T right_value = data.get_value(right);
-    if (left_value == right_value) {
-      if (preferIndices == PreferIndices::kLower) {
-        return data.get_index(left) < data.get_index(right);
-      } else {
-        return data.get_index(left) > data.get_index(right);
-      }
-    }
-    if (heapType == HeapType::kMinHeap) {
-      return left_value < right_value;
-    } else {
-      return left_value > right_value;
-    }
-  }
-
-  __device__ void assign(int i, Entry const &entry) {
-    data[i] = entry;
-  }
-
-  __device__ void push_up(int i) {
-    int child = i;
-    int parent;
-    for (; child > 0; child = parent) {
-      parent = (child - 1) / 2;
-      if (!is_above(child, parent)) {
-        // Heap property satisfied.
-        break;
-      }
-      swap(child, parent);
-    }
-  }
-
-  __device__ void swap(int a, int b) {
-    auto tmp = data[b];
-    data[b] = data[a];
-    data[a] = tmp;
-  }
-
-  __device__ void push_root_down(int k) {
-    push_down(0, k);
-  }
-
-  // MAX-HEAPIFY in Cormen
-  __device__ void push_down(int node, int k) {
-    while (true) {
-      int const left = 2 * node + 1;
-      int const right = left + 1;
-      int smallest = node;
-      if (left < k && is_above(left, smallest)) {
-        smallest = left;
-      }
-      if (right < k && is_above(right, smallest)) {
-        smallest = right;
-      }
-      if (smallest == node) {
-        break;
-      }
-      swap(smallest, node);
-      node = smallest;
-    }
-  }
-
-  // BUILD-MAX-HEAPIFY in Cormen
-  __device__ void build(int k) {
-    for (int node = (k - 1) / 2; node >= 0; node--) {
-      push_down(node, k);
-    }
-  }
-
-  // HEAP-EXTRACT-MAX in Cormen
-  __device__ void remove_root(int k) {
-    data[0] = data[k - 1];
-    push_root_down(k - 1);
-  }
-
-  // in-place HEAPSORT in Cormen
-  // This method destroys the heap property.
-  __device__ void sort(int k) {
-    for (int slot = k - 1; slot > 0; slot--) {
-      // This is like remove_root but we insert the element at the end.
-      swap(slot, 0);
-      // Heap is now an element smaller.
-      push_root_down(/*k=*/slot);
-    }
-  }
-
-  __device__ void replace_root(Entry const &entry, int k) {
-    data[0] = entry;
-    push_root_down(k);
-  }
-
-  __device__ Entry const &root() {
-    return data[0];
-  }
-};
-
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-__device__ IndexedHeap<heapType, preferIndices, Data, T>
-    make_indexed_heap(typename Data<T>::Entry *data) {
-  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
-}
-
-// heapBeamTopK walks over [input, input+length) with `step_size` stride
-// starting at `start_index`. It builds a top-`k` heap that is stored in
-// `heap_entries` using `Accessor` to access elements in `heap_entries`. If
-// sorted=true, the elements will be sorted at the end.
-template <typename T, template <typename> class Data = LinearData>
-__device__ void heapBeamTopK(T const *__restrict__ input,
-                             int batch_index,
-                             int length,
-                             int k,
-                             Entry<T> *__restrict__ heap_entries,
-                             bool sorted = false,
-                             int start_index = 0,
-                             int step_size = 1) {
-  assert(k <= length);
-  auto heap =
-      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
-          heap_entries);
-
-  int heap_end_index = start_index + k * step_size;
-  if (heap_end_index > length) {
-    heap_end_index = length;
-  }
-  // Initialize the min-heap.
-  for (int index = start_index, slot = 0; index < heap_end_index;
-       index += step_size, slot++) {
-    heap.assign(slot, {index, input[index]});
-  }
-
-  heap.build(k);
-
-  // Now iterate over the remaining items.
-  // If an item is smaller than the min element, it is not amongst the top k.
-  // Otherwise, replace the min element with it and push upwards.
-  for (int index = heap_end_index; index < length; index += step_size) {
-    // We prefer elements with lower indices. This is given here.
-    // Later elements automatically have higher indices, so can be discarded.
-    if (input[index] > heap.root().value) {
-      // This element should replace the min.
-      heap.replace_root({index, input[index]}, k);
-    }
-  }
-
-  // Sort if wanted.
-  if (sorted) {
-    heap.sort(k);
-  }
-
-  // if(batch_index == 0){
-  //   printf("top elemmments: %d, value %.15f\n", start_index,
-  //   heap.root().value);
-  // }
-}
-
-template <typename T>
-__device__ void mergeBeamShards(int num_shards,
-                                int batch_index,
-                                int k,
-                                int max_heap_size,
-                                int request_id,
-                                int *parent_id,
-                                T *probs,
-                                Entry<T> *__restrict__ entries,
-                                Entry<T> *__restrict__ top_k_heap,
-                                float *top_k_values,
-                                int *top_k_indices,
-                                int *top_k_parents,
-                                bool verbose) {
-  // If k < num_shards, we can use a min-heap with k elements to get the top k
-  // of the sorted blocks.
-  // If k > num_shards, we can initialize a min-heap with the top element from
-  // each sorted block.
-  int const heap_size = k < num_shards ? k : num_shards;
-  // printf("see value: %f", entries[0].value);
-  // Min-heap part.
-
-  {
-    auto min_heap = IndexedHeap<HeapType::kMinHeap,
-                                PreferIndices::kHigher,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Initialize the heap as a min-heap.
-    for (int slot = 0; slot < heap_size; slot++) {
-      // int beam = (slot % max_heap_size) / k;
-      /* Reserved: BatchConfig Updated, leave beamsearch to kill */T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((slot % max_heap_size) / k)];
-      min_heap.assign(slot, {slot, (entries[slot].value * prob)});
-    }
-    min_heap.build(heap_size);
-
-    // Now perform top k with the remaining shards (if num_shards > heap_size).
-    for (int shard = heap_size; shard < num_shards; shard++) {
-      auto const entry = entries[shard];
-      auto const root = min_heap.root();
-
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((shard % max_heap_size) / k)];
-      if (entry.value * prob < root.value) {
-        continue;
-      }
-      if (entry.value * prob == root.value &&
-          entry.index > entries[root.index].index) {
-        continue;
-      }
-      // This element should replace the min.
-      min_heap.replace_root({shard, entry.value * prob}, heap_size);
-    }
-  }
-
-  // Max-part.
-  {
-    // Turn the min-heap into a max-heap in-place.
-    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
-                                PreferIndices::kLower,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Heapify into a max heap.
-    max_heap.build(heap_size);
-
-    // Now extract the minimum k-1 times.
-    // k is treated specially.
-    int const last_k = k - 1;
-    for (int rank = 0; rank < last_k; rank++) {
-      Entry<T> const &max_element = max_heap.root();
-      top_k_values[rank] = __half2float(max_element.value);
-      int shard_index = max_element.index;
-      top_k_indices[rank] = entries[shard_index].index;
-      top_k_parents[rank] =
-          parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-                    ((shard_index % max_heap_size) / k)];
-      int next_shard_index = shard_index + num_shards;
-
-      T prob = probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((next_shard_index % max_heap_size) / k)];
-
-      max_heap.replace_root(
-          {next_shard_index, entries[next_shard_index].value * prob},
-          heap_size);
-    }
-
-    // rank == last_k.
-    Entry<T> const &max_element = max_heap.root();
-    top_k_values[last_k] = __half2float(max_element.value);
-    int shard_index = max_element.index;
-    top_k_indices[last_k] = entries[shard_index].index;
-    top_k_parents[last_k] =
-        parent_id[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-                  ((shard_index % max_heap_size) / k)];
-  }
-}
-
-template <typename T>
-__global__ void
-    mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) {
-  using T_ACC = T;
-  const int64_t i = blockIdx.x;
-  for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
-    const int64_t index = i * N + j;
-    Y[index] = static_cast<T_ACC>(X[index]) * static_cast<T_ACC>(rstd[i]);
-  }
-}
-
-template <typename T>
-__global__ void beam_topk_forward_kernel(T const *__restrict__ input,
-                                         size_t shared_memory_size,
-                                         int length,
-                                         int k,
-                                         int max_heap_size,
-                                         int *parent_ids,
-                                         T *acc_probs,
-                                         int *gpu_block_start_index,
-                                         int *gpu_request_id,
-                                         int *tokens_per_request,
-                                         bool sorted,
-                                         float *__restrict__ output,
-                                         int *__restrict__ indices,
-                                         int *__restrict__ parents,
-                                         bool verbose) {
-  __shared__ char shared_memory[48 << 10];
-  int const batch_index = blockIdx.x;
-  // T const *batch_input = input + batch_index * length;
-  int const thread_index = threadIdx.x;
-  int const thread_count = blockDim.x;
-  int const request_id = gpu_request_id[batch_index];
-  int const token_nums = tokens_per_request[batch_index];
-  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
-
-  int sub_request_id = thread_index / k;
-  // if (verbose) {
-  //   printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d,
-  //   "
-  //          "request_id %d, token_nums %d\n",
-  //          batch_index,
-  //          thread_index,
-  //          sub_request_id,
-  //          request_id,
-  //          token_nums);
-  // }
-
-  T const *batch_input = input + gpu_block_start_index[batch_index] +
-                         (sub_request_id * token_nums * length);
-
-  // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index,
-  // thread_count, batch_index);
-  heapBeamTopK<T, StridedData>(batch_input,
-                               batch_index,
-                               length,
-                               k,
-                               shared_entries,
-                               true,
-                               thread_index % k,
-                               k);
-  __syncthreads();
-  // printf("beam thread index %d, thread_count %d, thread index %d, batch_index
-  // "
-  //        "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d,
-  //        offset: %d, offset2 %d, sub_request_id %d\n", thread_index,
-  //        thread_count,
-  //        thread_index,
-  //        batch_index,
-  //        k,
-  //        parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS +
-  //        sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS +
-  //        sub_request_id], sub_request_id, request_id,
-  //        gpu_block_start_index[batch_index],
-  //        batch_index * length,
-  //        sub_request_id);
-
-  if (thread_index == 0) {
-    // merge beam_width heaps and store the parent
-    // find which req it belongs to, replace the offset
-    // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n",
-    //       batch_index,
-    //       sub_request_id,
-    //       acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-    //                 sub_request_id]);
-    int const offset = batch_index * k;
-    auto batch_output = output + offset;
-    auto batch_indices = indices + offset;
-    auto batch_parents = parents + offset;
-    Entry<T> *top_k_heap = shared_entries + thread_count * k;
-
-    // if(batch_index == 0 && verbose) {
-    //   for(int i = 0; i < 18; i++){
-    //       printf("see value: %.15f\n", shared_entries[i].value);
-    //   }
-    // }
-
-    // get parent/acc based on the sub request and main request
-    mergeBeamShards(thread_count,
-                    batch_index,
-                    k,
-                    max_heap_size,
-                    request_id,
-                    parent_ids,
-                    acc_probs,
-                    shared_entries,
-                    top_k_heap,
-                    batch_output,
-                    batch_indices,
-                    batch_parents,
-                    verbose /*verbose prints*/);
-  }
-}
-
-/*static*/
-template <typename DT>
-void BeamTopK::forward_kernel(BeamTopKMeta const *m,
-                              TreeSearchBatchConfig const *bc,
-                              DT const *input_ptr,
-                              float *output_ptr,
-                              int *indices_ptr,
-                              int *parent_ptr,
-                              int batch_size,
-                              int length,
-                              bool sorted,
-                              hipStream_t stream) {
-  // Adopted from TensorFlow's BeamTopK implementation
-  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
-
-  int num_shards = 0;
-  int max_heap_size = 0;
-  int max_beam_width = 0;
-  int req_index = 0;
-
-  // sub request
-  int const *sub_requests = bc->sub_requests;
-
-  // std::vector<BatchConfig::BeamSlot> beam_slots = bc->beam_slots;
-  // assert(bc->beam_slots.size() > 0);
-
-  int beam_num_blocks = 0;
-  std::vector<int> beam_block_start_index;
-  std::vector<int> request_id;
-  std::vector<int> tokens_per_request;
-
-  int block_start_index = 0;
-
-  // a data structure for prob, parent_id,
-  int max_total_requests =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests();
-  int parent_ids[max_total_requests];
-  DT acc_probs[max_total_requests];
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i]) {
-      continue;
-    }
-    assert(bc->beamRequestsInfo[i].beam_size > 0);
-
-    // int num_new_tokens = bc->num_processing_tokens[i];
-    int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-
-    // get beam size;
-    int beam_size = bc->beamRequestsInfo[i].beam_size;
-
-    // initial request
-    log_beam_topk.debug() << "sub_requests: " << i << ", " << sub_requests[i]
-                          << "\n";
-    assert(sub_requests[i] > 0);
-    // process sub requests
-    for (int j = 0; j < sub_requests[i]; j++) {
-      parent_ids[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] = j;
-      // beam_slots[i].parent_id[j];
-      acc_probs[req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j] =
-          bc->beamRequestsInfo[i].probs[j];
-      log_beam_topk.debug()
-          << "probbbb req: " << i
-          << ", sub req probability : " << bc->beamRequestsInfo[i].probs[j]
-          << ", sub request id " << j << ", parent id "
-          << bc->beamRequestsInfo[i].parent_id[j] << ", data inddd"
-          << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j << "\n";
-    }
-
-    // process tokens
-    for (int k = 0; k < num_new_tokens; k++) {
-      beam_block_start_index.push_back(block_start_index);
-      request_id.push_back(i);
-      tokens_per_request.push_back(num_new_tokens);
-      block_start_index += length;
-      beam_num_blocks++;
-    }
-
-    max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]);
-    max_beam_width = std::max(max_beam_width, beam_size);
-    req_index += 1;
-    block_start_index += (sub_requests[i] - 1) * num_new_tokens * length;
-  }
-  log_beam_topk.debug() << "what index: " << block_start_index
-                        << ", block num: " << beam_num_blocks << "\n";
-
-  assert(batch_size >= beam_num_blocks);
-  assert(bc->num_active_requests() == req_index);
-
-  {
-    constexpr auto shared_memory_size = 48 << 10;
-    auto const heap_size = max_heap_size * sizeof(Entry<DT>);
-    // shared_memory_size = (num_shards + 1) * heap_size <=>
-    num_shards = shared_memory_size / heap_size - 1;
-    assert(num_shards > 0);
-    if (num_shards > CUDA_NUM_THREADS) {
-      num_shards = CUDA_NUM_THREADS;
-    }
-    log_beam_topk.debug() << "maxheap size:  " << max_heap_size << "\n";
-    log_beam_topk.debug() << "maxbeam width:  " << max_beam_width
-                          << ", heap size: " << heap_size << "\n";
-  }
-  // We are limited by the amount of shared memory we have per block.
-  size_t shared_memory_size =
-      (num_shards + 1) * max_heap_size * sizeof(Entry<DT>);
-
-  assert(num_shards >= (size_t)max_heap_size);
-  num_shards = max_heap_size;
-
-  checkCUDA(hipMemcpy(m->parent_ids,
-                      parent_ids,
-                      sizeof(int) * max_total_requests,
-                      hipMemcpyHostToDevice));
-  checkCUDA(hipMemcpy(m->acc_probs,
-                      acc_probs,
-                      sizeof(DT) * max_total_requests,
-                      hipMemcpyHostToDevice));
-  checkCUDA(hipMemcpy(m->block_start_index,
-                      beam_block_start_index.data(),
-                      sizeof(int) * beam_num_blocks,
-                      hipMemcpyHostToDevice));
-  checkCUDA(hipMemcpy(m->request_id,
-                      request_id.data(),
-                      sizeof(int) * beam_num_blocks,
-                      hipMemcpyHostToDevice));
-  checkCUDA(hipMemcpy(m->tokens_per_request,
-                      tokens_per_request.data(),
-                      sizeof(int) * beam_num_blocks,
-                      hipMemcpyHostToDevice));
-
-  beam_topk_forward_kernel<<<beam_num_blocks, num_shards, 0, stream>>>(
-      input_ptr,
-      shared_memory_size,
-      length,
-      max_beam_width,
-      max_heap_size,
-      m->parent_ids,
-      static_cast<DT *>(m->acc_probs),
-      m->block_start_index,
-      m->request_id,
-      m->tokens_per_request,
-      sorted,
-      output_ptr,
-      indices_ptr,
-      parent_ptr,
-      false /*verbose*/ // depth == 1
-  );
-
-  // merge sub
-}
-
-/*static*/
-void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m,
-                                      TreeSearchBatchConfig const *bc,
-                                      GenericTensorAccessorR const &input,
-                                      float *output_ptr,
-                                      int *indices_ptr,
-                                      int *parent_ptr,
-                                      int batch_size,
-                                      int length,
-                                      bool sorted) {
-  hipStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-
-  hipEvent_t t_start, t_end;
-  if (m->profiling) {
-    checkCUDA(hipEventCreate(&t_start));
-    checkCUDA(hipEventCreate(&t_end));
-    checkCUDA(hipEventRecord(t_start, stream));
-  }
-
-  if (input.data_type == DT_HALF) {
-    BeamTopK::forward_kernel(m,
-                             bc,
-                             input.get_half_ptr(),
-                             output_ptr,
-                             indices_ptr,
-                             parent_ptr,
-                             batch_size,
-                             length,
-                             sorted,
-                             stream);
-  } else if (input.data_type == DT_FLOAT) {
-    BeamTopK::forward_kernel(m,
-                             bc,
-                             input.get_float_ptr(),
-                             output_ptr,
-                             indices_ptr,
-                             parent_ptr,
-                             batch_size,
-                             length,
-                             sorted,
-                             stream);
-  }
-
-  if (m->profiling) {
-    checkCUDA(hipEventRecord(t_end, stream));
-    checkCUDA(hipEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end));
-    checkCUDA(hipEventDestroy(t_start));
-    checkCUDA(hipEventDestroy(t_end));
-    printf("[BeamTopK] forward time = %.2lfms\n", elapsed);
-  }
-}
-
-BeamTopKMeta::BeamTopKMeta(FFHandler handler,
-                           Op const *op,
-                           MemoryAllocator &gpu_mem_allocator)
-    : OpMeta(handler) {
-  DataType data_type = op->inputs[0]->data_type;
-  int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
-  int max_requests_per_batch = BatchConfig::max_requests_per_batch();
-  size_t parent_id_size =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
-  size_t acc_probs_size =
-      BeamSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
-  size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch;
-  size_t request_id_size = max_tokens_per_batch * max_requests_per_batch;
-  size_t tokens_per_request_size =
-      max_tokens_per_batch * max_requests_per_batch;
-  size_t totalSize = sizeof(int) * parent_id_size +
-                     data_type_size(data_type) * acc_probs_size +
-                     sizeof(int) * block_start_index_size +
-                     sizeof(int) * request_id_size +
-                     sizeof(int) * tokens_per_request_size;
-
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
-  parent_ids = gpu_mem_allocator.allocate_instance<int>(parent_id_size);
-  if (data_type == DT_FLOAT) {
-    acc_probs = gpu_mem_allocator.allocate_instance<float>(acc_probs_size);
-  } else if (data_type == DT_HALF) {
-    acc_probs = gpu_mem_allocator.allocate_instance<half>(acc_probs_size);
-  } else {
-    assert(false);
-  }
-
-  block_start_index =
-      gpu_mem_allocator.allocate_instance<int>(block_start_index_size);
-  request_id = gpu_mem_allocator.allocate_instance<int>(request_id_size);
-  tokens_per_request =
-      gpu_mem_allocator.allocate_instance<int>(tokens_per_request_size);
-}
-
-BeamTopKMeta::~BeamTopKMeta(void) {
-  if (reserveInst != Realm::RegionInstance::NO_INST) {
-    reserveInst.destroy();
-  }
-}
-}; // namespace FlexFlow
diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu
deleted file mode 100644
index 606f08997..000000000
--- a/src/ops/beam_topk.cu
+++ /dev/null
@@ -1,765 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/ffconst_utils.h"
-#include "flexflow/ops/beam_topk.h"
-#include "flexflow/request_manager.h"
-#include "flexflow/utils/cuda_helper.h"
-
-namespace FlexFlow {
-// declare Legion names
-using Legion::coord_t;
-
-enum class HeapType { kMinHeap, kMaxHeap };
-enum class PreferIndices { kLower, kHigher };
-
-LegionRuntime::Logger::Category log_beam_topk("BeamTopK");
-
-template <typename T>
-struct Entry {
-  int index;
-  T value;
-};
-
-template <typename T>
-struct LinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return data[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-};
-
-template <typename T>
-struct IndirectLinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return backing_data[data[i].index].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-  Entry *const backing_data;
-};
-
-template <typename T>
-struct StridedData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index * blockDim.x + threadIdx.x];
-  }
-
-  __device__ int get_index(int i) const {
-    return (*this)[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return (*this)[i].value;
-  }
-
-  Entry *const data;
-};
-
-// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-struct IndexedHeap {
-  typedef typename Data<T>::Entry Entry;
-  Data<T> const data;
-  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
-
-  __device__ bool is_above(int left, int right) {
-    T left_value = data.get_value(left);
-    T right_value = data.get_value(right);
-    if (left_value == right_value) {
-      if (preferIndices == PreferIndices::kLower) {
-        return data.get_index(left) < data.get_index(right);
-      } else {
-        return data.get_index(left) > data.get_index(right);
-      }
-    }
-    if (heapType == HeapType::kMinHeap) {
-      return left_value < right_value;
-    } else {
-      return left_value > right_value;
-    }
-  }
-
-  __device__ void assign(int i, Entry const &entry) {
-    data[i] = entry;
-  }
-
-  __device__ void push_up(int i) {
-    int child = i;
-    int parent;
-    for (; child > 0; child = parent) {
-      parent = (child - 1) / 2;
-      if (!is_above(child, parent)) {
-        // Heap property satisfied.
-        break;
-      }
-      swap(child, parent);
-    }
-  }
-
-  __device__ void swap(int a, int b) {
-    auto tmp = data[b];
-    data[b] = data[a];
-    data[a] = tmp;
-  }
-
-  __device__ void push_root_down(int k) {
-    push_down(0, k);
-  }
-
-  // MAX-HEAPIFY in Cormen
-  __device__ void push_down(int node, int k) {
-    while (true) {
-      int const left = 2 * node + 1;
-      int const right = left + 1;
-      int smallest = node;
-      if (left < k && is_above(left, smallest)) {
-        smallest = left;
-      }
-      if (right < k && is_above(right, smallest)) {
-        smallest = right;
-      }
-      if (smallest == node) {
-        break;
-      }
-      swap(smallest, node);
-      node = smallest;
-    }
-  }
-
-  // BUILD-MAX-HEAPIFY in Cormen
-  __device__ void build(int k) {
-    for (int node = (k - 1) / 2; node >= 0; node--) {
-      push_down(node, k);
-    }
-  }
-
-  // HEAP-EXTRACT-MAX in Cormen
-  __device__ void remove_root(int k) {
-    data[0] = data[k - 1];
-    push_root_down(k - 1);
-  }
-
-  // in-place HEAPSORT in Cormen
-  // This method destroys the heap property.
-  __device__ void sort(int k) {
-    for (int slot = k - 1; slot > 0; slot--) {
-      // This is like remove_root but we insert the element at the end.
-      swap(slot, 0);
-      // Heap is now an element smaller.
-      push_root_down(/*k=*/slot);
-    }
-  }
-
-  __device__ void replace_root(Entry const &entry, int k) {
-    data[0] = entry;
-    push_root_down(k);
-  }
-
-  __device__ Entry const &root() {
-    return data[0];
-  }
-};
-
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-__device__ IndexedHeap<heapType, preferIndices, Data, T>
-    make_indexed_heap(typename Data<T>::Entry *data) {
-  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
-}
-
-// heapBeamTopK walks over [input, input+length) with `step_size` stride
-// starting at `start_index`. It builds a top-`k` heap that is stored in
-// `heap_entries` using `Accessor` to access elements in `heap_entries`. If
-// sorted=true, the elements will be sorted at the end.
-template <typename T, template <typename> class Data = LinearData>
-__device__ void heapBeamTopK(T const *__restrict__ input,
-                             int batch_index,
-                             int length,
-                             int k,
-                             Entry<T> *__restrict__ heap_entries,
-                             bool sorted = false,
-                             int start_index = 0,
-                             int step_size = 1) {
-  assert(k <= length);
-  auto heap =
-      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
-          heap_entries);
-
-  int heap_end_index = start_index + k * step_size;
-  if (heap_end_index > length) {
-    heap_end_index = length;
-  }
-  // Initialize the min-heap.
-  for (int index = start_index, slot = 0; index < heap_end_index;
-       index += step_size, slot++) {
-    heap.assign(slot, {index, input[index]});
-  }
-
-  heap.build(k);
-
-  // Now iterate over the remaining items.
-  // If an item is smaller than the min element, it is not amongst the top k.
-  // Otherwise, replace the min element with it and push upwards.
-  for (int index = heap_end_index; index < length; index += step_size) {
-    // We prefer elements with lower indices. This is given here.
-    // Later elements automatically have higher indices, so can be discarded.
-    if (input[index] > heap.root().value) {
-      // This element should replace the min.
-      heap.replace_root({index, input[index]}, k);
-    }
-  }
-
-  // Sort if wanted.
-  if (sorted) {
-    heap.sort(k);
-  }
-
-  // if(batch_index == 0){
-  //   printf("top elemmments: %d, value %.15f\n", start_index,
-  //   heap.root().value);
-  // }
-}
-
-template <typename T>
-__device__ void mergeBeamShards(int num_shards,
-                                int batch_index,
-                                int k,
-                                int max_heap_size,
-                                int request_id,
-                                int *parent_id,
-                                T *probs,
-                                Entry<T> *__restrict__ entries,
-                                Entry<T> *__restrict__ top_k_heap,
-                                float *top_k_values,
-                                int *top_k_indices,
-                                int *top_k_parents,
-                                bool verbose) {
-  // If k < num_shards, we can use a min-heap with k elements to get the top k
-  // of the sorted blocks.
-  // If k > num_shards, we can initialize a min-heap with the top element from
-  // each sorted block.
-  int const heap_size = k < num_shards ? k : num_shards;
-  // printf("see value: %f", entries[0].value);
-  // Min-heap part.
-
-  {
-    auto min_heap = IndexedHeap<HeapType::kMinHeap,
-                                PreferIndices::kHigher,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Initialize the heap as a min-heap.
-    for (int slot = 0; slot < heap_size; slot++) {
-      // int beam = (slot % max_heap_size) / k;
-      /* Reserved: BatchConfig Updated, leave beamsearch to kill */T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((slot % max_heap_size) / k)];
-      min_heap.assign(slot, {slot, (entries[slot].value * prob)});
-      if (verbose && batch_index == 0) {
-        printf("slot %d, value %.15f, prob %15f\n",
-               slot,
-               static_cast<float>(entries[slot].value),
-               static_cast<float>(prob));
-      }
-    }
-    min_heap.build(heap_size);
-
-    // Now perform top k with the remaining shards (if num_shards > heap_size).
-    for (int shard = heap_size; shard < num_shards; shard++) {
-      auto const entry = entries[shard];
-      auto const root = min_heap.root();
-
-      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((shard % max_heap_size) / k)];
-      if (verbose && batch_index == 0) {
-        printf("shard %d, index %d, value %.15f, prob %.15f\n",
-               shard,
-               entry.index,
-               static_cast<float>(entry.value),
-               static_cast<float>(prob));
-      }
-      if (entry.value * prob < root.value) {
-        continue;
-      }
-      if (entry.value * prob == root.value &&
-          entry.index > entries[root.index].index) {
-        continue;
-      }
-      // This element should replace the min.
-      min_heap.replace_root({shard, entry.value * prob}, heap_size);
-    }
-  }
-
-  // Max-part.
-  {
-    // Turn the min-heap into a max-heap in-place.
-    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
-                                PreferIndices::kLower,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Heapify into a max heap.
-    max_heap.build(heap_size);
-
-    // Now extract the minimum k-1 times.
-    // k is treated specially.
-    int const last_k = k - 1;
-    for (int rank = 0; rank < last_k; rank++) {
-      Entry<T> const &max_element = max_heap.root();
-      top_k_values[rank] = __half2float(max_element.value);
-      int shard_index = max_element.index;
-      top_k_indices[rank] = entries[shard_index].index;
-      top_k_parents[rank] =
-          parent_id[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                    ((shard_index % max_heap_size) / k)];
-      int next_shard_index = shard_index + num_shards;
-
-      T prob = probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                     ((next_shard_index % max_heap_size) / k)];
-      // if (batch_index == 0) {
-      //   printf("next_shard_index %d, value %.15f, prob %.15f\n",
-      //          next_shard_index,
-      //          entries[next_shard_index].value,
-      //          prob);
-      // }
-      max_heap.replace_root(
-          {next_shard_index, entries[next_shard_index].value * prob},
-          heap_size);
-    }
-
-    // rank == last_k.
-    Entry<T> const &max_element = max_heap.root();
-    top_k_values[last_k] = __half2float(max_element.value);
-    int shard_index = max_element.index;
-    top_k_indices[last_k] = entries[shard_index].index;
-    top_k_parents[last_k] =
-        parent_id[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                  ((shard_index % max_heap_size) / k)];
-  }
-}
-
-template <typename T>
-__global__ void
-    mergeSubRequestsKernel(int64_t N, T const *X, T const *rstd, T *Y) {
-  using T_ACC = T;
-  int64_t const i = blockIdx.x;
-  for (int64_t j = threadIdx.x; j < N; j += blockDim.x) {
-    int64_t const index = i * N + j;
-    Y[index] = static_cast<T_ACC>(X[index]) * static_cast<T_ACC>(rstd[i]);
-  }
-}
-
-template <typename T>
-__global__ void beam_topk_forward_kernel(T const *__restrict__ input,
-                                         size_t shared_memory_size,
-                                         int length,
-                                         int k,
-                                         int max_heap_size,
-                                         int *parent_ids,
-                                         T *acc_probs,
-                                         int *gpu_block_start_index,
-                                         int *gpu_request_id,
-                                         int *tokens_per_request,
-                                         bool sorted,
-                                         float *__restrict__ output,
-                                         int *__restrict__ indices,
-                                         int *__restrict__ parents,
-                                         bool verbose) {
-  __shared__ char shared_memory[48 << 10];
-  int const batch_index = blockIdx.x;
-  // T const *batch_input = input + batch_index * length;
-  int const thread_index = threadIdx.x;
-  int const thread_count = blockDim.x;
-  int const request_id = gpu_request_id[batch_index];
-  int const token_nums = tokens_per_request[batch_index];
-  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
-
-  int sub_request_id = thread_index / k;
-  // if (verbose) {
-  //   printf("beam kernel: batch_index: %d, thread_index %d, sub_request_id %d,
-  //   "
-  //          "request_id %d, token_nums %d\n",
-  //          batch_index,
-  //          thread_index,
-  //          sub_request_id,
-  //          request_id,
-  //          token_nums);
-  // }
-
-  T const *batch_input = input + gpu_block_start_index[batch_index] +
-                         (sub_request_id * token_nums * length);
-
-  if (verbose && batch_index == 0) {
-    printf("request 0 start index: thread index %d, offset %d, batch_input %p, "
-           "acc index %d acc "
-           "prob %f, thread_count %d, request_id %d\n",
-           thread_index,
-           gpu_block_start_index[batch_index] +
-               (sub_request_id * token_nums * length),
-           batch_input,
-           request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH + sub_request_id,
-           static_cast<float>(
-               acc_probs[request_id * TreeSearchBatchConfig::MAX_BEAM_WIDTH +
-                         sub_request_id]),
-           thread_count,
-           request_id);
-  }
-  // printf("thread index %d, thread_count %d, batch_index %d\n", thread_index,
-  // thread_count, batch_index);
-  heapBeamTopK<T, StridedData>(batch_input,
-                               batch_index,
-                               length,
-                               k,
-                               shared_entries,
-                               true,
-                               thread_index % k,
-                               k);
-  __syncthreads();
-  // printf("beam thread index %d, thread_count %d, thread index %d, batch_index
-  // "
-  //        "%d, k %d, parent_id %d, acc_prob: %f, sub id: %d, request_id: %d,
-  //        offset: %d, offset2 %d, sub_request_id %d\n", thread_index,
-  //        thread_count,
-  //        thread_index,
-  //        batch_index,
-  //        k,
-  //        parent_ids[request_id * BatchConfig::MAX_NUM_BEAMS +
-  //        sub_request_id], acc_probs[request_id * BatchConfig::MAX_NUM_BEAMS +
-  //        sub_request_id], sub_request_id, request_id,
-  //        gpu_block_start_index[batch_index],
-  //        batch_index * length,
-  //        sub_request_id);
-
-  if (thread_index == 0) {
-    // merge beam_width heaps and store the parent
-    // find which req it belongs to, replace the offset
-    // printf("merge heaps, batch index: %d, sub_request_id %d, value %f\n",
-    //       batch_index,
-    //       sub_request_id,
-    //       acc_probs[request_id * BeamSearchBatchConfig::MAX_BEAM_WIDTH +
-    //                 sub_request_id]);
-    int const offset = batch_index * k;
-    auto batch_output = output + offset;
-    auto batch_indices = indices + offset;
-    auto batch_parents = parents + offset;
-    Entry<T> *top_k_heap = shared_entries + thread_count * k;
-
-    // if(batch_index == 0 && verbose) {
-    //   for(int i = 0; i < 18; i++){
-    //       printf("see value: %.15f\n", shared_entries[i].value);
-    //   }
-    // }
-
-    // get parent/acc based on the sub request and main request
-    mergeBeamShards(thread_count,
-                    batch_index,
-                    k,
-                    max_heap_size,
-                    request_id,
-                    parent_ids,
-                    acc_probs,
-                    shared_entries,
-                    top_k_heap,
-                    batch_output,
-                    batch_indices,
-                    batch_parents,
-                    verbose /*verbose prints*/);
-  }
-}
-
-/*static*/
-template <typename DT>
-void BeamTopK::forward_kernel(BeamTopKMeta const *m,
-                              TreeSearchBatchConfig const *bc,
-                              DT const *input_ptr,
-                              float *output_ptr,
-                              int *indices_ptr,
-                              int *parent_ptr,
-                              int batch_size,
-                              int length,
-                              bool sorted,
-                              cudaStream_t stream) {
-  // Adopted from TensorFlow's BeamTopK implementation
-  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
-
-  int num_shards = 0;
-  int max_heap_size = 0;
-  int max_beam_width = 0;
-  int req_index = 0;
-
-  // sub request
-  int const *sub_requests = bc->sub_requests;
-
-  // std::vector<BatchConfig::BeamSlot> beam_slots = bc->beam_slots;
-  // assert(bc->beam_slots.size() > 0);
-
-  int beam_num_blocks = 0;
-  std::vector<int> beam_block_start_index;
-  std::vector<int> request_id;
-  std::vector<int> tokens_per_request;
-
-  int block_start_index = 0;
-
-  // a data structure for prob, parent_id,
-  int max_total_requests =
-      TreeSearchBatchConfig::MAX_BEAM_WIDTH * bc->num_active_requests();
-  int parent_ids[max_total_requests];
-  DT acc_probs[max_total_requests];
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i]) {
-      continue;
-    }
-    assert(bc->beamRequestsInfo[i].beam_size > 0);
-
-    // int num_new_tokens = bc->num_processing_tokens[i];
-    int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-
-    // get beam size;
-    int beam_size = bc->beamRequestsInfo[i].beam_size;
-
-    // initial request
-    assert(sub_requests[i] > 0);
-    // process sub requests
-    for (int j = 0; j < sub_requests[i]; j++) {
-      parent_ids[req_index * TreeSearchBatchConfig::MAX_BEAM_WIDTH + j] = j;
-      // beam_slots[i].parent_id[j];
-      acc_probs[req_index * TreeSearchBatchConfig::MAX_BEAM_WIDTH + j] =
-          bc->beamRequestsInfo[i].probs[j];
-      // std::cout << "probbbb req: " << i << ", sub req probability : "
-      //           << bc->beamRequestsInfo[i].probs[j] << ", sub request id " <<
-      //           j
-      //           << ", parent id " << bc->beamRequestsInfo[i].parent_id[j]
-      //           << ", data inddd"
-      //           << req_index * BeamSearchBatchConfig::MAX_BEAM_WIDTH + j
-      //           << "\n";
-    }
-
-    // process tokens
-    for (int k = 0; k < num_new_tokens; k++) {
-      beam_block_start_index.push_back(block_start_index);
-      request_id.push_back(i);
-      tokens_per_request.push_back(num_new_tokens);
-      block_start_index += length;
-      beam_num_blocks++;
-    }
-
-    max_heap_size = std::max(max_heap_size, beam_size * sub_requests[i]);
-    max_beam_width = std::max(max_beam_width, beam_size);
-
-    req_index += 1;
-    block_start_index += (sub_requests[i] - 1) * num_new_tokens * length;
-  }
-  log_beam_topk.debug() << "what index: " << block_start_index
-                        << ", block num: " << beam_num_blocks << "\n";
-
-  assert(batch_size >= beam_num_blocks);
-  assert(bc->num_active_requests() == req_index);
-
-  {
-    constexpr auto shared_memory_size = 48 << 10;
-    auto const heap_size = max_heap_size * sizeof(Entry<DT>);
-    // shared_memory_size = (num_shards + 1) * heap_size <=>
-    num_shards = shared_memory_size / heap_size - 1;
-    assert(num_shards > 0);
-    if (num_shards > CUDA_NUM_THREADS) {
-      num_shards = CUDA_NUM_THREADS;
-    }
-    log_beam_topk.debug() << "maxheap size:  " << max_heap_size << "\n";
-    log_beam_topk.debug() << "maxbeam width:  " << max_beam_width
-                          << ", heap size: " << heap_size << "\n";
-  }
-  // We are limited by the amount of shared memory we have per block.
-  size_t shared_memory_size =
-      (num_shards + 1) * max_heap_size * sizeof(Entry<DT>);
-
-  assert(num_shards >= (size_t)max_heap_size);
-  num_shards = max_heap_size;
-
-  checkCUDA(cudaMemcpyAsync(m->parent_ids,
-                            parent_ids,
-                            sizeof(int) * max_total_requests,
-                            cudaMemcpyHostToDevice,
-                            stream));
-  checkCUDA(cudaMemcpyAsync(m->acc_probs,
-                            acc_probs,
-                            sizeof(DT) * max_total_requests,
-                            cudaMemcpyHostToDevice,
-                            stream));
-  // trick, set acc_probs to 0;
-  checkCUDA(cudaMemsetAsync(
-      m->acc_probs, 1.0, max_total_requests * sizeof(DT), stream));
-  checkCUDA(cudaMemcpyAsync(m->block_start_index,
-                            beam_block_start_index.data(),
-                            sizeof(int) * beam_num_blocks,
-                            cudaMemcpyHostToDevice,
-                            stream));
-  checkCUDA(cudaMemcpyAsync(m->request_id,
-                            request_id.data(),
-                            sizeof(int) * beam_num_blocks,
-                            cudaMemcpyHostToDevice,
-                            stream));
-  checkCUDA(cudaMemcpyAsync(m->tokens_per_request,
-                            tokens_per_request.data(),
-                            sizeof(int) * beam_num_blocks,
-                            cudaMemcpyHostToDevice,
-                            stream));
-
-  beam_num_blocks = bc->num_active_tokens();
-  beam_topk_forward_kernel<<<beam_num_blocks, num_shards, 0, stream>>>(
-      input_ptr,
-      shared_memory_size,
-      length,
-      max_beam_width,
-      max_heap_size,
-      m->parent_ids,
-      static_cast<DT *>(m->acc_probs),
-      m->block_start_index,
-      m->request_id,
-      m->tokens_per_request,
-      sorted,
-      output_ptr,
-      indices_ptr,
-      parent_ptr,
-      false /*verbose*/ // depth == 1
-  );
-
-  // merge sub
-}
-
-/*static*/
-void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m,
-                                      TreeSearchBatchConfig const *bc,
-                                      GenericTensorAccessorR const &input,
-                                      float *output_ptr,
-                                      int *indices_ptr,
-                                      int *parent_ptr,
-                                      int batch_size,
-                                      int length,
-                                      bool sorted) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-
-  cudaEvent_t t_start, t_end;
-  if (m->profiling) {
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-  }
-
-  if (input.data_type == DT_HALF) {
-    BeamTopK::forward_kernel(m,
-                             bc,
-                             input.get_half_ptr(),
-                             output_ptr,
-                             indices_ptr,
-                             parent_ptr,
-                             batch_size,
-                             length,
-                             sorted,
-                             stream);
-  } else if (input.data_type == DT_FLOAT) {
-    BeamTopK::forward_kernel(m,
-                             bc,
-                             input.get_float_ptr(),
-                             output_ptr,
-                             indices_ptr,
-                             parent_ptr,
-                             batch_size,
-                             length,
-                             sorted,
-                             stream);
-  }
-
-  if (m->profiling) {
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-    printf("[BeamTopK] forward time = %.2lfms\n", elapsed);
-  }
-}
-
-BeamTopKMeta::BeamTopKMeta(FFHandler handler,
-                           Op const *op,
-                           MemoryAllocator &gpu_mem_allocator)
-    : OpMeta(handler) {
-  DataType data_type = op->inputs[0]->data_type;
-  int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
-  int max_requests_per_batch = BatchConfig::max_requests_per_batch();
-  size_t parent_id_size =
-      TreeSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
-  size_t acc_probs_size =
-      TreeSearchBatchConfig::MAX_BEAM_WIDTH * max_requests_per_batch;
-  size_t block_start_index_size = max_tokens_per_batch * max_requests_per_batch;
-  size_t request_id_size = max_tokens_per_batch * max_requests_per_batch;
-  size_t tokens_per_request_size =
-      max_tokens_per_batch * max_requests_per_batch;
-  size_t totalSize = sizeof(int) * parent_id_size +
-                     data_type_size(data_type) * acc_probs_size +
-                     sizeof(int) * block_start_index_size +
-                     sizeof(int) * request_id_size +
-                     sizeof(int) * tokens_per_request_size;
-
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
-  parent_ids = gpu_mem_allocator.allocate_instance<int>(parent_id_size);
-  if (data_type == DT_FLOAT) {
-    acc_probs = gpu_mem_allocator.allocate_instance<float>(acc_probs_size);
-  } else if (data_type == DT_HALF) {
-    acc_probs = gpu_mem_allocator.allocate_instance<half>(acc_probs_size);
-  } else {
-    assert(false);
-  }
-
-  block_start_index =
-      gpu_mem_allocator.allocate_instance<int>(block_start_index_size);
-  request_id = gpu_mem_allocator.allocate_instance<int>(request_id_size);
-  tokens_per_request =
-      gpu_mem_allocator.allocate_instance<int>(tokens_per_request_size);
-}
-
-BeamTopKMeta::~BeamTopKMeta(void) {
-  if (reserveInst != Realm::RegionInstance::NO_INST) {
-    reserveInst.destroy();
-  }
-}
-}; // namespace FlexFlow
diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index c7b6e1257..b274e547c 100644
--- a/src/runtime/ffconst_utils.cc
+++ b/src/runtime/ffconst_utils.cc
@@ -116,8 +116,8 @@ std::string get_operator_type_name(OperatorType type) {
       return "TopK";
     case OP_ARG_TOPK:
       return "ArgTopK";
-    case OP_BEAM_TOPK:
-      return "BeamTopK";
+    // case OP_BEAM_TOPK:
+    //   return "BeamTopK";
     case OP_WHERE:
       return "Where";
     case OP_CEIL:
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index f8e8240cc..11a9bf363 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -21,7 +21,7 @@
 #include "flexflow/ops/argmax.h"
 #include "flexflow/ops/attention.h"
 #include "flexflow/ops/batch_matmul.h"
-#include "flexflow/ops/beam_topk.h"
+// #include "flexflow/ops/beam_topk.h"
 #include "flexflow/ops/cast.h"
 #include "flexflow/ops/concat.h"
 #include "flexflow/ops/conv_2d.h"
@@ -69,7 +69,7 @@ using FlexFlow::MachineView;
 LegionRuntime::Logger::Category log_graph("graph");
 LegionRuntime::Logger::Category log_simplify("graph_simplify");
 
-const Node Node::INVALID_NODE = Node();
+Node const Node::INVALID_NODE = Node();
 
 Node::Node(void) : guid(0), ptr(NULL) {}
 
@@ -1604,17 +1604,15 @@ T SearchHelper::graph_cost(Graph const *graph,
                            bool include_sink_compute_time) const {
   TAG_ENTER(this->logger);
   this->logger->debug() << "PCG::SearchHelper::graph_cost: sink("
-                        << sink.node.guid << ") "
-                        << "sink.view(" << sink.view.ndims << " "
-                        << sink.view.start_device_id << " " << sink.view.dim[0]
-                        << ") "
-                        << "source(" << source.node.guid << ") "
-                        << "source.view(" << source.view.ndims << " "
+                        << sink.node.guid << ") " << "sink.view("
+                        << sink.view.ndims << " " << sink.view.start_device_id
+                        << " " << sink.view.dim[0] << ") " << "source("
+                        << source.node.guid << ") " << "source.view("
+                        << source.view.ndims << " "
                         << source.view.start_device_id << " "
-                        << source.view.dim[0] << ") "
-                        << "resources(" << resources.num_nodes << " "
-                        << resources.start_gpu_id << " "
-                        << resources.available_gpus_per_node << ")";
+                        << source.view.dim[0] << ") " << "resources("
+                        << resources.num_nodes << " " << resources.start_gpu_id
+                        << " " << resources.available_gpus_per_node << ")";
   if (this->model->config.profiling) {
     graph->print_dot();
   }
@@ -1737,11 +1735,11 @@ T SearchHelper::graph_cost(Graph const *graph,
     this->logger->spew() << "  op_total_mem: " << metrics.op_total_mem;
     float op_total_mem_mb = (float)((metrics.op_total_mem) / 1e4) / 1e2;
     this->logger->debug() << "[PCG::SearchHelper::graph_cost] Sink node cost ["
-                          << sink.node.to_string() << "]: "
-                          << "forward(" << metrics.forward_time << ") "
-                          << "backward(" << metrics.backward_time << ") "
-                          << "sync(" << metrics.sync_time << ") "
-                          << "memory(" << op_total_mem_mb << " MB)";
+                          << sink.node.to_string() << "]: " << "forward("
+                          << metrics.forward_time << ") " << "backward("
+                          << metrics.backward_time << ") " << "sync("
+                          << metrics.sync_time << ") " << "memory("
+                          << op_total_mem_mb << " MB)";
     this->add_sink_node_costs<T>(sink, metrics, &result);
   }
 
@@ -2976,10 +2974,10 @@ void FFModel::deserialize_graph_optimal_view(
         node = ArgTopK::deserialize(*this, dez, inputs, num_inputs);
         break;
       }
-      case OP_BEAM_TOPK: {
-        node = BeamTopK::deserialize(*this, dez, inputs, num_inputs);
-        break;
-      }
+        //   case OP_BEAM_TOPK: {
+        //     node = BeamTopK::deserialize(*this, dez, inputs, num_inputs);
+        //     break;
+        //   }
       case OP_SAMPLING: {
         node = Sampling::deserialize(*this, dez, inputs, num_inputs);
         break;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index b5dbe5cf8..5fe664297 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -30,7 +30,7 @@
 #include "flexflow/ops/attention.h"
 #include "flexflow/ops/batch_matmul.h"
 #include "flexflow/ops/batch_norm.h"
-#include "flexflow/ops/beam_topk.h"
+// #include "flexflow/ops/beam_topk.h"
 #include "flexflow/ops/cache.h"
 #include "flexflow/ops/cast.h"
 #include "flexflow/ops/concat.h"
@@ -3246,11 +3246,11 @@ Op *FFModel::create_operator_from_layer(
       operators.push_back(op);
       return op;
     }
-    case OP_BEAM_TOPK: {
-      Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs);
-      operators.push_back(op);
-      return op;
-    }
+    // case OP_BEAM_TOPK: {
+    //   Op *op = BeamTopK::create_operator_from_layer(*this, layer, inputs);
+    //   operators.push_back(op);
+    //   return op;
+    // }
     case OP_SAMPLING: {
       Op *op = Sampling::create_operator_from_layer(*this, layer, inputs);
       operators.push_back(op);
@@ -6077,37 +6077,41 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     }
   }
   // BeamTopk task
-  {
-    TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK Init");
-    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<OpMeta *, BeamTopK::init_task>(
-          registrar, "BeamTopK Init Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<OpMeta *, BeamTopK::init_task>(registrar);
-    }
-  }
-  {
-    TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK Inference");
-    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<SsmInferenceResult,
-                                        BeamTopK::inference_task>(
-          registrar, "BeamTopK Inference Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime
-          ->register_task_variant<SsmInferenceResult, BeamTopK::inference_task>(
-              registrar);
-    }
-  }
+  //   {
+  //     TaskVariantRegistrar registrar(BEAM_TOPK_INIT_TASK_ID, "BeamTopK
+  //     Init");
+  //     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+  //     registrar.set_leaf();
+  //     if (pre_register) {
+  //       Runtime::preregister_task_variant<OpMeta *, BeamTopK::init_task>(
+  //           registrar, "BeamTopK Init Task");
+  //     } else {
+  //       if (enable_control_replication) {
+  //         registrar.global_registration = false;
+  //       }
+  //       runtime->register_task_variant<OpMeta *,
+  //       BeamTopK::init_task>(registrar);
+  //     }
+  //   }
+  //   {
+  //     TaskVariantRegistrar registrar(BEAM_TOPK_INF_TASK_ID, "BeamTopK
+  //     Inference");
+  //     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+  //     registrar.set_leaf();
+  //     if (pre_register) {
+  //       Runtime::preregister_task_variant<SsmInferenceResult,
+  //                                         BeamTopK::inference_task>(
+  //           registrar, "BeamTopK Inference Task");
+  //     } else {
+  //       if (enable_control_replication) {
+  //         registrar.global_registration = false;
+  //       }
+  //       runtime
+  //           ->register_task_variant<SsmInferenceResult,
+  //           BeamTopK::inference_task>(
+  //               registrar);
+  //     }
+  //   }
   // Sampling task
   {
     TaskVariantRegistrar registrar(SAMPLING_INIT_TASK_ID, "Sampling Init");
diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc
index 6b2d223f5..442d09254 100644
--- a/src/runtime/operator_params.cc
+++ b/src/runtime/operator_params.cc
@@ -7,7 +7,7 @@
 #include "flexflow/ops/attention.h"
 #include "flexflow/ops/batch_matmul.h"
 #include "flexflow/ops/batch_norm.h"
-#include "flexflow/ops/beam_topk.h"
+// #include "flexflow/ops/beam_topk.h"
 #include "flexflow/ops/cache.h"
 #include "flexflow/ops/cast.h"
 #include "flexflow/ops/concat.h"
@@ -141,8 +141,8 @@ tl::optional<OperatorParameters> get_op_parameters(Op const *op) {
       return ((ResidualRMSNorm *)op)->get_params();
     case OP_ARG_TOPK:
       return ((ArgTopK *)op)->get_params();
-    case OP_BEAM_TOPK:
-      return ((BeamTopK *)op)->get_params();
+    // case OP_BEAM_TOPK:
+    //   return ((BeamTopK *)op)->get_params();
     case OP_SAMPLING:
       return ((Sampling *)op)->get_params();
     case OP_ARGMAX:

From 361fea98be7f844cb287034a2ab7e009027561c9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 15:37:07 -0400
Subject: [PATCH 160/667] Remove beam topk.

---
 include/flexflow/model.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 0f1d08ced..79e721f52 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -340,7 +340,7 @@ class ArgTopK;
 class Transpose;
 class RMSNorm;
 class ResidualRMSNorm;
-class BeamTopK;
+// class BeamTopK;
 class SpecIncMultiHeadSelfAttention;
 class Sampling;
 class ArgMax;

From eeb799c45cf3931f924828c1677be4cccc663ff3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 17:19:01 -0400
Subject: [PATCH 161/667] Removed TreeSearchBatchConfig.current_depth.

---
 include/flexflow/batch_config.h         | 1 -
 src/runtime/request_manager.cc          | 8 ++++----
 src/runtime/tree_search_batch_config.cc | 2 --
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 08c0fb326..c3569b186 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -193,7 +193,6 @@ class TreeSearchBatchConfig : public BatchConfig {
   inline static int const MAX_TREE_DEPTH = 16;
 
   // how many requests is in speculative phase
-  int current_depth = 0;
   int model_id;
 };
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8fded0056..6c122ba2d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -650,10 +650,12 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
-    // TODO: this should be updated in the update_inference_results() function
-    request.llm_cache_size++;
     bc.num_tokens++;
+
+    // TODO: this should be updated in the update_inference_results() function
+    // request.llm_cache_size++;
   }
+  assert(bc.num_available_requests == num_available_requests);
 
   return bc;
 }
@@ -679,7 +681,6 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   // Assume that only one small model is in use now
   new_bc.model_id = 0;
   new_bc.num_tokens = 0;
-  new_bc.current_depth = 0;
   new_bc.num_available_requests = 0;
   new_bc.prompt_phase = true;
   assert(current_speculation_step == 0);
@@ -744,7 +745,6 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
   new_bc.num_tokens = 0;
-  new_bc.current_depth = current_speculation_step;
   new_bc.num_available_requests = 0;
   new_bc.prompt_phase = false;
 
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index 6f78fb71e..ae05725dc 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -69,8 +69,6 @@ std::ostream &
      << std::endl;
   os << "Max num branch: "
      << TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES << std::endl;
-  os << "Current depth: " << tree_search_batch_config.current_depth
-     << std::endl;
 
   os << "Per-request info:\n";
   for (int i = 0; i < tree_search_batch_config.max_requests_per_batch(); i++) {

From 0ceffe94fe4301e7a9c2856532bc034d10894122 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 17:26:39 -0400
Subject: [PATCH 162/667] Removed some outputs.

---
 src/runtime/tree_verify_batch_config.cc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc
index 7df2d3307..f778ae738 100644
--- a/src/runtime/tree_verify_batch_config.cc
+++ b/src/runtime/tree_verify_batch_config.cc
@@ -53,11 +53,7 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
          << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
       os << "    Number of tokens in batch: "
          << bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-      os << "    GUID: " << bc.requestsInfo[i].request_guid << std::endl;
-      os << "    Max sequence length: "
-         << bc.requestsInfo[i].max_sequence_length << std::endl;
-      os << "    Request completed: " << bc.request_available[i] << std::endl;
-      os << "    Request running: " << bc.request_running[i] << std::endl;
+      os << "    Request available: " << bc.request_available[i] << std::endl;
     }
   }
 

From 40e8a5d1f3c4712235e1019768aa2f46e26e561e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 17:35:16 -0400
Subject: [PATCH 163/667] Updated the implementation of
 update_ssm_inference_results.

---
 src/runtime/request_manager.cc | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6c122ba2d..b929d4650 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -680,8 +680,6 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   TreeSearchBatchConfig new_bc;
   // Assume that only one small model is in use now
   new_bc.model_id = 0;
-  new_bc.num_tokens = 0;
-  new_bc.num_available_requests = 0;
   new_bc.prompt_phase = true;
   assert(current_speculation_step == 0);
 
@@ -691,7 +689,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
       new_bc.request_available[request_index] = false;
       continue;
     }
-    int guid = guid_of_requests[request_index];
+    BatchConfig::RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
@@ -722,7 +720,8 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
       new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
       new_bc.num_tokens++;
     }
-    // 4. Copy the causal mask, it should already been updated
+    // 4. Copy the causal mask, it should already been updated in
+    // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
   }
   if (verbose) {
@@ -993,17 +992,11 @@ bool RequestManager::update_ssm_inference_results(
     }
     append_bitmask(guid);
   }
-  return token_added_to_spec_tree;
 
-  /* Move this to update_inference_results() */
-  // State maintenance
   current_speculation_step++;
-  if (!token_added_to_spec_tree ||
-      current_speculation_step > TreeSearchBatchConfig::MAX_TREE_DEPTH) {
-    // No token is added to the token tree, which indicates that the ssm
-    // inference phase is done. Proceed to the large model verification phase.
-    request_manager_status = LLM_VERIFY;
-  }
+  // Stop conditions
+  return !token_added_to_spec_tree ||
+         current_speculation_step > TreeSearchBatchConfig::MAX_TREE_DEPTH;
 }
 
 /* --------- Bitmask Related Functions --------- */

From c654361b9e14cfe1dd4efb5b5ee682f6e333253d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 17:51:54 -0400
Subject: [PATCH 164/667] Fixed current_speculation_step initialization in
 update_inference_results. It is initialized to 1 because before the first
 update_ssm_inference_results is called, there is a ssm decoding step that
 commits the accepted tokens and decodes the last token at the same time.

---
 src/runtime/request_manager.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b929d4650..74505f558 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -410,6 +410,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             request_manager_status = SSM_SPEC;
             // Reset the prefill_request
             prefill_request = nullptr;
+            current_speculation_step = 1;
           }
         } else {
           assert(false && "Invalid prefill model.");
@@ -436,6 +437,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         if (pending_request_queue.empty()) {
           // No pending request to process, continue the speculation
           request_manager_status = SSM_SPEC;
+          current_speculation_step = 1;
         } else {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
@@ -743,8 +745,6 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   TreeSearchBatchConfig new_bc;
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
-  new_bc.num_tokens = 0;
-  new_bc.num_available_requests = 0;
   new_bc.prompt_phase = false;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
@@ -1033,8 +1033,8 @@ void RequestManager::update_bitmask_prompt(RequestGuid guid,
 void RequestManager::init_bitmask_spec(RequestGuid guid,
                                        int num_committed_tokens) {
   // This method modifies the bitmask in place
-  // This method is called by the first call of update_ssm_verify_results in a
-  // speculative iteration
+  // This method is called by the first call of update_ssm_inference_results in
+  // a speculative iteration
   // CAUTION: You should still call append_bitmask() after this method
   // 1. Clear the causal mask and add a root into it, because the tree is
   // currently empty but we have a root.

From 76e5aeb5d99452513720a223bf0367a7a560e191 Mon Sep 17 00:00:00 2001
From: Linshuhuai <shuhuail@andrew.cmu.edu>
Date: Wed, 1 May 2024 19:09:52 -0400
Subject: [PATCH 165/667] finish update_llm_prefill_results and
 update_llm_decode_results

---
 src/runtime/request_manager.cc | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6c122ba2d..85d76d215 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -505,6 +505,12 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   // request.tokens
   // 4. Otherwise, no need to push
   // 5. Return true if the prefilling is finished
+  prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
+  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) { 
+    prefill_request->tokens.push_back(result.token_ids[prefill_request->num_tokens_in_batch]);
+    return true;
+  }
+  return false;
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
@@ -512,8 +518,21 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   // 1. Iterate over all requests, update the llm_cache_size and push token to
   // request.tokens (find the token index in result by
   // first_token_offset_in_batch)
-  // 2. Check if the prefilling is finished
+  // 2. Check if the decoding is finished
   // 3. If at least one request is completed, return true
+  int completed_request = 0;
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    request.llm_cache_size++;
+    request.tokens.push_back(result.token_ids[request.first_token_offset_in_batch]);
+    if (request.tokens.size() == get_max_sequence_length()) {
+      request.status = Request::COMPLETED;
+      completed_request++;
+    }
+  }
+  return completed_request >= 1;
 }
 
 bool RequestManager::update_ssm_prefill_results(

From d3c5a31d860a8b7a382afc16f26376e6ac3e705e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 19:52:54 -0400
Subject: [PATCH 166/667] Removed some comments.

---
 src/runtime/request_manager.cc | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2766c29bc..75b68e368 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -498,40 +498,30 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 // TO BE REMOVED: END
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
-  // TODO:
-  // The pending request can be found at Request_manager.prefill_request
-  // 1. Update request.llm_cache_size
-  // 2. Check if the prefilling is finished (request.tokens.size() ==
-  // request.llm_cache_size)
-  // 3. If the prefilling is finished, push the last token in result to
-  // request.tokens
-  // 4. Otherwise, no need to push
-  // 5. Return true if the prefilling is finished
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) { 
-    prefill_request->tokens.push_back(result.token_ids[prefill_request->num_tokens_in_batch]);
+  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
+    prefill_request->tokens.push_back(
+        result.token_ids[prefill_request->num_tokens_in_batch]);
     return true;
   }
   return false;
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
-  // TODO:
-  // 1. Iterate over all requests, update the llm_cache_size and push token to
-  // request.tokens (find the token index in result by
-  // first_token_offset_in_batch)
-  // 2. Check if the decoding is finished
-  // 3. If at least one request is completed, return true
   int completed_request = 0;
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     request.llm_cache_size++;
-    request.tokens.push_back(result.token_ids[request.first_token_offset_in_batch]);
+    request.tokens.push_back(
+        result.token_ids[request.first_token_offset_in_batch]);
     if (request.tokens.size() == get_max_sequence_length()) {
       request.status = Request::COMPLETED;
       completed_request++;
+      trigger_request_completion_future(request.guid);
+      guid_of_requests[request_index] = INVALID_GUID;
+      request_available[request_index] = false;
     }
   }
   return completed_request >= 1;

From e55eda9658aea492c87239b7c9efb72a76d9eee9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 1 May 2024 23:50:30 -0400
Subject: [PATCH 167/667] Fixed errors related to fields already removed.

---
 src/runtime/batch_config.cc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 1acf6ae95..def2e7a17 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -114,11 +114,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
          << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
       os << "    Number of tokens in batch: "
          << bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-      os << "    GUID: " << bc.requestsInfo[i].request_guid << std::endl;
-      os << "    Max sequence length: "
-         << bc.requestsInfo[i].max_sequence_length << std::endl;
-      os << "    Request completed: " << bc.request_available[i] << std::endl;
-      os << "    Request running: " << bc.request_running[i] << std::endl;
+      os << "    Request available: " << bc.request_available[i] << std::endl;
     }
   }
 

From fecb6b9f756ff63ff7c0d793e2d5f134c2b8e514 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 00:17:28 -0400
Subject: [PATCH 168/667] 1. Modified   void init_bitmask_spec(RequestGuid
 guid, int num_committed_tokens) to   void init_bitmask_spec(RequestGuid
 guid). 2. Fixed some issue with the usage of current_speculation_step. 3.
 Removed some TODOs. 4. Removed some redundant code due to the initialization
 of BatchConfig. 5. Added init_bitmask_spec into update_ssm_inference_results.

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 44 +++++++++++++-----------------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 665da7a05..d01f680f4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -367,7 +367,7 @@ class RequestManager {
   void init_bitmask_prompt(RequestGuid guid, int prompt_length);
   void append_bitmask(RequestGuid guid);
   void update_bitmask_prompt(RequestGuid guid, int num_committed_tokens);
-  void init_bitmask_spec(RequestGuid guid, int num_committed_tokens);
+  void init_bitmask_spec(RequestGuid guid);
   BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
   // Token tree related
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 75b68e368..a0de50003 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -410,7 +410,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             request_manager_status = SSM_SPEC;
             // Reset the prefill_request
             prefill_request = nullptr;
-            current_speculation_step = 1;
+            current_speculation_step = 0;
           }
         } else {
           assert(false && "Invalid prefill model.");
@@ -437,7 +437,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         if (pending_request_queue.empty()) {
           // No pending request to process, continue the speculation
           request_manager_status = SSM_SPEC;
-          current_speculation_step = 1;
+          current_speculation_step = 0;
         } else {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
@@ -624,8 +624,6 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
 
     bc.num_tokens++;
     prefill_request->num_tokens_in_batch++;
-    // TODO: move the following line to update_inference_results
-    // prefill_request->llm_cache_size++;
   }
 
   return bc;
@@ -662,9 +660,6 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
     bc.num_tokens++;
-
-    // TODO: this should be updated in the update_inference_results() function
-    // request.llm_cache_size++;
   }
   assert(bc.num_available_requests == num_available_requests);
 
@@ -679,7 +674,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     std::cout << "\n############### prepare_first_spec_batch_config "
                  "##############\n";
   }
-  // TODO: Clean up the code, this method does the following:
+  // This method does the following:
   // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do
   // this request by request. The infomation of the committed tokens are
   // stored in Request.ssm_committed_tokens. Put the information of the
@@ -697,7 +692,6 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
-      new_bc.request_available[request_index] = false;
       continue;
     }
     BatchConfig::RequestGuid guid = guid_of_requests[request_index];
@@ -747,19 +741,18 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
 TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
-    std::cout << "\n############### prepare_next_batch_spec ###############\n";
-    std::cout << "Current tree depth: " << current_speculation_step << "\n";
+    std::cout
+        << "\n############### prepare_next_spec_batch_config ###############\n";
+    std::cout << "Current tree depth: " << current_speculation_step + 1 << "\n";
   }
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
-  new_bc.prompt_phase = false;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
-      new_bc.request_available[request_index] = false;
       continue;
     }
     int guid = guid_of_requests[request_index];
@@ -823,7 +816,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     std::cout
         << "\n############### prepare_next_batch_verify ###############\n";
   }
-  // TODO: Clean up the code, this method does the following:
+  // This method does the following:
   // 1. Commit the verified tokens in the last iteration through the
   // TreeVerifyBatchConfig. We can do this request by request.
   // The information of the committed tokens is stored in
@@ -944,8 +937,9 @@ bool RequestManager::update_ssm_inference_results(
     SsmInferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
-  assert(current_speculation_step >= 1 &&
-         "The current speculation step should be no less than 1");
+  assert(current_speculation_step >= 0 &&
+         "The current speculation step should be no less than 0");
+  current_speculation_step++;
 
   int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
@@ -999,10 +993,12 @@ bool RequestManager::update_ssm_inference_results(
         parent_pos++;
       }
     }
+    if (current_speculation_step == 1) {
+      init_bitmask_spec(guid);
+    }
     append_bitmask(guid);
   }
 
-  current_speculation_step++;
   // Stop conditions
   return !token_added_to_spec_tree ||
          current_speculation_step > TreeSearchBatchConfig::MAX_TREE_DEPTH;
@@ -1039,8 +1035,7 @@ void RequestManager::update_bitmask_prompt(RequestGuid guid,
   bitmask.current_layer_size = num_committed_tokens;
 }
 
-void RequestManager::init_bitmask_spec(RequestGuid guid,
-                                       int num_committed_tokens) {
+void RequestManager::init_bitmask_spec(RequestGuid guid) {
   // This method modifies the bitmask in place
   // This method is called by the first call of update_ssm_inference_results in
   // a speculative iteration
@@ -1051,13 +1046,12 @@ void RequestManager::init_bitmask_spec(RequestGuid guid,
   assert(current_speculation_step == 1 &&
          "The current speculation step should be 1");
   Request &request = all_requests[guid];
-  BatchConfig::BitMask &bitmask = request.causal_mask;
-  bitmask.clear_bitmask();
+  request.causal_mask = BatchConfig::BitMask();
   // Set the mask for the root
-  bitmask.bit_mask[0].set_bit(0);
-  bitmask.tree_or_prompt_size = 1;
-  bitmask.non_tree_cache_size += num_committed_tokens;
-  bitmask.current_layer_size = 1;
+  request.causal_mask.bit_mask[0].set_bit(0);
+  request.causal_mask.tree_or_prompt_size = 1;
+  request.causal_mask.non_tree_cache_size = request.tokens.size() - 1;
+  request.causal_mask.current_layer_size = 1;
 }
 
 void RequestManager::append_bitmask(RequestGuid guid) {

From 64288cb479db06a3f97a322f810fcaebeae10d7a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 00:21:33 -0400
Subject: [PATCH 169/667] Removed some TODOs.

---
 src/runtime/request_manager.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a0de50003..66a2c3635 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1098,10 +1098,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
 BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // This method creates a new bitmask for LLM verification model's bitmask,
   // it does not modify the small model's bitmask This method is called by
-  // prepare_verify_batch_config()
-  // TODO: implement this function
-  // 1. Create the bitmask based on the pruned request token tree
-  // 2. Maintain all other fields
+  // prepare_verify_batch_config().
 
   Request &request = all_requests[guid];
   TokenTree &token_tree = request.speculative_token_trees[0];

From e7e58994fb15a208e4b4fac7c2e3705421d7591e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 00:30:50 -0400
Subject: [PATCH 170/667] Fixed errors related to fields in
 TreeSearchBatchConfig that are removed.

---
 src/runtime/tree_search_batch_config.cc | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index ae05725dc..612f4f390 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -84,17 +84,8 @@ std::ostream &
       os << "    Number of tokens in batch: "
          << tree_search_batch_config.requestsInfo[i].num_tokens_in_batch
          << std::endl;
-      os << "    GUID: "
-         << tree_search_batch_config.requestsInfo[i].request_guid << std::endl;
-      os << "    Max sequence length: "
-         << tree_search_batch_config.requestsInfo[i].max_sequence_length
-         << std::endl;
-      os << "    Request completed: "
+      os << "    Request available: "
          << tree_search_batch_config.request_available[i] << std::endl;
-      os << "    Request running: "
-         << tree_search_batch_config.request_running[i] << std::endl;
-      os << "    Tree Search Specific: " << std::endl;
-      os << "        Number of tokens in the current batch: " os << std::endl;
     }
   }
 

From 0b4506cd50d6a0bfa57aeec61ddf2efcecce4a79 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 00:32:15 -0400
Subject: [PATCH 171/667] Removed unused parameter max_sequence_length of
 register_new_request.

---
 include/flexflow/request_manager.h |  6 ++----
 src/runtime/request_manager.cc     | 10 ++++------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d01f680f4..0b6bc202a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -220,10 +220,8 @@ class RequestManager {
   void serve_spec_infer(FFModel *model);
   void serve_decoding(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
-  RequestGuid register_new_request(std::string const &prompt,
-                                   int max_sequence_length);
-  RequestGuid register_new_request(std::vector<TokenId> const &prompt,
-                                   int max_sequence_length);
+  RequestGuid register_new_request(std::string const &prompt);
+  RequestGuid register_new_request(std::vector<TokenId> const &prompt);
   // Methods to start and terminate request manager's background task
   void start_background_server(FFModel *model);
   bool is_background_server_terminated();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 66a2c3635..739c0d82b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -177,8 +177,7 @@ size_t RequestManager::get_num_ssms() {
 }
 
 RequestManager::RequestGuid
-    RequestManager::register_new_request(std::vector<TokenId> const &prompt,
-                                         int max_sequence_length) {
+    RequestManager::register_new_request(std::vector<TokenId> const &prompt) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
 
   // Add a new request
@@ -233,8 +232,7 @@ RequestManager::RequestGuid
 }
 
 RequestManager::RequestGuid
-    RequestManager::register_new_request(std::string const &prompt,
-                                         int max_sequence_length) {
+    RequestManager::register_new_request(std::string const &prompt) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   // Add a new request
   Request request;
@@ -1238,13 +1236,13 @@ void RequestManager::get_verify_results_greedy(
   }
 }
 
+// TODO: the max_seq_length is not used in the current implementation
 std::vector<GenerationResult>
     FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
   RequestManager *rm = RequestManager::get_request_manager();
   std::vector<RequestManager::RequestGuid> guids;
   for (int i = 0; i < prompts.size(); i++) {
-    RequestManager::RequestGuid guid =
-        rm->register_new_request(prompts.at(i), max_seq_length);
+    RequestManager::RequestGuid guid = rm->register_new_request(prompts.at(i));
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
     }

From 8d5951aa96a5bb377250be34170c11586d244a94 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 2 May 2024 05:31:11 -0400
Subject: [PATCH 172/667] chore: fix kernel cuh

---
 .../ops/kernels/inc_multihead_self_attention_utils.cuh        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index d1e0e050b..aa86f7710 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -482,13 +482,13 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head,
   int max_query_length = 0;
   int max_total_length = 0;
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (bc->request_completed[i]) {
+    if (!bc->request_available[i]) {
       continue;
     }
     max_query_length =
         max(max_query_length, bc->requestsInfo[i].num_tokens_in_batch);
     max_total_length = max(max_total_length,
-                           bc->requestsInfo[i].first_token_depth_in_request +
+                           bc->requestsInfo[i].first_token_index_in_request +
                                bc->requestsInfo[i].num_tokens_in_batch);
   }
 

From 3d449a2e7904574fe73df1439e6d68b61be790c2 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 2 May 2024 06:08:32 -0400
Subject: [PATCH 173/667] fix: kernel compile errors

---
 include/flexflow/batch_config.h               |  1 -
 .../inc_multihead_self_attention_utils.cuh    |  3 ++
 src/ops/spec_inc_multihead_self_attention.cu  | 51 ++++++++++---------
 src/ops/tree_inc_multihead_self_attention.cu  | 28 +++++-----
 4 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index c3569b186..efb57d50c 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -109,7 +109,6 @@ class BatchConfig {
         std::fill(std::begin(bits), std::end(bits), 0);
       }
 
-    private:
       uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 64];
     };
 
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index aa86f7710..5804023ba 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -520,5 +520,8 @@ struct threads_per_value_t {
   static int const value = Dh * sizeof(T) / 16;
 };
 
+#define test_bit(bit_mask, idx, pos)                                           \
+  (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
+
 } // namespace FlexFlow
 #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 5c047de79..e636d629b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -85,7 +85,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 
   // request_idx = re
 
-  BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
+  // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
+  BatchConfig::BitMask* bitmask = &causalMask[requext_idx_in_batch];
 
   int const first_step = 0;
 
@@ -93,11 +94,11 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   //     request_infos[requext_idx_in_batch].first_token_depth_in_request +
   //     request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
-  //   int const totalCacheSize = bitmask.non_tree_cache_size +
-  //                              bitmask.tree_or_prompt_size +
-  //                              bitmask.prompt_size - 1;
+  //   int const totalCacheSize = bitmask->non_tree_cache_size +
+  //                              bitmask->tree_or_prompt_size +
+  //                              bitmask->prompt_size - 1;
   int const totalCacheSize =
-      bitmask.non_tree_cache_size + bitmask.tree_or_prompt_size;
+      bitmask->non_tree_cache_size + bitmask->tree_or_prompt_size;
 
   int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
@@ -146,10 +147,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
           ii * THREADS_PER_KEY * K_VEC_SIZE);
     }
 
-    // int const query_token = bitmask.prompt_size + bitmask.tree_or_prompt_size
+    // int const query_token = bitmask->prompt_size + bitmask->tree_or_prompt_size
     // -
     //                         1 - tree_branch_num + qi;
-    int const query_token = bitmask.tree_or_prompt_size - tree_branch_num + qi;
+    int const query_token = bitmask->tree_or_prompt_size - tree_branch_num + qi;
 
     __syncthreads();
     for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
@@ -170,9 +171,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
       if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) {
         // todo add alobi here
         // bool const mask = ti_circ >= totalCacheSize;
-        bool const mask = (ti >= bitmask.non_tree_cache_size &&
-                           (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                              (1 << query_token))));
+        bool const mask = (ti >= bitmask->non_tree_cache_size &&
+                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
+                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                          //   (1 << query_token))));
 
         // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) {
         //   printf("spec inc attn qkqkqk  request id %d,  %.10f, %d\n",
@@ -222,9 +224,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     float exp_sum = 0.f;
     for (int ti = first_step + tidx; ti < totalCacheSize;
          ti += THREADS_PER_BLOCK) {
-      bool const mask = (ti >= bitmask.non_tree_cache_size &&
-                         (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                            (1 << query_token))));
+      bool const mask = (ti >= bitmask->non_tree_cache_size &&
+                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
+                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                          //   (1 << query_token))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -269,9 +272,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
         V_vec v = *reinterpret_cast<V_vec const *>(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
 
-        bool const mask = (ti >= bitmask.non_tree_cache_size &&
-                           (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                              (1 << query_token))));
+        bool const mask = (ti >= bitmask->non_tree_cache_size &&
+                            !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
+                            // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                            //   (1 << query_token))));
         float logit = mask ? 0.0f : qk_smem[ti - first_step];
         out = FlexFlow::fma(logit, cast_to_float(v), out);
       }
@@ -346,18 +350,19 @@ __global__ void spec_inc_store_kv_cache(
     int const request_token_offset =
         requestInfo[req_id].first_token_offset_in_batch;
 
-    BatchConfig::BitMask bitmask = causalMask[req_id];
+    // BatchConfig::BitMask bitmask = causalMask[req_id];
+    BatchConfig::BitMask* bitmask = &causalMask[req_id];
 
     // if prompt token -> token id
     // if tree token:
 
-    // int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size +
-    //                       bitmask.tree_or_prompt_size - 1 -
-    //                       bitmask.current_layer_size + token_idx -
+    // int const cache_idx = bitmask->prompt_size + bitmask->non_tree_cache_size +
+    //                       bitmask->tree_or_prompt_size - 1 -
+    //                       bitmask->current_layer_size + token_idx -
     //                       request_token_offset;
     int const cache_idx =
-        bitmask.non_tree_cache_size + bitmask.tree_or_prompt_size -
-        bitmask.current_layer_size + token_idx - request_token_offset;
+        bitmask->non_tree_cache_size + bitmask->tree_or_prompt_size -
+        bitmask->current_layer_size + token_idx - request_token_offset;
 
     kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
                offset] = kVal;
@@ -516,7 +521,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
     // int total_tokens = bc->token_last_available_idx[i] + 1;
 
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int total_tokens = bc->requestsInfo[i].first_token_depth_in_request +
+    int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
                        bc->requestsInfo[i].num_tokens_in_batch;
 
     if (num_new_tokens <= 0) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index ab36c15bc..3ab39ed88 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -97,7 +97,8 @@ __global__ void compute_attention_kernel_fused_kernel(
   int const qlength =
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
-  BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
+  // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
+  BatchConfig::BitMask* bitmask = &causalMask[requext_idx_in_batch];
 
   int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
@@ -170,9 +171,10 @@ __global__ void compute_attention_kernel_fused_kernel(
       if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
         bool const mask =
             prompt_phase ? (qi + q_start < ti)
-                         : (ti >= bitmask.non_tree_cache_size &&
-                            (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                               (1 << qi))));
+                         : (ti >= bitmask->non_tree_cache_size &&
+                            !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
+                            // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                            //    (1 << qi))));
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -185,7 +187,7 @@ __global__ void compute_attention_kernel_fused_kernel(
         //          qk,
         //          q_vecs[ki_o][0].x,
         //          k[0].x,
-        //          bitmask.non_tree_cache_size);
+        //          bitmask->non_tree_cache_size);
         // }
         qk_smem[ti - first_step] = mask ? 0.0f : qk;
       }
@@ -228,9 +230,10 @@ __global__ void compute_attention_kernel_fused_kernel(
     for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
       bool const mask =
           prompt_phase ? (q_start + qi < ti)
-                       : (ti >= bitmask.non_tree_cache_size &&
-                          (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                             (1 << qi))));
+                       : (ti >= bitmask->non_tree_cache_size &&
+                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
+                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                          //    (1 << qi))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -279,9 +282,10 @@ __global__ void compute_attention_kernel_fused_kernel(
           bool const mask =
               prompt_phase
                   ? (q_start + qi < ti)
-                  : (ti >= bitmask.non_tree_cache_size &&
-                     (!(bitmask.mask[ti - bitmask.non_tree_cache_size] &
-                        (1 << qi))));
+                  : (ti >= bitmask->non_tree_cache_size &&
+                      !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
+                      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+                      //   (1 << qi))));
           float logit = mask ? 0.0f : qk_smem[ti - first_step];
           out = FlexFlow::fma(logit, cast_to_float(v), out);
         }
@@ -1073,7 +1077,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             reinterpret_cast<char *>(handler.batch_config_metadata) +
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
-            sizeof(BatchConfig::request_available)) +
+            sizeof(BatchConfig::request_available) +
             sizeof(BatchConfig::causalMask));
   }
 

From 0d2a2ad89f8059e7e8121653010d92eda0a627c1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 10:25:05 -0400
Subject: [PATCH 174/667] Fixed a index-related bug.

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 739c0d82b..18d5cf4d8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -499,7 +499,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     prefill_request->tokens.push_back(
-        result.token_ids[prefill_request->num_tokens_in_batch]);
+        result.token_ids[prefill_request->num_tokens_in_batch - 1]);
     return true;
   }
   return false;

From 19f8e6f4316bf66a80cec5d7c7936324a51f0096 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 10:29:02 -0400
Subject: [PATCH 175/667] Removed unused functions.

---
 include/flexflow/request_manager.h |  1 -
 src/runtime/request_manager.cc     | 61 ------------------------------
 2 files changed, 62 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 0b6bc202a..e7d35cb06 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -216,7 +216,6 @@ class RequestManager {
 
   FFModel *get_ssm_model(int model_id);
 
-  void serve_incr_decoding(FFModel *model);
   void serve_spec_infer(FFModel *model);
   void serve_decoding(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 18d5cf4d8..2792dd4fb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1300,7 +1300,6 @@ void RequestManager::background_serving_task(
   }
   if (rm->get_num_ssms() == 0) {
     // No SSMs: perform incremental decoding
-    // rm->serve_incr_decoding(llm);
     rm->serve_decoding(llm);
   } else {
     // Registered SSMs: perform speculative inference
@@ -1360,66 +1359,6 @@ void RequestManager::serve_decoding(FFModel *llm) {
   }
 }
 
-// TO BE REMOVED: START
-// void RequestManager::serve_incr_decoding(FFModel *llm) {
-//   Context ctx = llm->config.lg_ctx;
-//   Runtime *runtime = llm->config.lg_hlr;
-//   // Compile the llm
-//   InferenceManager *im = InferenceManager::get_inference_manager();
-//   im->compile_model_and_allocate_buffer(llm);
-//   assert(im->model_weights_loaders.find(llm) !=
-//          im->model_weights_loaders.end());
-//   // Load model weights
-//   im->model_weights_loaders[llm]->load_weights(llm);
-//   // init operators
-//   im->init_operators_inference(llm);
-//   // Legion futures for inc_decoding and spec_infer
-//   BatchConfigFuture last_bcf;
-//   InferenceResultFuture last_irf;
-//   {
-//     // Initialize futures for incr decoding
-//     BatchConfig bc;
-//     InferenceResult ir;
-//     last_bcf = Future::from_value<BatchConfig>(bc);
-//     last_irf = Future::from_value<InferenceResult>(ir);
-//   }
-
-//   std::queue<std::pair<BatchConfigFuture, InferenceResultFuture>>
-//       batch_pipeline;
-//   { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); }
-
-//   while (!is_background_server_terminated()) {
-
-//     if (batch_pipeline.size() >= 4) {
-//       // Block here to avoid launching too many batches
-//       auto const &batch = batch_pipeline.front();
-//       batch.second.get_void_result();
-//     }
-//     // deque finished batches
-//     while (batch_pipeline.size() > 1) {
-//       auto const &batch = batch_pipeline.front();
-//       if (batch.second.is_ready()) {
-//         batch_pipeline.pop();
-//       } else {
-//         break;
-//       }
-//     }
-//     runtime->begin_trace(ctx, 12346 /*trace_id*/);
-//     auto const &next_batch = batch_pipeline.back();
-//     BatchConfigFuture bcf =
-//         prepare_next_batch(next_batch.first, next_batch.second, ctx,
-//         runtime);
-//     FutureMap fm = im->inference(llm, 0, bcf);
-//     assert(fm.get_future_map_domain().get_volume() == 1);
-//     InferenceResultFuture irf = fm.get_future(0);
-//     batch_pipeline.push(std::make_pair(bcf, irf));
-//     last_bcf = bcf;
-//     last_irf = irf;
-//     runtime->end_trace(ctx, 12346 /*trace_id*/);
-//   }
-// }
-// TO BE REMOVED: END
-
 /*static*/
 void RequestManager::serve_spec_infer(FFModel *llm) {
   Context ctx = llm->config.lg_ctx;

From 1e7af0448a812d332edfc197b9218f2da278ddcd Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 11:42:08 -0400
Subject: [PATCH 176/667] Removed obselete code.

---
 src/mapper/mapper.cc | 19 ----------
 src/runtime/model.cc | 83 --------------------------------------------
 2 files changed, 102 deletions(-)

diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index 4fa76f125..b83907f4d 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -507,25 +507,6 @@ std::string humanReadableSize(size_t size, bool mb = false) {
   return std::string(buffer);
 }
 
-std::string humanReadableSize(size_t size, bool mb = false) {
-  assert(size >= 0);
-  char const *units[] = {"B", "KiB", "MiB", "GiB", "TiB"};
-  int i = 0;
-  double finalSize = size;
-  if (mb) {
-    finalSize /= 1024 * 1024;
-    i = 2;
-  } else {
-    while (finalSize >= 1024 && i < 4) {
-      finalSize /= 1024;
-      i++;
-    }
-  }
-  char buffer[256];
-  snprintf(buffer, sizeof(buffer), "%.2lf %s", finalSize, units[i]);
-  return std::string(buffer);
-}
-
 void FFMapper::map_task(MapperContext const ctx,
                         Task const &task,
                         MapTaskInput const &input,
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 5fe664297..7b3ed3468 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4473,89 +4473,6 @@ void register_flexflow_internal_tasks(Runtime *runtime,
               registrar);
     }
   }
-  // RequestManager prepare_next_batch
-  {
-    TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID,
-                                   "RequestManager Prepare Next Batch");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<
-          BatchConfig,
-          RequestManager::prepare_next_batch_task>(
-          registrar, "RequestManager Prepare Next Batch Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<BatchConfig,
-                                     RequestManager::prepare_next_batch_task>(
-          registrar);
-    }
-  }
-  // RequestManager prepare_next_batch_beam
-  {
-    TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
-                                   "RequestManager Prepare Next Batch (Beam)");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<
-          TreeSearchBatchConfig,
-          RequestManager::prepare_next_batch_beam_task>(
-          registrar, "RequestManager Prepare Next Batch (Beam) Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime
-          ->register_task_variant<TreeSearchBatchConfig,
-                                  RequestManager::prepare_next_batch_beam_task>(
-              registrar);
-    }
-  }
-  // RequestManager prepare_next_batch_init
-  {
-    TaskVariantRegistrar registrar(
-        RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
-        "RequestManager Prepare Next Batch (Init Beam)");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<
-          TreeSearchBatchConfig,
-          RequestManager::prepare_first_spec_batch_config_task>(
-          registrar, "RequestManager Prepare Next Batch (Init Beam) Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<
-          TreeSearchBatchConfig,
-          RequestManager::prepare_first_spec_batch_config_task>(registrar);
-    }
-  }
-  // RequestManager prepare_next_batch_verify
-  {
-    TaskVariantRegistrar registrar(
-        RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
-        "RequestManager Prepare Next Batch (Verify)");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    registrar.set_leaf();
-    if (pre_register) {
-      Runtime::preregister_task_variant<
-          TreeVerifyBatchConfig,
-          RequestManager::prepare_verify_batch_config_task>(
-          registrar, "RequestManager Prepare Next Batch (Verify) Task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<
-          TreeVerifyBatchConfig,
-          RequestManager::prepare_verify_batch_config_task>(registrar);
-    }
-  }
   // RequestManager background serving task
   {
     TaskVariantRegistrar registrar(RM_BACKGROUND_SERVING_TASK_ID,

From af49c99df5325dcdec5a82e13518df0ecb8904d5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 11:43:15 -0400
Subject: [PATCH 177/667] Fixed a bug in update_inference_results.

---
 src/runtime/request_manager.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2792dd4fb..bdce1c644 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -388,6 +388,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
+  SsmInferenceResult const *ssm_result_ptr;
   switch (request_manager_status) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
@@ -444,9 +445,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       }
       break;
     case SSM_SPEC:
-      SsmInferenceResult const &ssm_result =
-          dynamic_cast<SsmInferenceResult const &>(result);
-      if (update_ssm_inference_results(ssm_result)) {
+      ssm_result_ptr = dynamic_cast<SsmInferenceResult const *>(&result);
+      if (update_ssm_inference_results(*ssm_result_ptr)) {
         // Stop condition for the speculation phase has been reached
         request_manager_status = LLM_VERIFY;
       }

From d0652c19dc294890d226537716546ec19af6c5f9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 13:34:30 -0400
Subject: [PATCH 178/667] Removed max_beam_width and max_beam_depth from model
 configurations.

---
 inference/models/falcon.h               |  8 +++-----
 inference/models/llama.cc               |  4 +---
 inference/models/llama.h                | 10 +++-------
 inference/models/mpt.h                  |  5 ++---
 inference/models/opt.cc                 |  3 +--
 inference/models/opt.h                  |  8 +++-----
 inference/models/starcoder.cc           |  4 ++--
 inference/models/starcoder.h            |  5 ++---
 src/runtime/tree_search_batch_config.cc |  2 +-
 src/runtime/tree_verify_batch_config.cc |  2 +-
 10 files changed, 19 insertions(+), 32 deletions(-)

diff --git a/inference/models/falcon.h b/inference/models/falcon.h
index ccbe6ae79..bfbf288be 100644
--- a/inference/models/falcon.h
+++ b/inference/models/falcon.h
@@ -61,8 +61,7 @@ class FALCON {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
+      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
@@ -79,15 +78,14 @@ class FALCON {
 
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
-      std::cout << "\tmax_beam_width: " << max_beam_width << std::endl;
-      std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl;
+      std::cout << "\tk_of_arg_topk: " << k_of_arg_topk << std::endl;
     }
 
     bool bias, multi_query, parallel_attn;
     int hidden_size, n_head, n_head_kv, n_layer, vocab_size;
     float layer_norm_epsilon;
     // int max_seq_len, max_num_tokens;
-    int max_beam_width, max_beam_depth;
+    int k_of_arg_topk;
   };
 
   static void create_falcon_model(FFModel &ff,
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 8e86d73b5..365722578 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -249,9 +249,7 @@ void LLAMA::create_llama_model(FFModel &ff,
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(dense, -1);
-    // output = ff.beam_top_k(softmax, llama_config.max_beam_width, false);
-    // output = ff.argmax(softmax, /*beam_Search*/ true);
-    output = ff.arg_top_k(softmax, llama_config.max_beam_width, false, true);
+    output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, true);
     // output = ff.top_k(softmax, )
   } else {
     // Tensor softmax = ff.softmax(dense, -1);
diff --git a/inference/models/llama.h b/inference/models/llama.h
index 1a1481d38..2de105217 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -49,10 +49,7 @@ class LLAMA {
                   << std::endl;
         assert(false);
       }
-      // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
-      // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-      max_beam_depth = TreeSearchBatchConfig::MAX_TREE_DEPTH;
+      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
@@ -67,12 +64,11 @@ class LLAMA {
 
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
-      std::cout << "\tmax_beam_width: " << max_beam_width << std::endl;
-      std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl;
+      std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl;
     }
 
     // int max_seq_len, max_num_tokens;
-    int max_beam_width, max_beam_depth;
+    int k_of_arg_topk;
     int num_hidden_layers, vocab_size, num_attention_heads, hidden_size,
         intermediate_size;
     float rms_norm_eps;
diff --git a/inference/models/mpt.h b/inference/models/mpt.h
index 7cfec2687..00cef5c6e 100644
--- a/inference/models/mpt.h
+++ b/inference/models/mpt.h
@@ -48,8 +48,7 @@ class MPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
+      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
@@ -61,7 +60,7 @@ class MPT {
     }
 
     // int max_seq_len, max_num_tokens;
-    int max_beam_width, max_beam_depth;
+    int k_of_arg_topk;
     int hidden_size, n_heads, n_layers, vocab_size;
   };
 
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 0623a941c..c29b53d10 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -248,8 +248,7 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
-    // output = ff.beam_top_k(softmax, opt_config.max_beam_width, false);
-    output = ff.argmax(softmax, /*beam_Search*/ true);
+    output = ff.arg_top_k(softmax, opt_config.k_of_arg_topk, false, true);
   } else {
     // output = ff.arg_top_k(lm_head, /*k=*/1, false);
     output = ff.argmax(lm_head, /*beam_Search*/ false);
diff --git a/inference/models/opt.h b/inference/models/opt.h
index 14a1f087d..8756db460 100644
--- a/inference/models/opt.h
+++ b/inference/models/opt.h
@@ -56,8 +56,7 @@ class OPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
+      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
@@ -81,12 +80,11 @@ class OPT {
 
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
-      std::cout << "\tmax_beam_width: " << max_beam_width << std::endl;
-      std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl;
+      std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl;
     }
 
     // int max_seq_len, max_num_tokens;
-    int max_beam_width, max_beam_depth;
+    int k_of_arg_topk;
     bool do_layer_norm_before, enable_bias, layer_norm_elementwise_affine;
     float dropout;
     int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads,
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 5295d92cb..0eaf8731a 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -207,8 +207,8 @@ void STARCODER::create_starcoder_model(
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
-    // output = ff.beam_top_k(softmax, startcoder_config.max_beam_width, false);
-    output = ff.argmax(softmax, /*beam_Search*/ true);
+    output =
+        ff.arg_top_k(softmax, startcoder_config.k_of_arg_topk, false, true);
   } else {
     // Tensor softmax = ff.softmax(dense, -1);
     if (generationConfig.do_sample) {
diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h
index f19db4a1f..1c593f00a 100644
--- a/inference/models/starcoder.h
+++ b/inference/models/starcoder.h
@@ -53,14 +53,13 @@ class STARCODER {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      max_beam_width = TreeSearchBatchConfig::MAX_BEAM_WIDTH;
-      max_beam_depth = TreeSearchBatchConfig::MAX_BEAM_DEPTH;
+      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {}
 
     // int max_seq_len, max_num_tokens;
-    int max_beam_width, max_beam_depth;
+    int k_of_arg_topk;
     int num_hidden_layers, vocab_size, num_attention_heads, hidden_size,
         intermediate_size, max_position_embeddings;
     float layer_norm_epsilon, dropout_p;
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
index 612f4f390..fcc1d3a0c 100644
--- a/src/runtime/tree_search_batch_config.cc
+++ b/src/runtime/tree_search_batch_config.cc
@@ -24,7 +24,7 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_tree_bc("TreeSearchBatchConfig");
+LegionRuntime::Logger::Category log_tree_search_bc("TreeSearchBatchConfig");
 
 TreeSearchBatchConfig::TreeSearchBatchConfig() : BatchConfig() {}
 
diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc
index f778ae738..4b5fbcb63 100644
--- a/src/runtime/tree_verify_batch_config.cc
+++ b/src/runtime/tree_verify_batch_config.cc
@@ -21,7 +21,7 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig");
+LegionRuntime::Logger::Category log_tree_verify_bc("TreeVerifyBatchConfig");
 
 TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {}
 

From aa5de31453b3caa11091e0accb3d6ce830a9a1bf Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 2 May 2024 14:04:03 -0400
Subject: [PATCH 179/667] Removed things related to beam_top_k.

---
 include/flexflow/flexflow_c.h      | 140 ++++++++++++++---------------
 include/flexflow/model.h           |  10 +--
 inference/spec_infer/spec_infer.cc |   7 --
 src/c/flexflow_c.cc                |  20 ++---
 4 files changed, 85 insertions(+), 92 deletions(-)

diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 0b74b7fce..aec5b0d57 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -133,71 +133,71 @@ flexflow_tensor_t flexflow_model_get_label_tensor(flexflow_model_t handle);
 void flexflow_model_zero_gradients(flexflow_model_t handle);
 
 flexflow_tensor_t flexflow_model_add_exp(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
+                                         flexflow_tensor_t const x,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_sin(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
+                                         flexflow_tensor_t const x,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_cos(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
+                                         flexflow_tensor_t const x,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_add(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
-                                         const flexflow_tensor_t y,
+                                         flexflow_tensor_t const x,
+                                         flexflow_tensor_t const y,
                                          bool inplace_a,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_subtract(flexflow_model_t handle,
-                                              const flexflow_tensor_t x,
-                                              const flexflow_tensor_t y,
+                                              flexflow_tensor_t const x,
+                                              flexflow_tensor_t const y,
                                               bool inplace_a,
                                               char const *name);
 
 flexflow_tensor_t flexflow_model_add_multiply(flexflow_model_t handle,
-                                              const flexflow_tensor_t x,
-                                              const flexflow_tensor_t y,
+                                              flexflow_tensor_t const x,
+                                              flexflow_tensor_t const y,
                                               bool inplace_a,
                                               char const *name);
 
 flexflow_tensor_t flexflow_model_add_divide(flexflow_model_t handle,
-                                            const flexflow_tensor_t x,
-                                            const flexflow_tensor_t y,
+                                            flexflow_tensor_t const x,
+                                            flexflow_tensor_t const y,
                                             bool inplace_a,
                                             char const *name);
 
 flexflow_tensor_t flexflow_model_add_max(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
-                                         const flexflow_tensor_t y,
+                                         flexflow_tensor_t const x,
+                                         flexflow_tensor_t const y,
                                          bool inplace_a,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_min(flexflow_model_t handle,
-                                         const flexflow_tensor_t x,
-                                         const flexflow_tensor_t y,
+                                         flexflow_tensor_t const x,
+                                         flexflow_tensor_t const y,
                                          bool inplace_a,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_reduce_sum(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
+                                                flexflow_tensor_t const input_,
                                                 int *axes,
                                                 int n,
                                                 bool keepdims,
                                                 char const *name);
 
 flexflow_tensor_t flexflow_model_add_rsqrt(flexflow_model_t handle_,
-                                           const flexflow_tensor_t input_,
+                                           flexflow_tensor_t const input_,
                                            char const *name);
 
 flexflow_tensor_t flexflow_model_add_pow(flexflow_model_t handle_,
-                                         const flexflow_tensor_t input_,
+                                         flexflow_tensor_t const input_,
                                          float const exponent,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_,
-                                          const flexflow_tensor_t input_,
+                                          flexflow_tensor_t const input_,
                                           int *dims,
                                           int n,
                                           bool keepdims,
@@ -205,7 +205,7 @@ flexflow_tensor_t flexflow_model_add_mean(flexflow_model_t handle_,
 
 flexflow_tensor_t
     flexflow_model_add_conv2d(flexflow_model_t handle,
-                              const flexflow_tensor_t input,
+                              flexflow_tensor_t const input,
                               int out_channels,
                               int kernel_h,
                               int kernel_w,
@@ -223,7 +223,7 @@ flexflow_tensor_t
 
 flexflow_tensor_t
     flexflow_model_add_embedding(flexflow_model_t handle,
-                                 const flexflow_tensor_t input,
+                                 flexflow_tensor_t const input,
                                  int num_entries,
                                  int out_dim,
                                  enum AggrMode aggr,
@@ -246,12 +246,12 @@ flexflow_tensor_t
                               char const *name);
 
 flexflow_tensor_t flexflow_model_add_batch_norm(flexflow_model_t handle,
-                                                const flexflow_tensor_t input,
+                                                flexflow_tensor_t const input,
                                                 bool relu,
                                                 char const *name);
 
 flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle,
-                                                const flexflow_tensor_t input,
+                                                flexflow_tensor_t const input,
                                                 int n,
                                                 int *axes,
                                                 bool elementwise_affine,
@@ -261,9 +261,9 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle,
 
 flexflow_tensor_t *
     flexflow_model_add_residual_layer_norm(flexflow_model_t handle,
-                                           const flexflow_tensor_t input,
-                                           const flexflow_tensor_t residual1,
-                                           const flexflow_tensor_t residual2,
+                                           flexflow_tensor_t const input,
+                                           flexflow_tensor_t const residual1,
+                                           flexflow_tensor_t const residual2,
                                            bool use_two_residuals,
                                            int n,
                                            int *axes,
@@ -274,8 +274,8 @@ flexflow_tensor_t *
 
 flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
     flexflow_model_t handle,
-    const flexflow_tensor_t input,
-    const flexflow_tensor_t residual,
+    flexflow_tensor_t const input,
+    flexflow_tensor_t const residual,
     int n,
     int *axes,
     bool elementwise_affine,
@@ -285,20 +285,20 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
 
 flexflow_tensor_t
     flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle,
-                                          const flexflow_tensor_t input1,
-                                          const flexflow_tensor_t input2,
+                                          flexflow_tensor_t const input1,
+                                          flexflow_tensor_t const input2,
                                           char const *name);
 
 flexflow_tensor_t
     flexflow_model_add_batch_matmul(flexflow_model_t handle,
-                                    const flexflow_tensor_t a,
-                                    const flexflow_tensor_t b,
+                                    flexflow_tensor_t const a,
+                                    flexflow_tensor_t const b,
                                     int a_seq_length_dim /* -1 */,
                                     int b_seq_length_dim /* -1 */);
 
 flexflow_tensor_t flexflow_model_add_dense(
     flexflow_model_t handle,
-    const flexflow_tensor_t input,
+    flexflow_tensor_t const input,
     int out_dim,
     enum ActiMode activation /* AC_MODE_NONE */,
     bool use_bias /* true */,
@@ -329,96 +329,96 @@ flexflow_tensor_t flexflow_model_add_flat(flexflow_model_t handle,
                                           char const *name);
 
 flexflow_tensor_t flexflow_model_add_gather(flexflow_model_t handle,
-                                            const flexflow_tensor_t input,
-                                            const flexflow_tensor_t index,
+                                            flexflow_tensor_t const input,
+                                            flexflow_tensor_t const index,
                                             int dim,
                                             char const *name);
 
 flexflow_tensor_t flexflow_model_add_softmax(flexflow_model_t handle,
-                                             const flexflow_tensor_t input,
+                                             flexflow_tensor_t const input,
                                              int dim,
                                              char const *name);
 
 flexflow_tensor_t flexflow_model_add_transpose(flexflow_model_t handle,
-                                               const flexflow_tensor_t input,
+                                               flexflow_tensor_t const input,
                                                int n,
                                                int *perm,
                                                char const *name);
 
 flexflow_tensor_t flexflow_model_add_reshape(flexflow_model_t handle,
-                                             const flexflow_tensor_t input,
+                                             flexflow_tensor_t const input,
                                              int n,
                                              int *shape,
                                              char const *name);
 
 flexflow_tensor_t flexflow_model_add_reverse(flexflow_model_t handle,
-                                             const flexflow_tensor_t input,
+                                             flexflow_tensor_t const input,
                                              int axis,
                                              char const *name);
 
 flexflow_tensor_t flexflow_model_add_relu(flexflow_model_t handle,
-                                          const flexflow_tensor_t input,
+                                          flexflow_tensor_t const input,
                                           bool inplace,
                                           char const *name);
 
 flexflow_tensor_t
     flexflow_model_add_scalar_multiply(flexflow_model_t handle,
-                                       const flexflow_tensor_t input,
+                                       flexflow_tensor_t const input,
                                        float const scalar,
                                        bool inplace,
                                        char const *name);
 
 flexflow_tensor_t flexflow_model_add_scalar_add(flexflow_model_t handle,
-                                                const flexflow_tensor_t input,
+                                                flexflow_tensor_t const input,
                                                 float const scalar,
                                                 bool inplace,
                                                 char const *name);
 
 flexflow_tensor_t flexflow_model_add_scalar_sub(flexflow_model_t handle,
-                                                const flexflow_tensor_t input,
+                                                flexflow_tensor_t const input,
                                                 float const scalar,
                                                 bool inplace,
                                                 char const *name);
 
 flexflow_tensor_t
     flexflow_model_add_scalar_truediv(flexflow_model_t handle,
-                                      const flexflow_tensor_t input,
+                                      flexflow_tensor_t const input,
                                       float const scalar,
                                       bool inplace,
                                       char const *name);
 
 flexflow_tensor_t flexflow_model_add_gelu(flexflow_model_t handle,
-                                          const flexflow_tensor_t input,
+                                          flexflow_tensor_t const input,
                                           char const *name);
 
 flexflow_tensor_t flexflow_model_add_identity(flexflow_model_t handle,
-                                              const flexflow_tensor_t input,
+                                              flexflow_tensor_t const input,
                                               char const *name);
 
 flexflow_tensor_t flexflow_model_add_sigmoid(flexflow_model_t handle,
-                                             const flexflow_tensor_t input,
+                                             flexflow_tensor_t const input,
                                              char const *name);
 
 flexflow_tensor_t flexflow_model_add_tanh(flexflow_model_t handle,
-                                          const flexflow_tensor_t input,
+                                          flexflow_tensor_t const input,
                                           char const *name);
 
 flexflow_tensor_t flexflow_model_add_elu(flexflow_model_t handle,
-                                         const flexflow_tensor_t input,
+                                         flexflow_tensor_t const input,
                                          bool inplace,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_dropout(flexflow_model_t handle,
-                                             const flexflow_tensor_t input,
+                                             flexflow_tensor_t const input,
                                              float rate,
                                              unsigned long long seed,
                                              char const *name);
 
 flexflow_tensor_t flexflow_model_add_multihead_attention(
     flexflow_model_t handle,
-    const flexflow_tensor_t query,
-    const flexflow_tensor_t key,
-    const flexflow_tensor_t value,
+    flexflow_tensor_t const query,
+    flexflow_tensor_t const key,
+    flexflow_tensor_t const value,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -432,7 +432,7 @@ flexflow_tensor_t flexflow_model_add_multihead_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -452,7 +452,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -472,7 +472,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -492,7 +492,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
 
 flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -513,7 +513,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -534,7 +534,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
 
 flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
     flexflow_model_t handle_,
-    const flexflow_tensor_t input_,
+    flexflow_tensor_t const input_,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -554,39 +554,39 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
     char const *name);
 
 flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
-                                              const flexflow_tensor_t input_,
+                                              flexflow_tensor_t const input_,
                                               float eps,
                                               int dim,
                                               char const *name);
 
 flexflow_tensor_t *
     flexflow_model_add_residual_rms_norm(flexflow_model_t handle_,
-                                         const flexflow_tensor_t input1_,
-                                         const flexflow_tensor_t input2_,
+                                         flexflow_tensor_t const input1_,
+                                         flexflow_tensor_t const input2_,
                                          float eps,
                                          int dim,
                                          char const *name);
 
 flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
-                                               const flexflow_tensor_t input_,
+                                               flexflow_tensor_t const input_,
                                                int k,
                                                bool sorted,
                                                bool speculative_decoding,
                                                char const *name);
 
-flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
-                                                const flexflow_tensor_t input_,
-                                                int max_beam_size,
-                                                bool sorted,
-                                                char const *name);
+// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
+//                                                 const flexflow_tensor_t
+//                                                 input_, int max_beam_size,
+//                                                 bool sorted,
+//                                                 char const *name);
 
 flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
-                                              const flexflow_tensor_t input_,
+                                              flexflow_tensor_t const input_,
                                               float top_p,
                                               char const *name);
 
 flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
-                                            const flexflow_tensor_t input_,
+                                            flexflow_tensor_t const input_,
                                             bool beam_search,
                                             char const *name);
 
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 79e721f52..d455e4f5b 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -601,11 +601,11 @@ class FFModel {
                          int dim,
                          DataType data_type = DT_NONE,
                          char const *name = NULL);
-  // Add a beam search top k layer
-  Tensor beam_top_k(Tensor const input,
-                    int max_beam_size,
-                    bool sorted,
-                    char const *name = NULL);
+  //   // Add a beam search top k layer
+  //   Tensor beam_top_k(Tensor const input,
+  //                     int max_beam_size,
+  //                     bool sorted,
+  //                     char const *name = NULL);
 
   // Add a dense layer
   Tensor dense(Tensor const input,
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index cc5270ee9..809dd8306 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -314,13 +314,6 @@ void FlexFlow::top_level_task(Task const *task,
                          model_metadata.llm_tokenizer_path);
   rm->register_output_filepath(file_paths.output_file_path);
 
-  // first decoding step: 3 results
-  if (expansion_degree != -1) {
-    rm->push_spec_infer_tree_width(1);
-    rm->push_spec_infer_tree_width(1);
-    rm->push_spec_infer_tree_width(expansion_degree);
-  }
-
   // Create LLM model
   FFModel tree_model(ffconfig, ffconfig.cpu_offload);
   if (model_metadata.llm_model_type == ModelType::LLAMA) {
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index d1f9bd753..efa4a186d 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1500,16 +1500,16 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
   return FFCObjectWrapper::wrap(tensor);
 }
 
-flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
-                                                flexflow_tensor_t const input_,
-                                                int max_beam_size,
-                                                bool sorted,
-                                                char const *name) {
-  FFModel *handle = FFCObjectWrapper::unwrap(handle_);
-  Tensor input = FFCObjectWrapper::unwrap(input_);
-  Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name);
-  return FFCObjectWrapper::wrap(tensor);
-}
+// flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
+//                                                 flexflow_tensor_t const
+//                                                 input_, int max_beam_size,
+//                                                 bool sorted,
+//                                                 char const *name) {
+//   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
+//   Tensor input = FFCObjectWrapper::unwrap(input_);
+//   Tensor tensor = handle->beam_top_k(input, max_beam_size, sorted, name);
+//   return FFCObjectWrapper::wrap(tensor);
+// }
 
 flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
                                               flexflow_tensor_t const input_,

From ca7380dd23c8e5fb2df5ae59983b6c38288d37cc Mon Sep 17 00:00:00 2001
From: Zeyu Wang <zeyuwang@andrew.cmu.edu>
Date: Thu, 2 May 2024 23:21:53 -0400
Subject: [PATCH 180/667] fix tiny bug

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 892510f57..3d71a5c8e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -397,7 +397,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         if (request.initial_len ==
             request.llm_cache_size) { // all prompt tokens are prefilled
           request.tokens.push_back(
-              result.token_ids[request.num_tokens_in_batch]);
+              result.token_ids[request.num_tokens_in_batch - 1]);
           request_manager_status = DECODING;
         }
         break;

From 64d86900aebba547d2917087b61d90c8bb96065d Mon Sep 17 00:00:00 2001
From: April Yang <aprilytyang@gmail.com>
Date: Fri, 3 May 2024 06:11:00 +0000
Subject: [PATCH 181/667] implement update_llm_verify_results

---
 src/runtime/request_manager.cc | 52 ++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index bdce1c644..b719cc8b8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -441,6 +441,13 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
           prefill_model = SSM;
+          // Initialize the bitmask for the new requests with their prompt lengths
+          for (auto& request : all_requests) {
+            Request& req = request.second;
+            if (req.status == Request::PENDING) {
+              init_bitmask_prompt(request.first, req.prompt.size());
+            }
+          }
         }
       }
       break;
@@ -915,6 +922,9 @@ bool RequestManager::update_llm_verify_results(
   // sampling the large model, the other is the top-p / top-k logits of the
   // large model, we can first implement the former one. For the latter one,
   // we have to add a CPU based verify function.
+
+  bool is_request_completed = false;
+
   // 1. Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
@@ -922,13 +932,43 @@ bool RequestManager::update_llm_verify_results(
   // stores the commmitted tokens into the corresponding fields in the
   // Request. For the sampling construction of the speculative token tree, we
   // need to implement a CPU based verify function.
-  // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
-  // root token to the requests' speculative token tree. The root token is the
-  // last committed token.
-  // 3. For requests not completed, update their causal mask.
-  // 4. Some requests may be completed after appending the verified tokens. If
-  // there is a request completed, return true.
+
+  // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
+
+  // Iterate over the requests
+  for (auto& request : all_requests) {
+    Request& req = request.second;
+
+    // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
+    // root token to the requests' speculative token tree. The root token is the
+    // last committed token.
+    if (req.status == Request::RUNNING) {
+      // Initialize the token tree for the request
+      init_token_tree(request.first);
+
+      // Add the last committed token as the root of the speculative token tree
+      if (!req.committed_tokens.empty()) {
+        add_root_to_spec_token_tree(request.first, req.committed_tokens.back().token_id);
+      }
+
+      // 3. For requests not completed, update their causal mask.
+      // Update the bitmask for the request based on the number of committed tokens
+      update_bitmask_prompt(request.first, req.committed_tokens.size());
+
+      // 4. Some requests may be completed after appending the verified tokens. 
+      // If there is a request completed, return true. 
+      if (req.is_completed()) {
+        is_request_completed = true;
+      }
+    } else if (req.status == Request::PENDING) {
+      // Initialize the bitmask for the new request with the prompt length
+      init_bitmask_prompt(request.first, req.prompt.size());
+    }
+  }
+
+  return is_request_completed;
+
 }
 
 bool RequestManager::update_ssm_inference_results(

From e42c1bf0e71e65bc55cb97a212f591905ea9bb57 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 15:40:11 -0400
Subject: [PATCH 182/667] Changed TokenTreeNode.joint_prob to
 log_accumulated_prob for a clearer name and better numerical stability.

---
 include/flexflow/request_manager.h | 16 ++++++------
 src/runtime/request_manager.cc     | 40 ++++++++++++++++--------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e7d35cb06..2de3fbc59 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -125,15 +125,15 @@ struct Request {
 class TokenTreeNode {
 public:
   BatchConfig::TokenId id;
-  float joint_prob;
+  float log_accumulated_prob;
   int parent_pos;
   bool pruned = false;
 
-  TokenTreeNode(BatchConfig::TokenId id, float joint_prob, int parent_pos)
-      : id(id), joint_prob(joint_prob), parent_pos(parent_pos) {}
-  bool operator>(TokenTreeNode const &other) const {
-    return joint_prob > other.joint_prob;
-  }
+  TokenTreeNode(BatchConfig::TokenId id,
+                float log_accumulated_prob,
+                int parent_pos)
+      : id(id), log_accumulated_prob(log_accumulated_prob),
+        parent_pos(parent_pos) {}
 };
 
 // A comparator for shared_ptr<TokenTreeNode>
@@ -142,7 +142,7 @@ struct CompareSharedTokenTreeNodePtrRequestGuidPair {
                             BatchConfig::RequestGuid> const &lhs,
                   std::pair<std::shared_ptr<TokenTreeNode>,
                             BatchConfig::RequestGuid> const &rhs) const {
-    return lhs.first->joint_prob > rhs.first->joint_prob;
+    return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
   }
 };
 
@@ -374,7 +374,7 @@ class RequestManager {
   bool add_token_to_spec_token_tree(RequestGuid guid,
                                     BatchConfig::TokenId token_id,
                                     int parent_pos,
-                                    float joint_prob);
+                                    float log_accumulated_prob);
   void prune_last_layer_of_spec_token_tree(RequestGuid guid);
   /* ---------- Spec Decoding Helper Functions ---------- */
 };
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b719cc8b8..47e8a1cdb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -17,6 +17,7 @@
 #include "flexflow/parallel_ops/parallel_op.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
+#include <cmath>
 #include <filesystem>
 #include <future>
 #include <iomanip>
@@ -441,9 +442,10 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
           prefill_model = SSM;
-          // Initialize the bitmask for the new requests with their prompt lengths
-          for (auto& request : all_requests) {
-            Request& req = request.second;
+          // Initialize the bitmask for the new requests with their prompt
+          // lengths
+          for (auto &request : all_requests) {
+            Request &req = request.second;
             if (req.status == Request::PENDING) {
               init_bitmask_prompt(request.first, req.prompt.size());
             }
@@ -937,8 +939,8 @@ bool RequestManager::update_llm_verify_results(
   get_verify_results_greedy(llm_verify_result);
 
   // Iterate over the requests
-  for (auto& request : all_requests) {
-    Request& req = request.second;
+  for (auto &request : all_requests) {
+    Request &req = request.second;
 
     // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
     // root token to the requests' speculative token tree. The root token is the
@@ -949,15 +951,17 @@ bool RequestManager::update_llm_verify_results(
 
       // Add the last committed token as the root of the speculative token tree
       if (!req.committed_tokens.empty()) {
-        add_root_to_spec_token_tree(request.first, req.committed_tokens.back().token_id);
+        add_root_to_spec_token_tree(request.first,
+                                    req.committed_tokens.back().token_id);
       }
 
       // 3. For requests not completed, update their causal mask.
-      // Update the bitmask for the request based on the number of committed tokens
+      // Update the bitmask for the request based on the number of committed
+      // tokens
       update_bitmask_prompt(request.first, req.committed_tokens.size());
 
-      // 4. Some requests may be completed after appending the verified tokens. 
-      // If there is a request completed, return true. 
+      // 4. Some requests may be completed after appending the verified tokens.
+      // If there is a request completed, return true.
       if (req.is_completed()) {
         is_request_completed = true;
       }
@@ -968,7 +972,6 @@ bool RequestManager::update_llm_verify_results(
   }
 
   return is_request_completed;
-
 }
 
 bool RequestManager::update_ssm_inference_results(
@@ -1017,13 +1020,14 @@ bool RequestManager::update_ssm_inference_results(
         } else {
           // Parent token is not pruned
           for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-            float parent_prob = (*parent_it)->joint_prob;
+            float parent_log_prob = (*parent_it)->log_accumulated_prob;
             token_added_to_spec_tree =
                 token_added_to_spec_tree ||
                 add_token_to_spec_token_tree(
                     guid,
                     ssm_inference_result.token_ids[result_index],
-                    ssm_inference_result.probs[result_index] * parent_prob,
+                    log(ssm_inference_result.probs[result_index]) +
+                        parent_log_prob,
                     parent_pos);
             result_index++;
           }
@@ -1539,11 +1543,11 @@ void RequestManager::add_root_to_spec_token_tree(
   // computed yet, and we need the large model to decode the logit of this
   // token to verify its childs (the tokens in the first layer). This method
   // should: construct and add the root token to the empty speculative token
-  // tree, with parent_pos being -1 and joint_prob being 1.0
+  // tree, with parent_pos being -1 and log_accumulated_prob being 0.0
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();
-  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, -1, 1.0);
+  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, -1, 0.0);
   speculative_token_tree.tree_layers[0].push_back(node_ptr);
   speculative_token_tree.tree_size++;
   speculative_token_tree.tree_size_including_pruned++;
@@ -1552,7 +1556,7 @@ void RequestManager::add_root_to_spec_token_tree(
 bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
                                                   BatchConfig::TokenId token_id,
                                                   int parent_pos,
-                                                  float joint_prob) {
+                                                  float log_accumulated_prob) {
   // This method assumes only one small model is used for speculation
   // This method is called by update_ssm_inference_results()
 
@@ -1593,7 +1597,7 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
     // The pool is full, check if the new node has a higher joint probability
     // than the minimum node in the pool.
 
-    if (joint_prob < min_node_ptr->joint_prob) {
+    if (log_accumulated_prob < min_node_ptr->log_accumulated_prob) {
       // Insertion failed
       add_new_node = false;
     } else {
@@ -1643,8 +1647,8 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
 
   if (add_new_node) {
     // Add the new node to the pool and the last layer of the speculation tree
-    auto node_ptr =
-        std::make_shared<TokenTreeNode>(token_id, parent_pos, joint_prob);
+    auto node_ptr = std::make_shared<TokenTreeNode>(
+        token_id, parent_pos, log_accumulated_prob);
     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
     request.speculative_token_trees[0]
         .tree_layers[current_speculation_step]

From d7577ed3f8ea499b9ba4007edb2dc28de5f5515e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 16:10:02 -0400
Subject: [PATCH 183/667] 1. Added function request_complete_clean_up to do the
 clean up jobs after a request is completed. 2. Re-implemented
 load_pending_reqeust_to_batch. 3. Fixed update_llm_verify_results. 4.
 Re-implemented init_bitmask_prompt.

---
 include/flexflow/request_manager.h |   1 +
 src/runtime/request_manager.cc     | 110 +++++++++++++++--------------
 2 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 2de3fbc59..773c232fb 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -334,6 +334,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
   void load_pending_reqeust_to_batch();
+  void request_complete_clean_up(int batch_index);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 47e8a1cdb..48f84ba05 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -370,20 +370,33 @@ BatchConfig
 }
 void RequestManager::load_pending_reqeust_to_batch() {
   assert(!pending_request_queue.empty() && "No pending request to process.");
-  Request &new_request = pending_request_queue.front();
-  all_requests[new_request.guid] = new_request;
-  BatchConfig::RequestGuid guid = new_request.guid;
+  BatchConfig::RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
+
   prefill_request = std::make_shared<Request>(all_requests[guid]);
 
   // Find an empty slot
   int request_index = get_empty_request_index();
   assert(request_index != -1 && "No empty request slot to load the request.");
+  // Load request into batch
   prefill_request->batch_index = request_index;
   guid_of_requests[request_index] = guid;
   request_available[request_index] = true;
   num_available_requests++;
-  request_available[request_index] = true;
+  // Initialize the bitmask for the new request with its prompt length
+  init_bitmask_prompt(guid, prefill_request->tokens.size());
+}
+
+void RequestManager::request_complete_clean_up(int batch_index) {
+  BatchConfig::RequestGuid guid = guid_of_requests[batch_index];
+  Request &request = all_requests[guid];
+
+  guid_of_requests[batch_index] = INVALID_GUID;
+  request_available[batch_index] = false;
+  num_available_requests--;
+  request.status = Request::COMPLETED;
+
+  trigger_request_completion_future(guid);
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
@@ -442,14 +455,6 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
           prefill_model = SSM;
-          // Initialize the bitmask for the new requests with their prompt
-          // lengths
-          for (auto &request : all_requests) {
-            Request &req = request.second;
-            if (req.status == Request::PENDING) {
-              init_bitmask_prompt(request.first, req.prompt.size());
-            }
-          }
         }
       }
       break;
@@ -515,6 +520,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
+  bool request_completed = false;
   int completed_request = 0;
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -524,14 +530,11 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
     if (request.tokens.size() == get_max_sequence_length()) {
-      request.status = Request::COMPLETED;
-      completed_request++;
-      trigger_request_completion_future(request.guid);
-      guid_of_requests[request_index] = INVALID_GUID;
-      request_available[request_index] = false;
+      request_completed = true;
+      request_complete_clean_up(request_index);
     }
   }
-  return completed_request >= 1;
+  return request_completed;
 }
 
 bool RequestManager::update_ssm_prefill_results(
@@ -919,15 +922,14 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
 
 bool RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
-  // TODO: Implement this function
   // We may have two types of InferenceResults, one is the results from
   // sampling the large model, the other is the top-p / top-k logits of the
   // large model, we can first implement the former one. For the latter one,
   // we have to add a CPU based verify function.
 
-  bool is_request_completed = false;
+  bool request_completed = false;
 
-  // 1. Compare the results returned from the LLM and compare them with the
+  // Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
   // token, this is implemented in get_verify_results_greedy(). This function
@@ -939,39 +941,37 @@ bool RequestManager::update_llm_verify_results(
   get_verify_results_greedy(llm_verify_result);
 
   // Iterate over the requests
-  for (auto &request : all_requests) {
-    Request &req = request.second;
-
-    // 2. Call init_token_tree() add_root_token_to_spec_token_tree() to add the
-    // root token to the requests' speculative token tree. The root token is the
-    // last committed token.
-    if (req.status == Request::RUNNING) {
-      // Initialize the token tree for the request
-      init_token_tree(request.first);
-
-      // Add the last committed token as the root of the speculative token tree
-      if (!req.committed_tokens.empty()) {
-        add_root_to_spec_token_tree(request.first,
-                                    req.committed_tokens.back().token_id);
-      }
-
-      // 3. For requests not completed, update their causal mask.
-      // Update the bitmask for the request based on the number of committed
-      // tokens
-      update_bitmask_prompt(request.first, req.committed_tokens.size());
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-      // 4. Some requests may be completed after appending the verified tokens.
-      // If there is a request completed, return true.
-      if (req.is_completed()) {
-        is_request_completed = true;
-      }
-    } else if (req.status == Request::PENDING) {
-      // Initialize the bitmask for the new request with the prompt length
-      init_bitmask_prompt(request.first, req.prompt.size());
+    // Initialize the token tree for the request
+    init_token_tree(guid);
+    assert(!request.committed_tokens.empty() &&
+           "The committed tokens should not be empty.");
+    // Add the last committed token as the root of the speculative token tree
+    add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id);
+
+    // Check if the request is completed. If its completed, clean up the
+    // metainfo stored in the RequestManager. Otherwise, update its bitmask.
+    if (request.tokens.size() >= max_sequence_length) {
+      // Request is completed
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    } else {
+      update_bitmask_prompt(guid, request.committed_tokens.size());
     }
   }
 
-  return is_request_completed;
+  // Some requests may be completed after appending the verified tokens.
+  // If there is a request completed, return true.
+  return request_completed;
 }
 
 bool RequestManager::update_ssm_inference_results(
@@ -1049,7 +1049,7 @@ bool RequestManager::update_ssm_inference_results(
 /* --------- Bitmask Related Functions --------- */
 
 void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
-  // This method is called by update_llm_verify_results when there are new
+  // This method is called by load_pending_reqeust_to_batch when there is a new
   // request to load into the batch
   // 1. Clear the causal mask because our current speculative token tree is
   // empty.
@@ -1057,10 +1057,11 @@ void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
 
+  // TODO: check if we need mask in the ssm prompt kernel
   bitmask.clear_bitmask();
-  bitmask.tree_or_prompt_size = prompt_length;
-  bitmask.current_layer_size = prompt_length;
-  bitmask.non_tree_cache_size = 0;
+  bitmask.tree_or_prompt_size = 0;
+  bitmask.current_layer_size = 0;
+  bitmask.non_tree_cache_size = prompt_length;
 }
 
 void RequestManager::update_bitmask_prompt(RequestGuid guid,
@@ -1073,6 +1074,7 @@ void RequestManager::update_bitmask_prompt(RequestGuid guid,
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
   bitmask.clear_bitmask();
+  // TODO: check if we need mask in the ssm prompt kernel
   bitmask.tree_or_prompt_size = num_committed_tokens;
   bitmask.current_layer_size = num_committed_tokens;
 }

From ee912c384780dfb28bc8a85e462e6157e9ae35e8 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 16:23:48 -0400
Subject: [PATCH 184/667] Fixed init_bitmask_prompt.

---
 src/runtime/request_manager.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 48f84ba05..932c8e892 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1057,11 +1057,10 @@ void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
 
-  // TODO: check if we need mask in the ssm prompt kernel
   bitmask.clear_bitmask();
-  bitmask.tree_or_prompt_size = 0;
-  bitmask.current_layer_size = 0;
-  bitmask.non_tree_cache_size = prompt_length;
+  bitmask.tree_or_prompt_size = prompt_length;
+  bitmask.current_layer_size = prompt_length;
+  bitmask.non_tree_cache_size = 0;
 }
 
 void RequestManager::update_bitmask_prompt(RequestGuid guid,

From 1ad93c4bb8adb1fb2160122c010d50f205a7fd77 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 16:59:09 -0400
Subject: [PATCH 185/667] Modified some comments.

---
 src/runtime/request_manager.cc | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 932c8e892..534f8d6fe 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -409,7 +409,10 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         if (update_llm_prefill_results(result)) {
           // This indicates that the prefilling phase finishes
           request_manager_status = DECODING;
+          // Reset the prefill_request
+          prefill_request = nullptr;
         }
+        // Not completed, continue prefilling
       } else if (decoding_mode == SPECULATIVE_DECODING) {
         if (prefill_model == SSM) {
           if (update_ssm_prefill_results(result)) {
@@ -417,6 +420,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             // We need to start the LLM prefilling
             prefill_model = LLM;
           }
+          // Not completed, continue SSM prefilling
         } else if (prefill_model == LLM) {
           if (update_llm_prefill_results(result)) {
             // This indicates that the prefilling phase finishes
@@ -425,6 +429,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             prefill_request = nullptr;
             current_speculation_step = 0;
           }
+          // Not completed, continue LLM prefilling
         } else {
           assert(false && "Invalid prefill model.");
         }
@@ -512,6 +517,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
+    // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
     return true;
@@ -541,8 +547,7 @@ bool RequestManager::update_ssm_prefill_results(
     InferenceResult const &ssm_prefill_result) {
   // This function is called by update_inference_results when the
   // request_manager_status is PREFILLING and the prefill_model is SSM.
-  // There's no results to update, but we should update some SSM related states
-  // related to SSM.
+  // There's no results to update, but we should update ssm_cache_size.
   prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
   if (prefill_request->ssm_cache_size == prefill_request->tokens.size()) {
     return true;
@@ -1051,13 +1056,12 @@ bool RequestManager::update_ssm_inference_results(
 void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
   // This method is called by load_pending_reqeust_to_batch when there is a new
   // request to load into the batch
-  // 1. Clear the causal mask because our current speculative token tree is
-  // empty.
-  // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
 
+  // Clear because the prompt kernel doesn't use mask
   bitmask.clear_bitmask();
+  // Set the info for the mask which is used to store the KV cache
   bitmask.tree_or_prompt_size = prompt_length;
   bitmask.current_layer_size = prompt_length;
   bitmask.non_tree_cache_size = 0;

From c416a6682a662f6d5317e6b999fd9f608240d298 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 17:02:15 -0400
Subject: [PATCH 186/667] Fixed update_llm_decode_results.

---
 src/runtime/request_manager.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 534f8d6fe..185574b70 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -527,11 +527,15 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   bool request_completed = false;
-  int completed_request = 0;
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
     request.llm_cache_size++;
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
@@ -965,7 +969,7 @@ bool RequestManager::update_llm_verify_results(
 
     // Check if the request is completed. If its completed, clean up the
     // metainfo stored in the RequestManager. Otherwise, update its bitmask.
-    if (request.tokens.size() >= max_sequence_length) {
+    if (request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index);

From 3843ace666a5181d2d04750ae0bb1cfafa7bf445 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 17:10:00 -0400
Subject: [PATCH 187/667] Removed unused field dfs_tree_inputs.

---
 include/flexflow/request_manager.h | 11 ++++-------
 src/runtime/request_manager.cc     |  1 +
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 773c232fb..ba0fd7b8e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -69,8 +69,8 @@ struct Request {
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
 
-  int first_token_offset_in_batch;
-  int num_tokens_in_batch;
+  int first_token_offset_in_batch = 0;
+  int num_tokens_in_batch = 0;
 
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
@@ -304,6 +304,7 @@ class RequestManager {
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
+  // TODO: clear this in the first step of the speculation!
   std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
       std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
@@ -312,11 +313,6 @@ class RequestManager {
   // rm state
   std::mutex rm_state_mutex;
 
-  // TODO: Move this two vector to request struct
-  std::unordered_map<RequestGuid,
-                     std::vector<std::pair<BatchConfig::TokenId, int>>>
-      dfs_tree_inputs;
-
   // Multi-model support
   std::vector<FFModel *> ssm_models;
 
@@ -324,6 +320,7 @@ class RequestManager {
   Legion::Future background_server_handler;
 
   // Performance profiling
+  // TODO: maintain this field
   size_t num_processed_requests;
 
   struct ProfileInfo {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 185574b70..5541493b0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -374,6 +374,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
   pending_request_queue.pop();
 
   prefill_request = std::make_shared<Request>(all_requests[guid]);
+  prefill_request->status = Request::RUNNING;
 
   // Find an empty slot
   int request_index = get_empty_request_index();

From 7a4c762c75eaad7b5ce64e5e25b8d200f8a92f30 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 17:43:14 -0400
Subject: [PATCH 188/667] 1. Fixed get_verify_results_greedy. 2. Added some
 comments.

---
 src/runtime/request_manager.cc | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5541493b0..c40f4b553 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -937,8 +937,6 @@ bool RequestManager::update_llm_verify_results(
   // large model, we can first implement the former one. For the latter one,
   // we have to add a CPU based verify function.
 
-  bool request_completed = false;
-
   // Compare the results returned from the LLM and compare them with the
   // SSM's speculative token tree. For the greedy construction of the
   // speculative token tree, we can simply compare LLM's sample result at each
@@ -950,6 +948,8 @@ bool RequestManager::update_llm_verify_results(
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
+  bool request_completed = false;
+
   // Iterate over the requests
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -960,7 +960,6 @@ bool RequestManager::update_llm_verify_results(
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-
     // Initialize the token tree for the request
     init_token_tree(guid);
     assert(!request.committed_tokens.empty() &&
@@ -1081,8 +1080,9 @@ void RequestManager::update_bitmask_prompt(RequestGuid guid,
   // 2. Maintain all other fields.
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
+  // Clear because the prompt kernel doesn't use mask
   bitmask.clear_bitmask();
-  // TODO: check if we need mask in the ssm prompt kernel
+  // No need to change non_tree_cache_size
   bitmask.tree_or_prompt_size = num_committed_tokens;
   bitmask.current_layer_size = num_committed_tokens;
 }
@@ -1216,7 +1216,9 @@ void RequestManager::get_verify_results_greedy(
         committed_token_index,
         llm_verify_result.token_ids[llm_result_offset]));
     committed_token_index++;
-    // The position of the last accepted token in its tree layer
+
+    // The position of the last accepted token in its tree layer (includeing the
+    // pruned tokens)
     int last_accepted_token_layer_index = 0;
     // The index of the last accepted token in the entire tree (excluding the
     // pruned tokens)
@@ -1234,6 +1236,7 @@ void RequestManager::get_verify_results_greedy(
 
       for (auto const &node_ptr : tree_layer) {
         if (node_ptr->pruned) {
+          current_token_layer_index++;
           continue;
         }
         if ((node_ptr->parent_pos != last_accepted_token_layer_index) ||
@@ -1252,7 +1255,8 @@ void RequestManager::get_verify_results_greedy(
             // The token's parent is accepted, and this token's id equals the
             // llm's sample at its parent's position. We accept this token.
 
-            // from_index: the index of the token in the tree
+            // from_index: the index of the token in the tree (excluding the
+            // pruned tokens)
             // to_index: the committed token index in the request
             request.committed_tokens.push_back(Request::CommittedToken(
                 current_token_index, committed_token_index, node_ptr->id));
@@ -1272,9 +1276,9 @@ void RequestManager::get_verify_results_greedy(
         // However, we have to add the last sampled token as a correction from
         // the LLM
 
-        // from_index: since this token is not in the token tree, neither the
-        // ssm nor the llm have its KV cache, so the from_index should be a
-        // place holder, which is -1
+        // from_index: since this token is not in the token tree, the llm
+        // doesn't have its KV cache, so the from_index should be a place
+        // holder, which is -1
         request.committed_tokens.push_back(Request::CommittedToken(
             -1,
             committed_token_index,

From 828619948a0d802277e03b21e95240e198c6ecca Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 3 May 2024 17:49:48 -0400
Subject: [PATCH 189/667] Use the type name RequestGuid instead of
 xxx::RequestGuid inside the RequestManager.

---
 src/runtime/request_manager.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c40f4b553..ac7ccfc23 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -370,7 +370,7 @@ BatchConfig
 }
 void RequestManager::load_pending_reqeust_to_batch() {
   assert(!pending_request_queue.empty() && "No pending request to process.");
-  BatchConfig::RequestGuid guid = pending_request_queue.front().guid;
+  RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
 
   prefill_request = std::make_shared<Request>(all_requests[guid]);
@@ -389,7 +389,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
 }
 
 void RequestManager::request_complete_clean_up(int batch_index) {
-  BatchConfig::RequestGuid guid = guid_of_requests[batch_index];
+  RequestGuid guid = guid_of_requests[batch_index];
   Request &request = all_requests[guid];
 
   guid_of_requests[batch_index] = INVALID_GUID;
@@ -714,7 +714,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     if (!request_available[request_index]) {
       continue;
     }
-    BatchConfig::RequestGuid guid = guid_of_requests[request_index];
+    RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
     new_bc.request_available[request_index] = true;
@@ -1004,8 +1004,7 @@ bool RequestManager::update_ssm_inference_results(
       // Request in this slot is unavailable
       continue;
     }
-    FlexFlow::RequestManager::RequestGuid guid =
-        guid_of_requests[request_index];
+    RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
 
     TokenTree &token_tree = request.speculative_token_trees[0];

From 0bedf7ecb54638edfbb35cf3331fa9fc23638763 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 11:55:17 -0400
Subject: [PATCH 190/667] Small fix.

---
 src/runtime/request_manager.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ac7ccfc23..dd60b7597 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1006,6 +1006,7 @@ bool RequestManager::update_ssm_inference_results(
     }
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
     TokenTree &token_tree = request.speculative_token_trees[0];
     if (token_tree.tree_layers.size() < current_speculation_step) {
@@ -1577,10 +1578,10 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   assert(current_speculation_step >= 1 &&
          "The current speculation step should be no less than 1");
 
-  // First make sure there are enough layers in the speculation tree
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
 
+  // Make sure there are enough layers in the speculation tree
   if (speculative_token_tree.tree_layers.size() == current_speculation_step) {
     // When adding the first token, we need to add a new layer
     speculative_token_tree.add_layer();

From 60f8cd1155f6cc78ec3fa6d72642b9ff118928bb Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 12:09:46 -0400
Subject: [PATCH 191/667] 1. Added lock to the request queue in
 update_inference_results. 2. Fixed bug in prepare_prefilling_batch.

---
 src/runtime/request_manager.cc | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index dd60b7597..d98194152 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -368,6 +368,7 @@ BatchConfig
   update_inference_results(result);
   return prepare_next_batch();
 }
+
 void RequestManager::load_pending_reqeust_to_batch() {
   assert(!pending_request_queue.empty() && "No pending request to process.");
   RequestGuid guid = pending_request_queue.front().guid;
@@ -403,6 +404,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const lock(rm_state_mutex);
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+
   SsmInferenceResult const *ssm_result_ptr;
   switch (request_manager_status) {
     case PREFILLING:
@@ -561,8 +564,6 @@ bool RequestManager::update_ssm_prefill_results(
 }
 
 BatchConfig RequestManager::prepare_next_batch() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-
   switch (request_manager_status) {
     case PREFILLING:
       return prepare_prefilling_batch();
@@ -588,20 +589,17 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   // which means that there is a request in the prefilling phase.
   // This function load its prefilling tokens, constructing a BatchConfig with
   // only one request.
-
-  BatchConfig bc;
-  bc.prompt_phase = true;
-
   assert(prefill_request != nullptr &&
          "No prefilling request to process in the prefilling phase.");
-  int request_index = prefill_request->batch_index;
 
+  BatchConfig bc;
+  bc.prompt_phase = true;
   std::copy(std::begin(request_available),
             std::end(request_available),
             std::begin(bc.request_available));
-  bc.request_available[request_index] = true;
   bc.num_available_requests = num_available_requests;
 
+  int request_index = prefill_request->batch_index;
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   if (prefill_model == SSM) {
     // Request Info
@@ -643,7 +641,6 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
     bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
 
     bc.num_tokens++;
-    prefill_request->num_tokens_in_batch++;
   }
 
   return bc;

From dff32d69974d99555da7f8522e815b95e3ed6262 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 12:15:16 -0400
Subject: [PATCH 192/667] Modifed code style of prepare_decoding_batch.

---
 src/runtime/request_manager.cc | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d98194152..f6229b07e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -653,32 +653,35 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 
   BatchConfig bc;
   bc.prompt_phase = false;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.num_available_requests = num_available_requests;
 
-  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-    if (!request_available[i]) {
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       request_index++) {
+    if (!request_available[request_index]) {
       continue;
     }
-    bc.request_available[i] = true;
-    bc.num_available_requests++;
-
-    Request &request = all_requests[guid_of_requests[i]];
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
 
     // Per Request Info
-    bc.requestsInfo[i].first_token_index_in_request = request.llm_cache_size;
-    bc.requestsInfo[i].first_token_offset_in_batch = bc.num_tokens;
-    bc.requestsInfo[i].num_tokens_in_batch = 1;
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        request.llm_cache_size;
+    bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens;
+    bc.requestsInfo[request_index].num_tokens_in_batch = 1;
 
     request.first_token_offset_in_batch = bc.num_tokens;
     request.num_tokens_in_batch = 1;
 
     // Per Token Info
-    bc.tokensInfo[bc.num_tokens].request_index = i;
+    bc.tokensInfo[bc.num_tokens].request_index = request_index;
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
     bc.num_tokens++;
   }
-  assert(bc.num_available_requests == num_available_requests);
 
   return bc;
 }

From 1946d2763df9046b2efba9577ef776b5ed9d07e0 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 12:23:18 -0400
Subject: [PATCH 193/667] Modified code style of
 prepare_first_spec_batch_config.

---
 src/runtime/request_manager.cc | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f6229b07e..06c96c68d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -689,25 +689,25 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 
 /***** Request Init Phase *****/
 TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout << "\n############### prepare_first_spec_batch_config "
                  "##############\n";
   }
   // This method does the following:
-  // 1. Commit the verified tokens through TreeSearchBatchConfig. We can do
-  // this request by request. The infomation of the committed tokens are
-  // stored in Request.ssm_committed_tokens. Put the information of the
-  // committed tokens into BatchConfig.TokensInfo.
+  // 1. Commit the verified tokens through TreeSearchBatchConfig. The infomation
+  // of the committed tokens are stored in request.committed_tokens. Put the
+  // information of the committed tokens into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // TreeSearchBatchConfig.
-  // Please refer to the implementation of prepare_next_spec_batch_config()
-  // for more details.
+  assert(current_speculation_step == 0);
+
   TreeSearchBatchConfig new_bc;
   // Assume that only one small model is in use now
-  new_bc.model_id = 0;
   new_bc.prompt_phase = true;
-  assert(current_speculation_step == 0);
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -717,15 +717,14 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    new_bc.request_available[request_index] = true;
-    new_bc.num_available_requests++;
+
     // TODO: check this profiling, what is profiling
     profiling_requests[request.guid].ssm_decoding_steps += 1;
 
     std::vector<Request::CommittedToken> &committed_tokens =
         request.committed_tokens;
 
-    // 2. Maintain requestsInfo
+    // Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
     new_bc.requestsInfo[request_index].first_token_index_in_request =
@@ -733,19 +732,16 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
     new_bc.requestsInfo[request_index].num_tokens_in_batch =
         committed_tokens.size();
 
-    // 3. Store committed tokens to tokensInfo
-    for (int committed_token_index = 0;
-         committed_token_index < committed_tokens.size();
-         committed_token_index++) {
-      Request::CommittedToken &committed_token =
-          committed_tokens.at(committed_token_index);
+    // Store committed tokens to tokensInfo
+    for (auto const &committed_token : committed_tokens) {
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
       new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
           committed_token.to_index;
       new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
       new_bc.num_tokens++;
     }
-    // 4. Copy the causal mask, it should already been updated in
+
+    // Copy the causal mask, it should already been updated in
     // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
   }

From 40c70915d69102f21d26bbc7ae4f937a5a6cce2e Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 12:33:51 -0400
Subject: [PATCH 194/667] 1. Added a call to
 prune_last_layer_of_spec_token_tree in update_ssm_inference_results. 2. Fixed
 a bug in prune_last_layer_of_spec_token_tree. 3. Fixed a but in
 prepare_next_spec_batch_config.

---
 src/runtime/request_manager.cc | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 06c96c68d..62b0f764a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -755,16 +755,20 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
 
 /***** Speculative Decoding Phase *****/
 TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout
         << "\n############### prepare_next_spec_batch_config ###############\n";
     std::cout << "Current tree depth: " << current_speculation_step + 1 << "\n";
   }
+
   // Prepare the next batch for existing requests
   TreeSearchBatchConfig new_bc;
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -774,8 +778,6 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    new_bc.request_available[request_index] = true;
-    new_bc.num_available_requests++;
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
     // TODO: check this profiling
@@ -796,7 +798,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
       // Exclude the current layer from the token tree, because we want the
       // start index
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_size_including_pruned -
+          request.tokens.size() - 1 + token_tree.tree_size_including_pruned -
           current_layer.size();
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
           current_layer.size();
@@ -814,7 +816,8 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
       }
     }
 
-    // Copy the causal mask, it should already been updated
+    // Copy the causal mask, it should already been updated by
+    // update_ssm_inference_results
     new_bc.causalMask[request_index] = request.causal_mask;
   }
 
@@ -1040,6 +1043,9 @@ bool RequestManager::update_ssm_inference_results(
         parent_pos++;
       }
     }
+
+    prune_last_layer_of_spec_token_tree(guid);
+
     if (current_speculation_step == 1) {
       init_bitmask_spec(guid);
     }
@@ -1684,6 +1690,7 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
     if ((*it)->pruned) {
       last_layer.erase(it);
       request.speculative_token_trees[0].tree_size--;
+      request.speculative_token_trees[0].tree_size_including_pruned--;
     }
   }
 }

From df45709a893a8dee5b1099f73279cf6949333f98 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 13:47:30 -0400
Subject: [PATCH 195/667] Removed unused API Bitset.reset_bit. Added
 initialization to TreeVerifyBatchConfig.num_tokens_to_commit.

---
 include/flexflow/batch_config.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index efb57d50c..8b7444998 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -93,12 +93,6 @@ class BatchConfig {
         bits[idx] |= (1ULL << bit);
       }
 
-      void reset_bit(size_t pos) {
-        size_t idx = pos / 64;
-        size_t bit = pos % 64;
-        bits[idx] &= ~(1ULL << bit);
-      }
-
       bool test_bit(size_t pos) const {
         size_t idx = pos / 64;
         size_t bit = pos % 64;
@@ -164,7 +158,7 @@ class TreeVerifyBatchConfig : public BatchConfig {
     int token_depth;   // position of the token in the request's sequence
   };
 
-  int num_tokens_to_commit;
+  int num_tokens_to_commit = 0;
   CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
 };
 

From 927bda2fa7961d58488990d83c35b1e431dd24c0 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 13:49:00 -0400
Subject: [PATCH 196/667] 1. Re-implemented and fixed
 prepare_verify_batch_config. 2. Added code to maintain request.llm_cache_size
 in update_llm_verify_results. 3. Removed cdoe to clear the committed_tokens
 from the last iteration from get_verify_results_greedy.

---
 src/runtime/request_manager.cc | 44 ++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 62b0f764a..10a64c5ca 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -851,10 +851,10 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   // Please refer to the implementation of prepare_next_spec_batch_config()
   // for more details.
   TreeVerifyBatchConfig new_bc;
-  new_bc.num_tokens = 0;
-  new_bc.num_available_requests = 0;
-  new_bc.num_tokens_to_commit = 0;
-  new_bc.prompt_phase = false;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
 
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
@@ -865,18 +865,19 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    new_bc.request_available[request_index] = true;
-    new_bc.num_available_requests++;
+
     // TODO: check this profiling
     profiling_requests[request.guid].llm_decoding_steps += 1;
 
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
-        request.tokens.size();
+        request.tokens.size() - 1; // Exclude the last token
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].num_tokens_in_batch =
+        request.speculative_token_trees[0].tree_size;
 
-    // 2. Put the information of the committed tokens into
+    // Put the information of the committed tokens into
     // TreeVerifyBatchConfig.committed_tokens.
     // Note here, we shouldn't put the last token in request.committed_tokens
     // into new_bc. Because the LLM don't have that token's KV cache.
@@ -896,7 +897,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
       new_bc.num_tokens_to_commit++;
     }
 
-    // 3. Load the tokens on the token tree that are not yet pruned to
+    // Load the tokens on the token tree that are not yet pruned to
     // TreeVerifyBatchConfig.tokensInfo.
     TokenTree &token_tree = request.speculative_token_trees[0];
     int token_tree_index = 0;
@@ -912,12 +913,9 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
         }
       }
     }
+    assert(token_tree_index == token_tree.tree_size - 1);
 
-    // 4. Maintain requestsInfo.num_tokens_in_batch of TreeSearchBatchConfig
-    new_bc.requestsInfo[request_index].num_tokens_in_batch =
-        token_tree_index + 1;
-
-    // 5. Create the causal mask for the large model based on the small model
+    // Create the causal mask for the large model based on the small model
     // causal mask.
     new_bc.causalMask[request_index] = create_llm_bitmask(guid);
   }
@@ -944,6 +942,22 @@ bool RequestManager::update_llm_verify_results(
   // Request. For the sampling construction of the speculative token tree, we
   // need to implement a CPU based verify function.
 
+  // Update llm_cache_size with the last committed_tokens, and clear
+  // committed_tokens
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    request.llm_cache_size +=
+        request.committed_tokens.size() - 1; // Exclude the last token
+    request.committed_tokens.clear();
+  }
+
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
@@ -1207,7 +1221,6 @@ void RequestManager::get_verify_results_greedy(
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    request.committed_tokens.clear();
 
     int committed_token_index = request.tokens.size();
 
@@ -1218,6 +1231,7 @@ void RequestManager::get_verify_results_greedy(
         committed_token_index,
         llm_verify_result.token_ids[llm_result_offset]));
     committed_token_index++;
+    // Don't add it to request.tokens because it has already been added.
 
     // The position of the last accepted token in its tree layer (includeing the
     // pruned tokens)

From 6d8e3aab1561fb609e43d9f9de7795af8e4b6161 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 13:54:32 -0400
Subject: [PATCH 197/667] Added code to maintain request.ssm_cache_size in
 update_ssm_inference_results.

---
 src/runtime/request_manager.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 10a64c5ca..b4e269c7f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1021,6 +1021,10 @@ bool RequestManager::update_ssm_inference_results(
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
+    if (current_speculation_step == 1) {
+      request.ssm_cache_size += request.committed_tokens.size() - 1;
+    }
+
     TokenTree &token_tree = request.speculative_token_trees[0];
     if (token_tree.tree_layers.size() < current_speculation_step) {
       // This means that the parent layer is empty

From 7ff226801942242ffe782bb67d8b9da4be60e04f Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 19:31:08 -0400
Subject: [PATCH 198/667] 1. Added API set_decoding_mode to set the decoding
 mode of the request manager. 2. Added the initialization of some fields in
 the constructor of RequestManager. 3. Fixed the duplicated name of the locks
 in update_inference_results. 4. Added logic to load new prefilling request in
 the first call of update_inference_results.

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 32 ++++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ba0fd7b8e..929f923d3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -207,6 +207,7 @@ class RequestManager {
   int get_max_verify_tokens_per_batch();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
+  void set_decoding_mode(DecodingMode mode);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b4e269c7f..d4f759308 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -47,7 +47,8 @@ std::string LoadBytesFromFile(std::string const &path) {
 RequestManager::RequestManager()
     : background_server_status(INITIALIZED), verbose(false),
       next_available_guid(1000000), num_processed_requests(0),
-      total_request_run_time(0.0f), request_manager_status(PREFILLING) {
+      total_request_run_time(0.0f), request_manager_status(PREFILLING),
+      decoding_mode(INCREMENTAL_DECODING), prefill_model(SSM) {
   // The following config parameters are set
   // during ffmodel.compile()
   // Initialize them to -1 to make sure no one
@@ -57,6 +58,9 @@ RequestManager::RequestManager()
   max_tokens_per_batch = -1;
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
+  std::fill(std::begin(request_available), std::end(request_available), false);
+  std::fill(
+      std::begin(guid_of_requests), std::end(guid_of_requests), INVALID_GUID);
 }
 
 void RequestManager::set_max_requests_per_batch(int max_num_requests) {
@@ -110,6 +114,11 @@ int RequestManager::get_max_sequence_length() {
   return max_sequence_length;
 }
 
+void RequestManager::set_decoding_mode(DecodingMode mode) {
+  assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING);
+  decoding_mode = mode;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -403,8 +412,23 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
-  std::lock_guard<std::mutex> const lock(rm_state_mutex);
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  std::lock_guard<std::mutex> const rm_state_lock(rm_state_mutex);
+  std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
+
+  if (num_available_requests == 0) {
+    // Update nothing
+    if (pending_request_queue.empty()) {
+      // No request to process
+      return;
+    } else {
+      // Load the pending request to the batch
+      load_pending_reqeust_to_batch();
+      request_manager_status = PREFILLING;
+      if (decoding_mode == SPECULATIVE_DECODING) {
+        prefill_model = SSM;
+      }
+    }
+  }
 
   SsmInferenceResult const *ssm_result_ptr;
   switch (request_manager_status) {
@@ -1376,7 +1400,7 @@ void RequestManager::background_serving_task(
       ssm->config.lg_ctx = ctx;
     }
   }
-  if (rm->get_num_ssms() == 0) {
+  if (rm->decoding_mode == INCREMENTAL_DECODING) {
     // No SSMs: perform incremental decoding
     rm->serve_decoding(llm);
   } else {

From 717d0fb9127378b1d81c40b785328c90b18e258d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 19:38:37 -0400
Subject: [PATCH 199/667] Fixed update_inference_results.

---
 src/runtime/request_manager.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d4f759308..cc033f0f5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -417,10 +417,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
   if (num_available_requests == 0) {
     // Update nothing
-    if (pending_request_queue.empty()) {
-      // No request to process
-      return;
-    } else {
+    if (!pending_request_queue.empty()) {
       // Load the pending request to the batch
       load_pending_reqeust_to_batch();
       request_manager_status = PREFILLING;
@@ -428,6 +425,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         prefill_model = SSM;
       }
     }
+    return;
   }
 
   SsmInferenceResult const *ssm_result_ptr;

From b91a3db1538f00d2384529b8909ae96501f6fdc3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 19:39:58 -0400
Subject: [PATCH 200/667] Set the decoding mode of the request manager to
 RequestManager::INCREMENTAL_DECODING in incr_decoding.cc.

---
 inference/incr_decoding/incr_decoding.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index aae7256ff..b9c0a0057 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -136,6 +136,8 @@ void FlexFlow::top_level_task(Task const *task,
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
   int max_sequence_length = 256;
+  RequestManager::DecodingMode decoding_mode =
+      RequestManager::INCREMENTAL_DECODING;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -211,6 +213,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_decoding_mode(decoding_mode);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_id, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);

From 2df801e6c996d65eb8600da5ebb50b5d03ad0424 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 4 May 2024 19:40:42 -0400
Subject: [PATCH 201/667] Fixed bug in kernel.

---
 src/ops/inc_multihead_self_attention.cu | 14 +++++++-------
 src/ops/softmax.cc                      | 17 +++++++++--------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 50d8579e1..1646c8ab0 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1135,13 +1135,13 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
     }
     tokens_previous_requests += num_new_tokens;
   }
-  if (tokens_previous_requests != (num_tokens - bc->num_tokens)) {
-    bc->print();
-    printf("tokens_previous_requests: %i\n", tokens_previous_requests);
-    printf("num_tokens: %i\n", num_tokens);
-    printf("bc->num_tokens: %i\n", bc->num_tokens);
-  }
-  assert(tokens_previous_requests == (num_tokens - bc->num_tokens));
+  //   if (tokens_previous_requests != (num_tokens - bc->num_tokens)) {
+  //     bc->print();
+  //     printf("tokens_previous_requests: %i\n", tokens_previous_requests);
+  //     printf("num_tokens: %i\n", num_tokens);
+  //     printf("bc->num_tokens: %i\n", bc->num_tokens);
+  //   }
+  //   assert(tokens_previous_requests == (num_tokens - bc->num_tokens));
 }
 
 /*static*/
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index 3687fe860..decffaaad 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -92,7 +92,7 @@ SoftmaxParams Softmax::get_params() const {
   return params;
 }
 
-Tensor FFModel::softmax(const Tensor _input,
+Tensor FFModel::softmax(Tensor const _input,
                         int dim,
                         DataType data_type,
                         char const *name) {
@@ -135,7 +135,7 @@ Op *Softmax::create_operator_from_layer(
 
 Softmax::Softmax(FFModel &model,
                  LayerID const &_layer_guid,
-                 const ParallelTensor _input,
+                 ParallelTensor const _input,
                  int _dim,
                  char const *name)
     : Op(model,
@@ -160,7 +160,7 @@ Softmax::Softmax(FFModel &model,
 
 Softmax::Softmax(FFModel &model,
                  SoftmaxParams const &params,
-                 const ParallelTensor input,
+                 ParallelTensor const input,
                  char const *name)
     : Softmax(model, params.layer_guid, input, params.dim, params.name) {}
 
@@ -278,11 +278,12 @@ OpMeta *Softmax::init_task(Task const *task,
   return m;
 }
 
-FutureMap Softmax::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap Softmax::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;

From 8f7ccb047a172596fffa77cf6c3132ac970259af Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 11:23:56 -0400
Subject: [PATCH 202/667] Fixed a bug.

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cc033f0f5..35808106a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -383,7 +383,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
   RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
 
-  prefill_request = std::make_shared<Request>(all_requests[guid]);
+  prefill_request = std::shared_ptr<Request>(&all_requests[guid]);
   prefill_request->status = Request::RUNNING;
 
   // Find an empty slot

From bfe24693ca010880c17649a447ce583c03cad477 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 5 May 2024 15:54:51 +0000
Subject: [PATCH 203/667] fix linker issue

---
 include/flexflow/batch_config.h    | 6 +++---
 include/flexflow/request_manager.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 8b7444998..8b329c6ef 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -56,9 +56,9 @@ class BatchConfig {
   // Maximum possible values for different parameters
   // These maximum values are used for copying BatchConfig
   // across workers
-  static int const MAX_NUM_REQUESTS = 64;
-  static int const MAX_NUM_TOKENS = 1024;
-  static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
+  inline static int const MAX_NUM_REQUESTS = 64;
+  inline static int const MAX_NUM_TOKENS = 1024;
+  inline static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
 
   int num_tokens;
   int num_available_requests;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 929f923d3..916bdc298 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -192,7 +192,7 @@ class RequestManager {
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
 
-  static RequestGuid const INVALID_GUID = 0;
+  inline static RequestGuid const INVALID_GUID = 0;
   RequestManager();
   static RequestManager *get_request_manager();
   size_t get_num_processed_requests();

From c3de16ab29e128b9068e4669f0cd8304f7c6b627 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 13:03:23 -0400
Subject: [PATCH 204/667] Fixed a bug.

---
 include/flexflow/request_manager.h | 2 +-
 src/runtime/request_manager.cc     | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 916bdc298..75020e927 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -291,7 +291,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
-  std::shared_ptr<Request> prefill_request = nullptr;
+  Request *prefill_request = nullptr;
 
   // Added to make the request manager stateful. During the processing of the
   // first small model inference results, the step equals to 1. That is, every
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 35808106a..54d301e15 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -383,7 +383,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
   RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
 
-  prefill_request = std::shared_ptr<Request>(&all_requests[guid]);
+  prefill_request = &all_requests[guid];
   prefill_request->status = Request::RUNNING;
 
   // Find an empty slot
@@ -407,6 +407,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   num_available_requests--;
   request.status = Request::COMPLETED;
 
+  // TODO: remove the request from all_requests?
+
   trigger_request_completion_future(guid);
 }
 

From e1a3b0a43367a2ebbcd11cb50aaea817fcfbbf86 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 13:35:59 -0400
Subject: [PATCH 205/667] Print output when request is finished. Removed unused
 code.

---
 src/runtime/request_manager.cc | 41 +++-------------------------------
 1 file changed, 3 insertions(+), 38 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54d301e15..6e2d70e08 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -407,6 +407,9 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   num_available_requests--;
   request.status = Request::COMPLETED;
 
+  std::string output = this->tokenizer_->Decode(request.tokens);
+  std::cout << "Request " << guid << " completed: " << std::endl
+            << output << std::endl;
   // TODO: remove the request from all_requests?
 
   trigger_request_completion_future(guid);
@@ -504,44 +507,6 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   }
 }
 
-// TO BE REMOVED: START
-// void RequestManager::update_inference_results(InferenceResult const &result)
-// {
-//   // Update the inference results
-//   std::lock_guard<std::mutex> const lock(rm_state_mutex);
-//   for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
-//     if (guid_of_requests[i] == INVALID_GUID) {
-//       continue;
-//     }
-//     Request &request = all_requests[guid_of_requests[i]];
-
-//     switch (request_manager_status) {
-//       case PREFILLING:
-//         if (request.initial_len ==
-//             request.llm_cache_size) { // all prompt tokens are prefilled
-//           request.tokens.push_back(
-//               result.token_ids[request.num_tokens_in_batch]);
-//           request_manager_status = DECODING;
-//         }
-//         break;
-//       case DECODING:
-//         request.tokens.push_back(
-//             result.token_ids[request.first_token_offset_in_batch]);
-//         if (request.tokens.size() ==
-//             request.max_sequence_length) { // request is completed
-//           request.status = Request::COMPLETED;
-//           trigger_request_completion_future(request.guid);
-//           guid_of_requests[i] = INVALID_GUID;
-//           request_manager_status = PREFILLING;
-//         }
-//         break;
-//       default:
-//         assert(false);
-//     }
-//   }
-// }
-// TO BE REMOVED: END
-
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {

From 3ebd32dc375974e81e0b82296ec5f31352133737 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 17:03:01 -0400
Subject: [PATCH 206/667] Fixed a bug.

---
 inference/spec_infer/spec_infer.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 809dd8306..ebce36496 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -279,6 +279,8 @@ void FlexFlow::top_level_task(Task const *task,
   int max_sequence_length = 1024;
   int max_spec_tree_token_num = 23;
   int expansion_degree = 3;
+  RequestManager::DecodingMode decoding_mode =
+      RequestManager::SPECULATIVE_DECODING;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -312,6 +314,7 @@ void FlexFlow::top_level_task(Task const *task,
                          model_metadata.bos_token_id,
                          model_metadata.eos_token_id,
                          model_metadata.llm_tokenizer_path);
+  rm->set_decoding_mode(decoding_mode);
   rm->register_output_filepath(file_paths.output_file_path);
 
   // Create LLM model

From 63c84c983c691285178c5c5e3540bcc45b486e0b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 18:29:52 -0400
Subject: [PATCH 207/667] Fixed a bug.

---
 src/runtime/inference_manager.cc | 4 ++--
 src/runtime/request_manager.cc   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 4fcbaa3e3..53a95595c 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -312,10 +312,10 @@ FutureMap InferenceManager::inference(FFModel *model,
     return inference(model, index, bcf);
   } else if (bc.get_mode() == TREE_SEARCH_MODE) {
     BatchConfig const *bc_ptr = &bc;
-    TreeSearchBatchConfig const *bsbc_ptr =
+    TreeSearchBatchConfig const *tsbc_ptr =
         static_cast<TreeSearchBatchConfig const *>(bc_ptr);
     TreeSearchBatchConfigFuture bcf =
-        Future::from_value<TreeSearchBatchConfig>(*bsbc_ptr);
+        Future::from_value<TreeSearchBatchConfig>(*tsbc_ptr);
     return inference(model, index, bcf);
   } else if (bc.get_mode() == TREE_VERIFY_MODE) {
     BatchConfig const *bc_ptr = &bc;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6e2d70e08..9b92ef46b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1508,7 +1508,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     } else {
       assert(false && "Invalid request manager status");
     }
-    runtime->end_trace(ctx, 12345 /*trace_id*/);
+    runtime->end_trace(ctx, 12346 /*trace_id*/);
   }
 }
 

From 0e0f5ab8365932854c739f4ad90683bfae3413a6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 5 May 2024 21:34:34 -0400
Subject: [PATCH 208/667] 1. Added void set_verbose(bool verbose_); 2. Added
 some verbose outputs. 3. Splitted prepare_prefilling_batch into two:
 prepare_llm_prefilling_batch and prepare_ssm_prefilling_batch. 4. Fixed the
 verbose parameter in incr_decoding and spec_infer.

---
 include/flexflow/request_manager.h       |   4 +-
 inference/incr_decoding/incr_decoding.cc |   1 +
 inference/spec_infer/spec_infer.cc       |   1 +
 src/runtime/request_manager.cc           | 109 +++++++++++++++++------
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 75020e927..8dbfdc7b0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -208,6 +208,7 @@ class RequestManager {
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
   void set_decoding_mode(DecodingMode mode);
+  void set_verbose(bool verbose_);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -336,11 +337,12 @@ class RequestManager {
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
-  BatchConfig prepare_prefilling_batch();
+  BatchConfig prepare_llm_prefilling_batch();
   BatchConfig prepare_decoding_batch();
   /* ---------- Incremental Decoding Helper Functions ---------- */
 
   /* ---------- Spec Decoding Helper Functions ---------- */
+  TreeSearchBatchConfig prepare_ssm_prefilling_batch();
   bool update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool update_ssm_inference_results(
       SsmInferenceResult const &ssm_inference_result);
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index b9c0a0057..6a3667d70 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -214,6 +214,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
+  rm->set_verbose(verbose);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_id, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index ebce36496..06a5b1d36 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -310,6 +310,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_spec_tree_token_num(max_spec_tree_token_num);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_verbose(verbose);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
                          model_metadata.eos_token_id,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9b92ef46b..72f8a4c74 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -119,6 +119,10 @@ void RequestManager::set_decoding_mode(DecodingMode mode) {
   decoding_mode = mode;
 }
 
+void RequestManager::set_verbose(bool verbose_) {
+  verbose = verbose_;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -555,7 +559,20 @@ bool RequestManager::update_ssm_prefill_results(
 BatchConfig RequestManager::prepare_next_batch() {
   switch (request_manager_status) {
     case PREFILLING:
-      return prepare_prefilling_batch();
+      if (decoding_mode == INCREMENTAL_DECODING) {
+        return prepare_llm_prefilling_batch();
+      } else if (decoding_mode == SPECULATIVE_DECODING) {
+        if (prefill_model == SSM) {
+          return prepare_ssm_prefilling_batch();
+        } else if (prefill_model == LLM) {
+          return prepare_llm_prefilling_batch();
+        } else {
+          assert(false && "Invalid prefill model.");
+        }
+      } else {
+        assert(false && "Invalid inference mode.");
+      }
+      break;
     case DECODING:
       return prepare_decoding_batch();
     case SSM_SPEC:
@@ -573,11 +590,15 @@ BatchConfig RequestManager::prepare_next_batch() {
   }
 }
 
-BatchConfig RequestManager::prepare_prefilling_batch() {
+BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   // This function is called when the request_manager_status is PREFILLING,
   // which means that there is a request in the prefilling phase.
   // This function load its prefilling tokens, constructing a BatchConfig with
   // only one request.
+  if (verbose) {
+    std::cout << "\n############### prepare_llm_prefilling_batch "
+                 "##############\n";
+  }
   assert(prefill_request != nullptr &&
          "No prefilling request to process in the prefilling phase.");
 
@@ -589,24 +610,63 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   bc.num_available_requests = num_available_requests;
 
   int request_index = prefill_request->batch_index;
+  // Request Info
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  if (prefill_model == SSM) {
-    // Request Info
-    bc.requestsInfo[request_index].first_token_index_in_request =
-        prefill_request->ssm_cache_size;
-    bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-        BatchConfig::MAX_NUM_TOKENS,
-        (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->llm_cache_size;
+  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+      BatchConfig::MAX_NUM_TOKENS,
+      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
 
-  } else if (prefill_model == LLM) {
-    // Request Info
-    bc.requestsInfo[request_index].first_token_index_in_request =
-        prefill_request->llm_cache_size;
-    bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-        BatchConfig::MAX_NUM_TOKENS,
-        (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
+  prefill_request->first_token_offset_in_batch = 0;
+  prefill_request->num_tokens_in_batch =
+      bc.requestsInfo[request_index].num_tokens_in_batch;
+
+  // Token Info
+  for (int token_idx = 0;
+       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
+       token_idx++) {
+    int abs_idx = prefill_request->llm_cache_size + token_idx;
+    assert(abs_idx < prefill_request->tokens.size());
+
+    bc.tokensInfo[token_idx].request_index = request_index;
+    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+
+    bc.num_tokens++;
   }
 
+  return bc;
+}
+
+TreeSearchBatchConfig RequestManager::prepare_ssm_prefilling_batch() {
+  // This function is called when the request_manager_status is PREFILLING,
+  // which means that there is a request in the prefilling phase.
+  // This function load its prefilling tokens, constructing a BatchConfig with
+  // only one request.
+  if (verbose) {
+    std::cout << "\n############### prepare_ssm_prefilling_batch "
+                 "##############\n";
+  }
+  assert(prefill_request != nullptr &&
+         "No prefilling request to process in the prefilling phase.");
+
+  TreeSearchBatchConfig bc;
+  bc.prompt_phase = true;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.num_available_requests = num_available_requests;
+
+  int request_index = prefill_request->batch_index;
+  // Request Info
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->ssm_cache_size;
+  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+      BatchConfig::MAX_NUM_TOKENS,
+      (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
+
   prefill_request->first_token_offset_in_batch = 0;
   prefill_request->num_tokens_in_batch =
       bc.requestsInfo[request_index].num_tokens_in_batch;
@@ -615,14 +675,7 @@ BatchConfig RequestManager::prepare_prefilling_batch() {
   for (int token_idx = 0;
        token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
        token_idx++) {
-    int abs_idx = -1;
-    if (prefill_model == SSM) {
-      abs_idx = prefill_request->ssm_cache_size + token_idx;
-    } else if (prefill_model == LLM) {
-      abs_idx = prefill_request->llm_cache_size + token_idx;
-    } else {
-      assert(false && "Invalid prefill model.");
-    }
+    int abs_idx = prefill_request->ssm_cache_size + token_idx;
     assert(abs_idx < prefill_request->tokens.size());
 
     bc.tokensInfo[token_idx].request_index = request_index;
@@ -639,6 +692,10 @@ BatchConfig RequestManager::prepare_decoding_batch() {
   // This function is called when the request_manager_status is DECODING. It
   // fills the last token of each request in the current batch to the
   // BatchConfig for the LLM to decode.
+  if (verbose) {
+    std::cout << "\n############### prepare_decoding_batch "
+                 "##############\n";
+  }
 
   BatchConfig bc;
   bc.prompt_phase = false;
@@ -822,7 +879,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout
-        << "\n############### prepare_next_batch_verify ###############\n";
+        << "\n############### prepare_verify_batch_config ###############\n";
   }
   // This method does the following:
   // 1. Commit the verified tokens in the last iteration through the
@@ -910,7 +967,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   }
 
   if (verbose) {
-    std::cout << "prepare_next_batch_verify NEW batchconfig:" << std::endl;
+    std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl;
     new_bc.print();
   }
   return new_bc;

From e63cd9b91af4e02e7f1dc17225891f5269ee9e86 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 7 May 2024 12:38:18 -0400
Subject: [PATCH 209/667] 1. Merged TreeSearchBatchConfig and
 TreeVerifyBatchConfig into BatchConfig. 2. Merged SsmInferenceResult into
 InferenceResult.

---
 include/flexflow/batch_config.h               | 144 +++++++------
 include/flexflow/config.h                     |   9 +-
 include/flexflow/ops/arg_topk.h               |   6 +-
 include/flexflow/ops/argmax.h                 |   2 +-
 .../inc_multihead_self_attention_utils.cuh    |   2 +-
 .../ops/spec_inc_multihead_self_attention.h   |   2 +-
 .../ops/tree_inc_multihead_self_attention.h   |  12 +-
 include/flexflow/request_manager.h            |  14 +-
 inference/models/falcon.h                     |   2 +-
 inference/models/llama.h                      |   2 +-
 inference/models/mpt.h                        |   2 +-
 inference/models/opt.h                        |   2 +-
 inference/models/starcoder.h                  |   2 +-
 src/c/flexflow_c.cc                           |  55 +++--
 src/ops/arg_topk.cc                           |  20 +-
 src/ops/arg_topk.cpp                          |  27 +--
 src/ops/arg_topk.cu                           |  27 +--
 src/ops/argmax.cc                             |  20 +-
 src/ops/fused.cpp                             |  11 +-
 src/ops/fused.cu                              |  11 +-
 src/ops/inc_multihead_self_attention.cpp      |  34 +--
 src/ops/inc_multihead_self_attention.cu       |   4 +-
 src/ops/spec_inc_multihead_self_attention.cc  |   5 +-
 src/ops/spec_inc_multihead_self_attention.cu  |  81 ++++----
 src/ops/tree_inc_multihead_self_attention.cc  |  26 ++-
 src/ops/tree_inc_multihead_self_attention.cpp |  66 +++---
 src/ops/tree_inc_multihead_self_attention.cu  | 107 +++++-----
 src/runtime/batch_config.cc                   | 120 ++++++++---
 src/runtime/inference_manager.cc              |  22 +-
 src/runtime/model.cc                          |  11 +-
 src/runtime/request_manager.cc                | 196 +++++++++++-------
 src/runtime/request_manager.cpp               |  16 +-
 src/runtime/request_manager.cu                |  16 +-
 src/runtime/tree_search_batch_config.cc       | 123 -----------
 src/runtime/tree_verify_batch_config.cc       | 100 ---------
 35 files changed, 591 insertions(+), 708 deletions(-)
 delete mode 100644 src/runtime/tree_search_batch_config.cc
 delete mode 100644 src/runtime/tree_verify_batch_config.cc

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 8b329c6ef..d9a6e716a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -28,19 +28,16 @@
 namespace FlexFlow {
 
 class InferenceResult;
-class SsmInferenceResult;
 
 using BatchConfigFuture = Legion::Future;
 using InferenceResultFuture = Legion::Future;
-using TreeSearchBatchConfigFuture = Legion::Future;
-using TreeVerifyBatchConfigFuture = Legion::Future;
-using SsmInferenceResultFuture = Legion::Future;
 
 class BatchConfig {
 public:
   using RequestGuid = size_t;
   using TokenId = int;
-  BatchConfig();
+  BatchConfig(InferenceMode inference_mode = INC_DECODING_MODE,
+              int model_id = 0);
   int num_active_requests() const;
   int num_active_tokens() const;
   static int max_requests_per_batch();
@@ -53,27 +50,40 @@ class BatchConfig {
   void save_to_file(std::string const &filename) const;
   virtual InferenceMode get_mode() const;
   static BatchConfig const *from_future(BatchConfigFuture const &future);
+
   // Maximum possible values for different parameters
   // These maximum values are used for copying BatchConfig
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
   inline static int const MAX_NUM_TOKENS = 1024;
   inline static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
+  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+  inline static int const MAX_TREE_DEPTH = 16;
+  inline static int const MAX_K_LOGITS = 16;
 
-  int num_tokens;
-  int num_available_requests;
-  bool prompt_phase;
+  int num_tokens = 0;
+  int num_available_requests = 0;
+  bool prompt_phase = false;
+  int num_tokens_to_commit = 0;
+  int model_id;
+  InferenceMode inference_mode;
 
   struct PerRequestInfo {
-    int first_token_index_in_request = 0;
-    int first_token_offset_in_batch = 0;
+    int first_token_index_in_request = -1;
+    int first_token_offset_in_batch = -1;
     int num_tokens_in_batch = 0;
   };
 
   struct PerTokenInfo {
-    TokenId token_id = 0;
-    int abs_index_in_request = 0;
-    int request_index = 0;
+    TokenId token_id = -1;
+    int abs_index_in_request = -1;
+    int request_index = -1;
+  };
+
+  struct CommittedTokensInfo {
+    int token_index = -1;   // the index of the token in the previous batch
+    int request_index = -1; // request index in the batch
+    int token_depth = -1;   // position of the token in the request's sequence
   };
 
   class BitMask {
@@ -139,65 +149,67 @@ class BatchConfig {
   BitMask causalMask[MAX_NUM_REQUESTS];
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
-
+  CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
   bool request_available[MAX_NUM_REQUESTS];
 };
 
-class TreeVerifyBatchConfig : public BatchConfig {
-public:
-  TreeVerifyBatchConfig();
-  ~TreeVerifyBatchConfig();
-  InferenceMode get_mode() const;
-  friend std::ostream &operator<<(std::ostream &os,
-                                  TreeVerifyBatchConfig const &bc);
-  void print() const;
-  void save_to_file(std::string const &filename) const;
-  struct CommittedTokensInfo {
-    int token_index;   // the index of the token in the previous batch
-    int request_index; // request index in the batch
-    int token_depth;   // position of the token in the request's sequence
-  };
-
-  int num_tokens_to_commit = 0;
-  CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
-};
+// class TreeVerifyBatchConfig : public BatchConfig {
+// public:
+//   TreeVerifyBatchConfig();
+//   ~TreeVerifyBatchConfig();
+//   InferenceMode get_mode() const;
+//   friend std::ostream &operator<<(std::ostream &os,
+//                                   TreeVerifyBatchConfig const &bc);
+//   void print() const;
+//   void save_to_file(std::string const &filename) const;
+//   struct CommittedTokensInfo {
+//     int token_index;   // the index of the token in the previous batch
+//     int request_index; // request index in the batch
+//     int token_depth;   // position of the token in the request's sequence
+//   };
+
+//   int num_tokens_to_commit = 0;
+//   CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
+// };
 
 struct InferenceResult {
-  static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
-  BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
-  virtual ~InferenceResult() = default;
-};
-
-class TreeSearchBatchConfig : public BatchConfig {
-public:
-  TreeSearchBatchConfig();
-  TreeSearchBatchConfig(int model_id);
-  TreeSearchBatchConfig(TreeSearchBatchConfig const &other, int model_id);
-  InferenceMode get_mode() const;
-
-  ~TreeSearchBatchConfig();
-
-  friend std::ostream &operator<<(std::ostream &os,
-                                  TreeSearchBatchConfig const &bc);
-  void print() const;
-  void save_to_file(std::string const &filename) const;
-
-  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
-  inline static int const MAX_TREE_DEPTH = 16;
-
-  // how many requests is in speculative phase
-  int model_id;
+  BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS *
+                                 BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+  float probs[BatchConfig::MAX_NUM_TOKENS *
+              BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+  float topk_logits[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS];
 };
 
-class SsmInferenceResult : public InferenceResult {
-public:
-  BatchConfig::TokenId
-      token_ids[MAX_NUM_TOKENS *
-                TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-  float probs[MAX_NUM_TOKENS *
-              TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-  int parent_id[MAX_NUM_TOKENS *
-                TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-};
+// class TreeSearchBatchConfig : public BatchConfig {
+// public:
+//   TreeSearchBatchConfig();
+//   TreeSearchBatchConfig(int model_id);
+//   TreeSearchBatchConfig(TreeSearchBatchConfig const &other, int model_id);
+//   InferenceMode get_mode() const;
+
+//   ~TreeSearchBatchConfig();
+
+//   friend std::ostream &operator<<(std::ostream &os,
+//                                   TreeSearchBatchConfig const &bc);
+//   void print() const;
+//   void save_to_file(std::string const &filename) const;
+
+//   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+//   inline static int const MAX_TREE_DEPTH = 16;
+
+//   // how many requests is in speculative phase
+//   int model_id;
+// };
+
+// class SsmInferenceResult : public InferenceResult {
+// public:
+//   BatchConfig::TokenId
+//       token_ids[MAX_NUM_TOKENS *
+//                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+//   float probs[MAX_NUM_TOKENS *
+//               TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+//   int parent_id[MAX_NUM_TOKENS *
+//                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+// };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 3fea8e3a6..54b6d9d8e 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -78,13 +78,10 @@ struct FFHandler {
   size_t workSpaceSize;
   void *batch_config_metadata;
 
-  // request info + token info + topolopgy mask info
   size_t batch_config_metadata_size =
-      sizeof(BatchConfig::tokensInfo) +
-      sizeof(BatchConfig::requestsInfo) +
-      sizeof(BatchConfig::request_available) +
-      sizeof(BatchConfig::causalMask) +
-      sizeof(TreeVerifyBatchConfig::committed_tokens);
+      sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
+      sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
+      sizeof(BatchConfig::committed_tokens);
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
   DataType quantization_type;
diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index e0f328c59..935aa9ff9 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -64,7 +64,7 @@ class ArgTopK : public Op {
                      std::vector<Legion::PhysicalRegion> const &regions,
                      Legion::Context ctx,
                      Legion::Runtime *runtime);
-  static SsmInferenceResult inference_speculative_task(
+  static InferenceResult inference_speculative_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
       Legion::Context ctx,
@@ -89,14 +89,14 @@ class ArgTopK : public Op {
                              int length,
                              int k,
                              bool sorted,
-                             TreeSearchBatchConfig const *bc,
+                             BatchConfig const *bc,
                              ffStream_t stream);
   static void forward_kernel_wrapper(ArgTopKMeta const *m,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &prob,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     TreeSearchBatchConfig const *bc);
+                                     BatchConfig const *bc);
   Params get_params() const;
 
 public:
diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h
index d0c549136..e58e8ca80 100644
--- a/include/flexflow/ops/argmax.h
+++ b/include/flexflow/ops/argmax.h
@@ -66,7 +66,7 @@ class ArgMax : public Op {
                            std::vector<Legion::PhysicalRegion> const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
-  static SsmInferenceResult
+  static InferenceResult
       inference_task_beam(Legion::Task const *task,
                           std::vector<Legion::PhysicalRegion> const &regions,
                           Legion::Context ctx,
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 5804023ba..546d5e9a9 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -476,7 +476,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head,
                                     int max_sequence_length,
                                     int threads_per_value,
                                     int threads_per_block,
-                                    TreeVerifyBatchConfig const *bc,
+                                    BatchConfig const *bc,
                                     int shared_mem[]) {
 
   int max_query_length = 0;
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index 1e046bb26..f3e5a23ea 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -109,7 +109,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
 
   static void
       inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m,
-                               TreeSearchBatchConfig const *bc,
+                               BatchConfig const *bc,
                                int shard_id,
                                GenericTensorAccessorR const &input,
                                GenericTensorAccessorR const &weight,
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index d160da4a7..45a7a6b56 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -26,7 +26,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
 
   TreeIncMultiHeadSelfAttention(FFModel &model,
                                 LayerID const &layer_guid,
-                                const ParallelTensor _input,
+                                ParallelTensor const _input,
                                 int _embed_dim,
                                 int _num_q_heads,
                                 int _num_kv_heads,
@@ -47,8 +47,8 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                 int _tensor_parallelism_degree,
                                 char const *name);
   TreeIncMultiHeadSelfAttention(FFModel &model,
-                                const ParallelTensor _input,
-                                const ParallelTensor _weight,
+                                ParallelTensor const _input,
+                                ParallelTensor const _weight,
                                 int _embed_dim,
                                 int _num_q_heads,
                                 int _num_kv_heads,
@@ -70,7 +70,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                 char const *name);
   TreeIncMultiHeadSelfAttention(FFModel &model,
                                 TreeIncMultiHeadSelfAttention const &other,
-                                const ParallelTensor input,
+                                ParallelTensor const input,
                                 bool allocate_weights);
   TreeIncMultiHeadSelfAttention(FFModel &model,
                                 Params const &params,
@@ -111,7 +111,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                              CostMetrics &cost_metrics) const override;
 
   static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m,
-                                       TreeVerifyBatchConfig const *bc,
+                                       BatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
                                        GenericTensorAccessorR const &weight,
@@ -146,7 +146,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 public:
   int num_active_tokens;
   Realm::RegionInstance committed_token_reserve_inst;
-  TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos;
+  BatchConfig::CommittedTokensInfo *committed_token_infos;
   BatchConfig::BitMask *causalMask;
 };
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8dbfdc7b0..6d65b0cde 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -148,7 +148,7 @@ struct CompareSharedTokenTreeNodePtrRequestGuidPair {
 
 class TokenTree {
 public:
-  std::vector<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
   // The numebr of tokens in the tree that are not pruned
   int tree_size = 0;
   // The numebr of tokens in the tree including the pruned ones
@@ -342,21 +342,21 @@ class RequestManager {
   /* ---------- Incremental Decoding Helper Functions ---------- */
 
   /* ---------- Spec Decoding Helper Functions ---------- */
-  TreeSearchBatchConfig prepare_ssm_prefilling_batch();
+  BatchConfig prepare_ssm_prefilling_batch();
   bool update_llm_verify_results(InferenceResult const &llm_verify_result);
-  bool update_ssm_inference_results(
-      SsmInferenceResult const &ssm_inference_result);
+  bool
+      update_ssm_inference_results(InferenceResult const &ssm_inference_result);
   bool update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
-  TreeSearchBatchConfig prepare_next_spec_batch_config();
+  BatchConfig prepare_next_spec_batch_config();
   // Prepare the first speculation batch config. This function is called before
   // the first step of the speculation. The difference with
   // prepare_next_batch_config_spec is that we put the info of the committed
   // tokens into the batch config in the first speculation step to commit the KV
   // cache of the small model.
-  TreeSearchBatchConfig prepare_first_spec_batch_config();
-  TreeVerifyBatchConfig prepare_verify_batch_config();
+  BatchConfig prepare_first_spec_batch_config();
+  BatchConfig prepare_verify_batch_config();
 
   // LLM result verification
   void get_verify_results_greedy(InferenceResult const &llm_verify_result);
diff --git a/inference/models/falcon.h b/inference/models/falcon.h
index bfbf288be..e7aa4fecf 100644
--- a/inference/models/falcon.h
+++ b/inference/models/falcon.h
@@ -61,7 +61,7 @@ class FALCON {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
diff --git a/inference/models/llama.h b/inference/models/llama.h
index 2de105217..7a7440982 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -49,7 +49,7 @@ class LLAMA {
                   << std::endl;
         assert(false);
       }
-      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
diff --git a/inference/models/mpt.h b/inference/models/mpt.h
index 00cef5c6e..8a42b0e2d 100644
--- a/inference/models/mpt.h
+++ b/inference/models/mpt.h
@@ -48,7 +48,7 @@ class MPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
diff --git a/inference/models/opt.h b/inference/models/opt.h
index 8756db460..bc142d7d0 100644
--- a/inference/models/opt.h
+++ b/inference/models/opt.h
@@ -56,7 +56,7 @@ class OPT {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {
diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h
index 1c593f00a..e56e0f098 100644
--- a/inference/models/starcoder.h
+++ b/inference/models/starcoder.h
@@ -53,7 +53,7 @@ class STARCODER {
       }
       // max_seq_len = BatchConfig::MAX_SEQ_LENGTH;
       // max_num_tokens = BatchConfig::MAX_NUM_TOKENS;
-      k_of_arg_topk = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      k_of_arg_topk = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     }
 
     void print() const {}
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index efa4a186d..72a9f66d3 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -61,10 +61,9 @@ class FFCObjectWrapper {
   FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *);
   // inference
   FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *);
-  FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t,
-                        TreeVerifyBatchConfig *);
-  FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t,
-                        TreeSearchBatchConfig *);
+  //   FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, BatchConfig
+  //   *); FF_NEW_OPAQUE_WRAPPER(flexflow_beam_search_batch_config_t,
+  //   BatchConfig *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_inference_manager_t, InferenceManager *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *);
   FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *);
@@ -2527,37 +2526,37 @@ void flexflow_batch_config_destroy(flexflow_batch_config_t handle_) {
 // TreeVerifyBatchConfig
 // -----------------------------------------------------------------------
 
-flexflow_tree_verify_batch_config_t
-    flexflow_tree_verify_batch_config_create(void) {
-  TreeVerifyBatchConfig *config = new TreeVerifyBatchConfig();
-  DEBUG_PRINT("[TreeVerifyBatchConfig] new %p", config);
-  return FFCObjectWrapper::wrap(config);
-}
+// flexflow_tree_verify_batch_config_t
+//     flexflow_tree_verify_batch_config_create(void) {
+//   BatchConfig *config = new BatchConfig();
+//   DEBUG_PRINT("[BatchConfig] new %p", config);
+//   return FFCObjectWrapper::wrap(config);
+// }
 
-void flexflow_tree_verify_batch_config_destroy(
-    flexflow_tree_verify_batch_config_t handle_) {
-  TreeVerifyBatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
-  DEBUG_PRINT("[TreeVerifyBatchConfig] delete %p", handle);
-  delete handle;
-}
+// void flexflow_tree_verify_batch_config_destroy(
+//     flexflow_tree_verify_batch_config_t handle_) {
+//   BatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
+//   DEBUG_PRINT("[BatchConfig] delete %p", handle);
+//   delete handle;
+// }
 
 // -----------------------------------------------------------------------
 // BeamSearchBatchConfig
 // -----------------------------------------------------------------------
 
-flexflow_beam_search_batch_config_t
-    flexflow_beam_search_batch_config_create(void) {
-  TreeSearchBatchConfig *config = new TreeSearchBatchConfig();
-  DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config);
-  return FFCObjectWrapper::wrap(config);
-}
+// flexflow_beam_search_batch_config_t
+//     flexflow_beam_search_batch_config_create(void) {
+//   BatchConfig *config = new BatchConfig();
+//   DEBUG_PRINT("[BeamSearchBatchConfig] new %p", config);
+//   return FFCObjectWrapper::wrap(config);
+// }
 
-void flexflow_beam_search_batch_config_destroy(
-    flexflow_beam_search_batch_config_t handle_) {
-  TreeSearchBatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
-  DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle);
-  delete handle;
-}
+// void flexflow_beam_search_batch_config_destroy(
+//     flexflow_beam_search_batch_config_t handle_) {
+//   BatchConfig *handle = FFCObjectWrapper::unwrap(handle_);
+//   DEBUG_PRINT("[BeamSearchBatchConfig] delete %p", handle);
+//   delete handle;
+// }
 
 // -----------------------------------------------------------------------
 // RequestManager
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 212e10d67..706fbbc7a 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -291,11 +291,12 @@ void ArgTopK::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap ArgTopK::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap ArgTopK::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
@@ -404,18 +405,17 @@ InferenceResult
   return ir;
 }
 
-SsmInferenceResult ArgTopK::inference_speculative_task(
+InferenceResult ArgTopK::inference_speculative_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
     Runtime *runtime) {
   assert(regions.size() == 3);
   assert(task->regions.size() == 3);
-  TreeSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
+  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
   if (bc.num_active_tokens() == 0) {
     // Directly return for empty batch config
-    SsmInferenceResult ir;
+    InferenceResult ir;
     return ir;
   }
   ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
@@ -430,7 +430,7 @@ SsmInferenceResult ArgTopK::inference_speculative_task(
   int batch_size = bc.num_active_tokens();
   ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
 
-  SsmInferenceResult ir;
+  InferenceResult ir;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
   download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp
index 3a60e54fe..cc2b8e9ec 100644
--- a/src/ops/arg_topk.cpp
+++ b/src/ops/arg_topk.cpp
@@ -371,16 +371,17 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input,
 
 /*static*/
 template <typename DT>
-void ArgTopK::forward_kernel(ArgTopKMeta const *m,
-                             DT const *input_ptr,
-                             float *output_ptr,
-                             int *indices_ptr,
-                             size_t batch_size,
-                             int length,
-                             int k,
-                             bool sorted,
-                             /* Reserved: BatchConfig Updated */TreeSearchBatchConfig const *bc,
-                             hipStream_t stream) {
+void ArgTopK::forward_kernel(
+    ArgTopKMeta const *m,
+    DT const *input_ptr,
+    float *output_ptr,
+    int *indices_ptr,
+    size_t batch_size,
+    int length,
+    int k,
+    bool sorted,
+    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
+    hipStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
   int num_shards = 0;
@@ -402,13 +403,13 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
 
-    assert(num_shards >= (size_t)TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
     arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
-        TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
         output_ptr,
         indices_ptr,
@@ -436,7 +437,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                                      // float *output_ptr,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     TreeSearchBatchConfig const *bc) {
+                                     BatchConfig const *bc) {
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   // Domain in1_domain = runtime->get_index_space_domain(
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 5403494bd..93444becc 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -371,16 +371,17 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input,
 
 /*static*/
 template <typename DT>
-void ArgTopK::forward_kernel(ArgTopKMeta const *m,
-                             DT const *input_ptr,
-                             float *output_ptr,
-                             int *indices_ptr,
-                             size_t batch_size,
-                             int length,
-                             int k,
-                             bool sorted,
-                             /* Reserved: BatchConfig Updated */TreeSearchBatchConfig const *bc,
-                             cudaStream_t stream) {
+void ArgTopK::forward_kernel(
+    ArgTopKMeta const *m,
+    DT const *input_ptr,
+    float *output_ptr,
+    int *indices_ptr,
+    size_t batch_size,
+    int length,
+    int k,
+    bool sorted,
+    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
+    cudaStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
   int num_shards = 0;
@@ -402,13 +403,13 @@ void ArgTopK::forward_kernel(ArgTopKMeta const *m,
   // all requests share the same number of branches
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
-    assert(num_shards >= (size_t)TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
     arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
-        TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
         output_ptr,
         indices_ptr,
@@ -436,7 +437,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                                      GenericTensorAccessorW const &probs,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
-                                     TreeSearchBatchConfig const *bc) {
+                                     BatchConfig const *bc) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index e5b6331c8..ad80b665e 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -260,11 +260,12 @@ void ArgMax::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap ArgMax::inference(FFModel const &ff,
-                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                            std::vector<ParallelTensor> const &batch_inputs,
-                            std::vector<ParallelTensor> const &batch_outputs,
-                            MachineView const *mv) {
+FutureMap ArgMax::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
@@ -332,7 +333,7 @@ FutureMap ArgMax::inference(FFModel const &ff,
   }
 }
 
-SsmInferenceResult
+InferenceResult
     ArgMax::inference_task_beam(Task const *task,
                                 std::vector<PhysicalRegion> const &regions,
                                 Context ctx,
@@ -342,7 +343,7 @@ SsmInferenceResult
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     // Directly return for empty batch config
-    SsmInferenceResult ir;
+    InferenceResult ir;
     return ir;
   }
   ArgMaxMeta *m = *((ArgMaxMeta **)task->local_args);
@@ -355,17 +356,16 @@ SsmInferenceResult
   GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO(
       DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime);
   ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size);
-  SsmInferenceResult ir;
+  InferenceResult ir;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   download_tensor(m->probs, ir.probs, batch_size);
-  download_tensor<int>(parent.get_int32_ptr(), ir.parent_id, batch_size);
 
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
     ArgMax::save_inference_tensors_to_file(
-        m, shard_id, bc, {}, {}, {input, indices, parent});
+        m, shard_id, bc, {}, {}, {input, indices});
   }
 
   return ir;
diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp
index f09caa10d..9ba9f7b8b 100644
--- a/src/ops/fused.cpp
+++ b/src/ops/fused.cpp
@@ -525,7 +525,8 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  /* Reserved: BatchConfig Updated */BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  /* Reserved: BatchConfig Updated */ BatchConfig const *bc =
+      BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     return;
   }
@@ -871,8 +872,8 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         TreeIncMultiHeadSelfAttentionMeta *m =
             (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        TreeVerifyBatchConfig const &verify_bc =
-            Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
+        BatchConfig const &verify_bc =
+            Future(task->futures[0]).get_result<BatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
@@ -897,8 +898,8 @@ __host__ void
             (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
         // TreeSearchBatchConfig const *search_bc =
         //     (TreeSearchBatchConfig *)task->args;
-        TreeSearchBatchConfig const &search_bc =
-            Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
+        BatchConfig const &search_bc =
+            Future(task->futures[0]).get_result<BatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 875321182..ab41e5af1 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -539,7 +539,8 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  /* Reserved: BatchConfig Updated */BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  /* Reserved: BatchConfig Updated */ BatchConfig const *bc =
+      BatchConfig::from_future(task->futures[0]);
   // Return if no active tokens
   if (bc->num_tokens == 0) {
     return;
@@ -907,8 +908,8 @@ __host__ void
             (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
         // TreeVerifyBatchConfig const *verify_bc =
         //     (TreeVerifyBatchConfig *)task->args;
-        TreeVerifyBatchConfig const &verify_bc =
-            Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
+        BatchConfig const &verify_bc =
+            Future(task->futures[0]).get_result<BatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
@@ -933,8 +934,8 @@ __host__ void
             (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
         // TreeSearchBatchConfig const *search_bc =
         //     (TreeSearchBatchConfig *)task->args;
-        TreeSearchBatchConfig const &search_bc =
-            Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
+        BatchConfig const &search_bc =
+            Future(task->futures[0]).get_result<BatchConfig>();
         assert(fused->op_num_weights[op] ==
                (1 + (int)(*m->qkv_bias || *m->final_bias)));
         GenericTensorAccessorR biases;
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index b74225bee..c5480e187 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -124,19 +124,19 @@ __global__ void scaling_query_kernel(DT *input_ptr,
 }
 
 template <typename DT>
-__global__ void apply_rotary_embedding_native(
-    DT *input_ptr,
-    hipFloatComplex *complex_input,
-    /* Reserved: BatchConfig Updated */
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    int qProjSize,
-    int kProjSize,
-    int num_q_heads,
-    int num_tokens,
-    int num_kv_heads,
-    int q_block_size,
-    int k_block_size,
-    int q_array_size) {
+__global__ void
+    apply_rotary_embedding_native(DT *input_ptr,
+                                  hipFloatComplex *complex_input,
+                                  /* Reserved: BatchConfig Updated */
+                                  BatchConfig::PerTokenInfo const *tokenInfos,
+                                  int qProjSize,
+                                  int kProjSize,
+                                  int num_q_heads,
+                                  int num_tokens,
+                                  int num_kv_heads,
+                                  int q_block_size,
+                                  int k_block_size,
+                                  int q_array_size) {
   CUDA_KERNEL_LOOP(
       i,
       num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
@@ -968,13 +968,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       }
       case TREE_SEARCH_MODE: {
         key_cache_size = num_q_heads * kProjSize *
-                         TreeSearchBatchConfig::max_requests_per_batch() *
+                         BatchConfig::max_requests_per_batch() *
                          BatchConfig::max_sequence_length() *
-                         TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
         value_cache_size = num_q_heads * vProjSize *
-                           TreeSearchBatchConfig::max_requests_per_batch() *
+                           BatchConfig::max_requests_per_batch() *
                            BatchConfig::max_sequence_length() *
-                           TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+                           BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
         break;
       }
       default:
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 1646c8ab0..54b1704a3 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1366,11 +1366,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case TREE_VERIFY_MODE: {
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
-                         TreeSearchBatchConfig::max_requests_per_batch() *
+                         BatchConfig::max_requests_per_batch() *
                          (BatchConfig::max_sequence_length() +
                           BatchConfig::max_spec_tree_token_num());
         value_cache_size = num_q_heads * vProjSize *
-                           TreeSearchBatchConfig::max_requests_per_batch() *
+                           BatchConfig::max_requests_per_batch() *
                            (BatchConfig::max_sequence_length() +
                             BatchConfig::max_spec_tree_token_num());
         break;
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 08b0a2ac9..0e71af411 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -673,7 +673,7 @@ void SpecIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap SpecIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
@@ -736,8 +736,7 @@ void SpecIncMultiHeadSelfAttention::inference_task(
     Runtime *runtime) {
   assert(task->regions.size() == regions.size());
 
-  TreeSearchBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeSearchBatchConfig>();
+  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
   if (bc.num_tokens == 0) {
     return;
   }
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index e636d629b..f5de35663 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -48,7 +48,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     int const max_seq_length,
     int per_head_size,
     int hidden_size,
-    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
+    /* Reserved: BatchConfig Updated */
+    BatchConfig::PerRequestInfo *request_infos,
     BatchConfig::BitMask *causalMask,
     bool *request_available) {
 
@@ -86,7 +87,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   // request_idx = re
 
   // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
-  BatchConfig::BitMask* bitmask = &causalMask[requext_idx_in_batch];
+  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
 
   int const first_step = 0;
 
@@ -100,9 +101,10 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   int const totalCacheSize =
       bitmask->non_tree_cache_size + bitmask->tree_or_prompt_size;
 
-  int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
+  int const first_token_idx =
+      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
-  int const tree_branch_num = 
+  int const tree_branch_num =
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
   // shared memory objects
@@ -147,7 +149,8 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
           ii * THREADS_PER_KEY * K_VEC_SIZE);
     }
 
-    // int const query_token = bitmask->prompt_size + bitmask->tree_or_prompt_size
+    // int const query_token = bitmask->prompt_size +
+    // bitmask->tree_or_prompt_size
     // -
     //                         1 - tree_branch_num + qi;
     int const query_token = bitmask->tree_or_prompt_size - tree_branch_num + qi;
@@ -172,9 +175,11 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
         // todo add alobi here
         // bool const mask = ti_circ >= totalCacheSize;
         bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
-                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                          //   (1 << query_token))));
+                           !test_bit(bitmask->bit_mask,
+                                     ti - bitmask->non_tree_cache_size,
+                                     query_token));
+        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+        //   (1 << query_token))));
 
         // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) {
         //   printf("spec inc attn qkqkqk  request id %d,  %.10f, %d\n",
@@ -225,9 +230,11 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     for (int ti = first_step + tidx; ti < totalCacheSize;
          ti += THREADS_PER_BLOCK) {
       bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
-                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                          //   (1 << query_token))));
+                         !test_bit(bitmask->bit_mask,
+                                   ti - bitmask->non_tree_cache_size,
+                                   query_token));
+      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+      //   (1 << query_token))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -262,8 +269,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 
     // The base pointer for the value in the cache buffer.
     DT const *v_cache_batch =
-        value_cache + requext_idx_in_batch * max_seq_length * hidden_size +
-        vi;
+        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
 
     if (Dh == Dh_MAX || vi < Dh) {
       for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) {
@@ -273,9 +279,11 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
 
         bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                            !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, query_token));
-                            // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                            //   (1 << query_token))));
+                           !test_bit(bitmask->bit_mask,
+                                     ti - bitmask->non_tree_cache_size,
+                                     query_token));
+        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+        //   (1 << query_token))));
         float logit = mask ? 0.0f : qk_smem[ti - first_step];
         out = FlexFlow::fma(logit, cast_to_float(v), out);
       }
@@ -321,19 +329,19 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
 }
 
 template <typename DT>
-__global__ void spec_inc_store_kv_cache(
-    DT const *devQKVProjArray,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
-    BatchConfig::PerTokenInfo *tokenInfos,
-    BatchConfig::PerRequestInfo *requestInfo,
-    BatchConfig::BitMask *causalMask,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int num_tokens,
-    int max_seq_len,
-    int hidden_size) {
+__global__ void
+    spec_inc_store_kv_cache(DT const *devQKVProjArray,
+                            DT *kCache_ptr,
+                            DT *vCache_ptr,
+                            BatchConfig::PerTokenInfo *tokenInfos,
+                            BatchConfig::PerRequestInfo *requestInfo,
+                            BatchConfig::BitMask *causalMask,
+                            int qProjSize,
+                            int kProjSize,
+                            int vProjSize,
+                            int num_tokens,
+                            int max_seq_len,
+                            int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
     int token_idx = i / (hidden_size);
     int offset = i % hidden_size;
@@ -351,12 +359,13 @@ __global__ void spec_inc_store_kv_cache(
         requestInfo[req_id].first_token_offset_in_batch;
 
     // BatchConfig::BitMask bitmask = causalMask[req_id];
-    BatchConfig::BitMask* bitmask = &causalMask[req_id];
+    BatchConfig::BitMask *bitmask = &causalMask[req_id];
 
     // if prompt token -> token id
     // if tree token:
 
-    // int const cache_idx = bitmask->prompt_size + bitmask->non_tree_cache_size +
+    // int const cache_idx = bitmask->prompt_size + bitmask->non_tree_cache_size
+    // +
     //                       bitmask->tree_or_prompt_size - 1 -
     //                       bitmask->current_layer_size + token_idx -
     //                       request_token_offset;
@@ -373,7 +382,7 @@ __global__ void spec_inc_store_kv_cache(
 
 template <typename DT>
 void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                            TreeSearchBatchConfig const *bc,
+                            BatchConfig const *bc,
                             cudaStream_t stream) {
   int num_tokens = bc->num_active_tokens();
   if (num_tokens > 0) {
@@ -428,7 +437,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 template <typename DT>
 void compute_spec_inc_attention_kernel_generation(
     SpecIncMultiHeadSelfAttentionMeta const *m,
-    TreeSearchBatchConfig const *bc,
+    BatchConfig const *bc,
     DT *output_ptr,
     cudaStream_t stream) {
   // one block == one head per request
@@ -469,7 +478,7 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix,
 
 template <typename DT>
 void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
-                                     TreeSearchBatchConfig const *bc,
+                                     BatchConfig const *bc,
                                      int shard_id,
                                      DT *output_ptr,
                                      DT const *bias_ptr,
@@ -711,7 +720,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                      TreeSearchBatchConfig const *bc,
+                      BatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
                       DT const *weight_ptr,
@@ -754,7 +763,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 /*static*/
 void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     SpecIncMultiHeadSelfAttentionMeta const *m,
-    TreeSearchBatchConfig const *bc,
+    BatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
     GenericTensorAccessorR const &weight,
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 19ee265c5..c1d1e53a5 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -46,7 +46,7 @@ using Legion::TaskArgument;
 using Legion::TaskLauncher;
 using PCG::Node;
 
-LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA");
+LegionRuntime::Logger::Category log_tree_verify("BatchConfig");
 
 bool TreeIncMultiHeadSelfAttentionParams::is_valid(
     ParallelTensorShape const &input) const {
@@ -55,7 +55,7 @@ bool TreeIncMultiHeadSelfAttentionParams::is_valid(
 }
 
 Tensor FFModel::inc_multihead_self_attention_verify(
-    const Tensor input,
+    Tensor const input,
     int embed_dim,
     int num_heads,
     int kdim,
@@ -93,7 +93,7 @@ Tensor FFModel::inc_multihead_self_attention_verify(
 }
 
 Tensor FFModel::inc_multiquery_self_attention_verify(
-    const Tensor input,
+    Tensor const input,
     int embed_dim,
     int num_q_heads,
     int num_kv_heads,
@@ -279,7 +279,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
 TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     FFModel &model,
     LayerID const &_layer_guid,
-    const ParallelTensor _input,
+    ParallelTensor const _input,
     int _embed_dim,
     int _num_q_heads,
     int _num_kv_heads,
@@ -393,8 +393,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
 
 TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     FFModel &model,
-    const ParallelTensor _input,
-    const ParallelTensor _weight,
+    ParallelTensor const _input,
+    ParallelTensor const _weight,
     int _embed_dim,
     int _num_q_heads,
     int _num_kv_heads,
@@ -510,7 +510,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
 TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     FFModel &model,
     TreeIncMultiHeadSelfAttention const &other,
-    const ParallelTensor input,
+    ParallelTensor const input,
     bool allocate_weights)
     : TreeIncMultiHeadSelfAttention(model,
                                     other.layer_guid,
@@ -740,7 +740,7 @@ void TreeIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap TreeIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
@@ -806,12 +806,10 @@ void TreeIncMultiHeadSelfAttention::inference_task(
     Runtime *runtime) {
   assert(task->regions.size() == regions.size());
 
-  TreeVerifyBatchConfig const &bc =
-      Future(task->futures[0]).get_result<TreeVerifyBatchConfig>();
-  log_tree_verify.debug(
-      "TreeVerifyBatchConfig, num_tokens: %d, num_requests: %d",
-      bc.num_tokens,
-      bc.num_active_requests());
+  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  log_tree_verify.debug("BatchConfig, num_tokens: %d, num_requests: %d",
+                        bc.num_tokens,
+                        bc.num_active_requests());
   if (bc.num_tokens == 0) {
     return;
   }
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index e1fc4d73a..b2002453b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -36,7 +36,8 @@ __global__ void commit_tokens_kernel(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    /* Reserved: BatchConfig Updated, leave HIP code to be updated */TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    /* Reserved: BatchConfig Updated, leave HIP code to be updated */
+    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int qProjSize,
     int kProjSize,
     int vProjSize,
@@ -70,7 +71,7 @@ __global__ void commit_tokens_kernel(
 
 template <typename DT>
 void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
-                   TreeVerifyBatchConfig const *bc,
+                   BatchConfig const *bc,
                    hipStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   if (num_tokens_to_commit > 0) {
@@ -96,19 +97,19 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void update_tree_branch_kv_cache(
-    DT const *devQKVProjArray,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
-    TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int num_tokens_in_branch,
-    int processed_tokens_in_batch,
-    int total_tokens_in_batch,
-    int max_seq_len,
-    int hidden_size) {
+__global__ void
+    update_tree_branch_kv_cache(DT const *devQKVProjArray,
+                                DT *kCache_ptr,
+                                DT *vCache_ptr,
+                                BatchConfig::PerTokenInfo const *tokenInfos,
+                                int qProjSize,
+                                int kProjSize,
+                                int vProjSize,
+                                int num_tokens_in_branch,
+                                int processed_tokens_in_batch,
+                                int total_tokens_in_batch,
+                                int max_seq_len,
+                                int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) {
     int token_idx = i / (hidden_size * KV_WEIGHT_NUM);
     int offset = i % hidden_size;
@@ -146,7 +147,7 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix,
 
 template <typename DT>
 void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
-                              TreeVerifyBatchConfig const *bc,
+                              BatchConfig const *bc,
                               int shard_id,
                               DT *output_ptr,
                               DT const *bias_ptr,
@@ -437,7 +438,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
-                      TreeVerifyBatchConfig const *bc,
+                      BatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
                       DT const *weight_ptr,
@@ -464,13 +465,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // Note that m->num_active_tokens stores the number of active
   // tokens in the previous batch, which is needed for committing
   // keys/values to the key-value cache
-  checkCUDA(
-      hipMemcpyAsync(m->committed_token_infos,
-                     &(bc->committed_tokens),
-                     bc->num_tokens_to_commit *
-                         sizeof(TreeVerifyBatchConfig::CommittedTokensInfo),
-                     hipMemcpyHostToDevice,
-                     stream));
+  checkCUDA(hipMemcpyAsync(m->committed_token_infos,
+                           &(bc->committed_tokens),
+                           bc->num_tokens_to_commit *
+                               sizeof(BatchConfig::CommittedTokensInfo),
+                           hipMemcpyHostToDevice,
+                           stream));
   commit_tokens<DT>(m, bc, stream);
 
   // After commit we update m->num_active_tokens to be the number of active
@@ -486,7 +486,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   checkCUDA(hipMemcpyAsync(m->token_infos,
                            &(bc->tokensInfo),
                            bc->num_active_tokens() *
-                               sizeof(TreeVerifyBatchConfig::PerTokenInfo),
+                               sizeof(BatchConfig::PerTokenInfo),
                            hipMemcpyHostToDevice,
                            stream));
   // phase 1: Implement kernel to compute KQV for input tokens
@@ -515,7 +515,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 /*static*/
 void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     TreeIncMultiHeadSelfAttentionMeta *m,
-    TreeVerifyBatchConfig const *bc,
+    BatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
     GenericTensorAccessorR const &weight,
@@ -631,24 +631,22 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
   {
     int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
     size_t committed_tokeninfo_size = max_tokens_per_batch;
-    size_t total_size = committed_tokeninfo_size *
-                        sizeof(TreeVerifyBatchConfig::CommittedTokensInfo);
+    size_t total_size =
+        committed_tokeninfo_size * sizeof(BatchConfig::CommittedTokensInfo);
     if (offload) {
       // assert that we have enough reserved work space left
       assert(gpu_mem_allocator.reserved_total_size -
                  gpu_mem_allocator.reserved_allocated_size >=
              total_size);
       committed_token_infos =
-          gpu_mem_allocator
-              .allocate_reserved<TreeVerifyBatchConfig::CommittedTokensInfo>(
-                  committed_tokeninfo_size);
+          gpu_mem_allocator.allocate_reserved<BatchConfig::CommittedTokensInfo>(
+              committed_tokeninfo_size);
     } else {
       gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst,
                                                total_size);
       committed_token_infos =
-          gpu_mem_allocator
-              .allocate_instance<TreeVerifyBatchConfig::CommittedTokensInfo>(
-                  committed_tokeninfo_size);
+          gpu_mem_allocator.allocate_instance<BatchConfig::CommittedTokensInfo>(
+              committed_tokeninfo_size);
     }
   }
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 3ab39ed88..3d0235a6b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -50,7 +50,8 @@ __global__ void compute_attention_kernel_fused_kernel(
     int const max_token_per_batch,
     int per_head_size,
     int hidden_size,
-    /* Reserved: BatchConfig Updated */BatchConfig::PerRequestInfo *request_infos,
+    /* Reserved: BatchConfig Updated */
+    BatchConfig::PerRequestInfo *request_infos,
     int num_heads,
     int num_requests,
     BatchConfig::BitMask *causalMask,
@@ -94,13 +95,13 @@ __global__ void compute_attention_kernel_fused_kernel(
   int const tlength =
       request_infos[requext_idx_in_batch].first_token_index_in_request +
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
-  int const qlength =
-      request_infos[requext_idx_in_batch].num_tokens_in_batch;
+  int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
   // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
-  BatchConfig::BitMask* bitmask = &causalMask[requext_idx_in_batch];
+  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
 
-  int const first_token_idx = request_infos[requext_idx_in_batch].first_token_offset_in_batch;
+  int const first_token_idx =
+      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
   int q_start =
       request_infos[requext_idx_in_batch].first_token_index_in_request;
@@ -169,12 +170,14 @@ __global__ void compute_attention_kernel_fused_kernel(
       float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
 
       if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
-        bool const mask =
-            prompt_phase ? (qi + q_start < ti)
-                         : (ti >= bitmask->non_tree_cache_size &&
-                            !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
-                            // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                            //    (1 << qi))));
+        bool const mask = prompt_phase
+                              ? (qi + q_start < ti)
+                              : (ti >= bitmask->non_tree_cache_size &&
+                                 !test_bit(bitmask->bit_mask,
+                                           ti - bitmask->non_tree_cache_size,
+                                           qi));
+        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+        //    (1 << qi))));
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -229,11 +232,13 @@ __global__ void compute_attention_kernel_fused_kernel(
     float exp_sum = 0.f;
     for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
       bool const mask =
-          prompt_phase ? (q_start + qi < ti)
-                       : (ti >= bitmask->non_tree_cache_size &&
-                          !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
-                          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                          //    (1 << qi))));
+          prompt_phase
+              ? (q_start + qi < ti)
+              : (ti >= bitmask->non_tree_cache_size &&
+                 !test_bit(
+                     bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
+      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+      //    (1 << qi))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -267,8 +272,7 @@ __global__ void compute_attention_kernel_fused_kernel(
 
     // The base pointer for the value in the cache buffer.
     DT const *v_cache_batch =
-        value_cache + requext_idx_in_batch * max_seq_length * hidden_size +
-        vi;
+        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
 
     if (Dh == Dh_MAX || vi < Dh) {
       for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
@@ -279,13 +283,14 @@ __global__ void compute_attention_kernel_fused_kernel(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
 
         if (ti < tlength) {
-          bool const mask =
-              prompt_phase
-                  ? (q_start + qi < ti)
-                  : (ti >= bitmask->non_tree_cache_size &&
-                      !test_bit(bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
-                      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-                      //   (1 << qi))));
+          bool const mask = prompt_phase
+                                ? (q_start + qi < ti)
+                                : (ti >= bitmask->non_tree_cache_size &&
+                                   !test_bit(bitmask->bit_mask,
+                                             ti - bitmask->non_tree_cache_size,
+                                             qi));
+          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
+          //   (1 << qi))));
           float logit = mask ? 0.0f : qk_smem[ti - first_step];
           out = FlexFlow::fma(logit, cast_to_float(v), out);
         }
@@ -347,7 +352,7 @@ __global__ void commit_tokens_kernel(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    TreeVerifyBatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int qProjSize,
     int kProjSize,
     int vProjSize,
@@ -381,7 +386,7 @@ __global__ void commit_tokens_kernel(
 
 template <typename DT>
 void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
-                   TreeVerifyBatchConfig const *bc,
+                   BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   if (num_tokens_to_commit > 0) {
@@ -406,19 +411,19 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void update_tree_branch_kv_cache(
-    DT const *devQKVProjArray,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
-    TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int num_tokens_in_branch,
-    int processed_tokens_in_batch,
-    int total_tokens_in_batch,
-    int max_seq_len,
-    int hidden_size) {
+__global__ void
+    update_tree_branch_kv_cache(DT const *devQKVProjArray,
+                                DT *kCache_ptr,
+                                DT *vCache_ptr,
+                                BatchConfig::PerTokenInfo const *tokenInfos,
+                                int qProjSize,
+                                int kProjSize,
+                                int vProjSize,
+                                int num_tokens_in_branch,
+                                int processed_tokens_in_batch,
+                                int total_tokens_in_batch,
+                                int max_seq_len,
+                                int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) {
 
     int token_idx = i / (hidden_size);
@@ -445,7 +450,7 @@ __global__ void update_tree_branch_kv_cache_fused(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos,
+    BatchConfig::PerTokenInfo const *tokenInfos,
     BatchConfig::PerRequestInfo *request_infos,
     int qProjSize,
     int kProjSize,
@@ -506,7 +511,7 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix,
 
 template <typename DT>
 void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
-                              TreeVerifyBatchConfig const *bc,
+                              BatchConfig const *bc,
                               int shard_id,
                               DT *output_ptr,
                               DT const *bias_ptr,
@@ -799,8 +804,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   assert(processed_tokens_in_batch == bc->num_active_tokens());
 }
 
-#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(                             \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream, prompt_phase)      \
+#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
+                                                  Dh,                          \
+                                                  Dh_MAX,                      \
+                                                  THDS_PER_KEY,                \
+                                                  THDS_PER_VALUE,              \
+                                                  THDS_PER_BLOCK,              \
+                                                  stream,                      \
+                                                  prompt_phase)                \
   smem_size_in_bytes_tree<DT>(m->qProjSize,                                    \
                               BatchConfig::max_sequence_length() +             \
                                   BatchConfig::max_spec_tree_token_num(),      \
@@ -835,7 +846,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                    TreeVerifyBatchConfig const *bc,
+                                    BatchConfig const *bc,
                                     DT *output_ptr,
                                     cudaStream_t stream) {
 
@@ -880,7 +891,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
-                      TreeVerifyBatchConfig const *bc,
+                      BatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
                       DT const *weight_ptr,
@@ -958,7 +969,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 /*static*/
 void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     TreeIncMultiHeadSelfAttentionMeta *m,
-    TreeVerifyBatchConfig const *bc,
+    BatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
     GenericTensorAccessorR const &weight,
@@ -1073,7 +1084,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
         sizeof(BatchConfig::request_available));
     committed_token_infos =
-        reinterpret_cast<TreeVerifyBatchConfig::CommittedTokensInfo *>(
+        reinterpret_cast<BatchConfig::CommittedTokensInfo *>(
             reinterpret_cast<char *>(handler.batch_config_metadata) +
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index def2e7a17..d4d9cf0fe 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -25,42 +25,53 @@ LegionRuntime::Logger::Category log_bc("BatchConfig");
 using Legion::Future;
 using Legion::Memory;
 
-BatchConfig::BatchConfig()
-    : num_tokens(0), num_available_requests(0), prompt_phase(false) {
+// BatchConfig::BatchConfig() : model_id(0), inference_mode(INC_DECODING_MODE) {
+//   std::fill(std::begin(request_available), std::end(request_available), 0);
+//   // Don't need to initialize requestInfo ,tokensInfo, causalMask and
+//   // committed_tokens here because they initialize themselves.
+//   // Other fields are already initialized to proper value.
+// }
+
+BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_)
+    : model_id(model_id_), inference_mode(inference_mode_) {
   std::fill(std::begin(request_available), std::end(request_available), 0);
-  // Don't need to initialize requestInfo ,tokensInfo, and causalMask
-  // here because they initialize themselves.
+  // Don't need to initialize requestInfo ,tokensInfo, causalMask and
+  // committed_tokens here because they initialize themselves.
+  // Other fields are already initialized to proper value.
 }
 
+/*static*/
+// BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future)
+// {
+//   BatchConfig const *bc = static_cast<BatchConfig const *>(
+//       Future(future).get_buffer(Memory::SYSTEM_MEM));
+//   // Check future size
+//   if (bc->get_mode() == INC_DECODING_MODE) {
+//     assert(Future(future).get_untyped_size() == sizeof(BatchConfig));
+//   } else if (bc->get_mode() == TREE_SEARCH_MODE) {
+//     assert(Future(future).get_untyped_size() ==
+//     sizeof(TreeSearchBatchConfig));
+//   } else if (bc->get_mode() == TREE_VERIFY_MODE) {
+//     assert(Future(future).get_untyped_size() ==
+//     sizeof(TreeVerifyBatchConfig));
+//   } else {
+//     assert(false && "Unsupported inference mode");
+//   }
+//   return bc;
+// }
+
 /*static*/
 BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) {
-  BatchConfig const *bc = static_cast<BatchConfig const *>(
+  return static_cast<BatchConfig const *>(
       Future(future).get_buffer(Memory::SYSTEM_MEM));
-  // Check future size
-  if (bc->get_mode() == INC_DECODING_MODE) {
-    assert(Future(future).get_untyped_size() == sizeof(BatchConfig));
-  } else if (bc->get_mode() == TREE_SEARCH_MODE) {
-    assert(Future(future).get_untyped_size() == sizeof(TreeSearchBatchConfig));
-  } else if (bc->get_mode() == TREE_VERIFY_MODE) {
-    assert(Future(future).get_untyped_size() == sizeof(TreeVerifyBatchConfig));
-  } else {
-    assert(false && "Unsupported inference mode");
-  }
-  return bc;
 }
 
 InferenceMode BatchConfig::get_mode() const {
-  return INC_DECODING_MODE;
+  return inference_mode;
 }
 
 int BatchConfig::num_active_requests() const {
-  int num_requests = 0;
-  for (int i = 0; i < max_requests_per_batch(); i++) {
-    if (request_available[i]) {
-      num_requests++;
-    }
-  }
-  return num_requests;
+  return num_available_requests;
 }
 
 int BatchConfig::num_active_tokens() const {
@@ -95,18 +106,37 @@ int BatchConfig::max_spec_tree_token_num() {
 std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode()
      << ") @@@@@@@@@@@@@@" << std::endl;
-  // Max values
-  os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl;
-  os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl;
-  os << "Max sequence length: " << bc.max_sequence_length() << std::endl;
   // Current values
   os << "Number of tokens: " << bc.num_active_tokens() << std::endl;
   os << "Number of requests: " << bc.num_active_requests() << std::endl;
+  os << "Prompt phase: " << bc.prompt_phase << std::endl;
+  os << "Inference mode: ";
+  switch (bc.inference_mode) {
+    case INC_DECODING_MODE:
+      os << "Incremental decoding";
+      break;
+    case TREE_SEARCH_MODE:
+      os << "Tree search";
+      break;
+    case TREE_VERIFY_MODE:
+      os << "Tree verify";
+      break;
+    default:
+      os << "Unknown";
+  }
+  os << std::endl;
+  if (bc.inference_mode == TREE_VERIFY_MODE) {
+    os << "Number of tokens to commit: " << bc.num_tokens_to_commit
+       << std::endl;
+  }
+  if (bc.inference_mode == TREE_SEARCH_MODE) {
+    os << "Model id: " << bc.model_id << std::endl;
+  }
 
   // Per-request info
   os << "Per-request info:\n";
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-    if (!bc.request_available[i]) {
+    if (bc.request_available[i]) {
       os << "  Request " << i << ":\n";
       os << "    First token depth in request: "
          << bc.requestsInfo[i].first_token_index_in_request << std::endl;
@@ -127,6 +157,38 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
     os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
     os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
   }
+
+  if (bc.inference_mode == TREE_VERIFY_MODE) {
+    os << "Committed tokens info:\n";
+    for (int i = 0; i < bc.num_tokens_to_commit; i++) {
+      os << "  Token " << i << ":\n";
+      os << "    Token index: " << bc.committed_tokens[i].token_index
+         << std::endl;
+      os << "    Request index: " << bc.committed_tokens[i].request_index
+         << std::endl;
+      os << "    Token depth: " << bc.committed_tokens[i].token_depth
+         << std::endl;
+    }
+  }
+
+  if (bc.inference_mode == TREE_SEARCH_MODE ||
+      bc.inference_mode == TREE_VERIFY_MODE) {
+    os << "Causal mask:\n";
+    for (int i = 0; i < bc.max_requests_per_batch(); i++) {
+      if (bc.request_available[i]) {
+        os << "  Request " << i << ":\n";
+        os << "    Non tree cache size: "
+           << bc.causalMask[i].non_tree_cache_size << std::endl;
+        os << "    Tree or prompt size: "
+           << bc.causalMask[i].tree_or_prompt_size
+
+           << std::endl;
+        os << "    Current layer size: " << bc.causalMask[i].current_layer_size
+           << std::endl;
+      }
+    }
+  }
+
   os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl;
   return os;
 }
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 53a95595c..2df1d6bfa 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -307,26 +307,8 @@ void InferenceManager::init_operators_inference(FFModel *model) {
 FutureMap InferenceManager::inference(FFModel *model,
                                       int index,
                                       BatchConfig const &bc) {
-  if (bc.get_mode() == INC_DECODING_MODE) {
-    BatchConfigFuture bcf = Future::from_value<BatchConfig>(bc);
-    return inference(model, index, bcf);
-  } else if (bc.get_mode() == TREE_SEARCH_MODE) {
-    BatchConfig const *bc_ptr = &bc;
-    TreeSearchBatchConfig const *tsbc_ptr =
-        static_cast<TreeSearchBatchConfig const *>(bc_ptr);
-    TreeSearchBatchConfigFuture bcf =
-        Future::from_value<TreeSearchBatchConfig>(*tsbc_ptr);
-    return inference(model, index, bcf);
-  } else if (bc.get_mode() == TREE_VERIFY_MODE) {
-    BatchConfig const *bc_ptr = &bc;
-    TreeVerifyBatchConfig const *tvbc_ptr =
-        static_cast<TreeVerifyBatchConfig const *>(bc_ptr);
-    TreeVerifyBatchConfigFuture bcf =
-        Future::from_value<TreeVerifyBatchConfig>(*tvbc_ptr);
-    return inference(model, index, bcf);
-  } else {
-    assert(false && "Unsupported inference mode");
-  }
+  BatchConfigFuture bcf = Future::from_value<BatchConfig>(bc);
+  return inference(model, index, bcf);
 }
 
 FutureMap InferenceManager::inference(FFModel *model,
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 7b3ed3468..29af1fffa 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -5981,14 +5981,14 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<SsmInferenceResult,
+      Runtime::preregister_task_variant<InferenceResult,
                                         ArgTopK::inference_speculative_task>(
           registrar, "ArgTopK Speculative Inference Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<SsmInferenceResult,
+      runtime->register_task_variant<InferenceResult,
                                      ArgTopK::inference_speculative_task>(
           registrar);
     }
@@ -6081,15 +6081,16 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
-      Runtime::preregister_task_variant<SsmInferenceResult,
+      Runtime::preregister_task_variant<InferenceResult,
                                         ArgMax::inference_task_beam>(
           registrar, "ArgMax Inference Task Beam");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<SsmInferenceResult,
-                                     ArgMax::inference_task_beam>(registrar);
+      runtime
+          ->register_task_variant<InferenceResult, ArgMax::inference_task_beam>(
+              registrar);
     }
   }
   {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 72f8a4c74..26beb52d8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -437,7 +437,6 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     return;
   }
 
-  SsmInferenceResult const *ssm_result_ptr;
   switch (request_manager_status) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
@@ -496,11 +495,13 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           load_pending_reqeust_to_batch();
           prefill_model = SSM;
         }
+      } else {
+        request_manager_status = SSM_SPEC;
+        current_speculation_step = 0;
       }
       break;
     case SSM_SPEC:
-      ssm_result_ptr = dynamic_cast<SsmInferenceResult const *>(&result);
-      if (update_ssm_inference_results(*ssm_result_ptr)) {
+      if (update_ssm_inference_results(result)) {
         // Stop condition for the speculation phase has been reached
         request_manager_status = LLM_VERIFY;
       }
@@ -512,14 +513,28 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 }
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
+  bool prefill_completed = false;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
-    return true;
+    prefill_completed = true;
+
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      // Add the last token to the token tree
+      prefill_request->committed_tokens.push_back(
+          Request::CommittedToken{-1,
+                                  (int)prefill_request->tokens.size() - 1,
+                                  prefill_request->tokens.back()});
+
+      init_token_tree(prefill_request->guid);
+      add_root_to_spec_token_tree(prefill_request->guid,
+                                  prefill_request->tokens.back());
+      update_bitmask_prompt(prefill_request->guid, 1);
+    }
   }
-  return false;
+  return prefill_completed;
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
@@ -603,6 +618,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
+  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
   bc.prompt_phase = true;
   std::copy(std::begin(request_available),
             std::end(request_available),
@@ -639,7 +655,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   return bc;
 }
 
-TreeSearchBatchConfig RequestManager::prepare_ssm_prefilling_batch() {
+BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   // This function is called when the request_manager_status is PREFILLING,
   // which means that there is a request in the prefilling phase.
   // This function load its prefilling tokens, constructing a BatchConfig with
@@ -651,7 +667,8 @@ TreeSearchBatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   assert(prefill_request != nullptr &&
          "No prefilling request to process in the prefilling phase.");
 
-  TreeSearchBatchConfig bc;
+  BatchConfig bc;
+  bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   bc.prompt_phase = true;
   std::copy(std::begin(request_available),
             std::end(request_available),
@@ -698,6 +715,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
   }
 
   BatchConfig bc;
+  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
   bc.prompt_phase = false;
   std::copy(std::begin(request_available),
             std::end(request_available),
@@ -734,20 +752,21 @@ BatchConfig RequestManager::prepare_decoding_batch() {
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
-TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
+BatchConfig RequestManager::prepare_first_spec_batch_config() {
   if (verbose) {
     std::cout << "\n############### prepare_first_spec_batch_config "
                  "##############\n";
   }
   // This method does the following:
-  // 1. Commit the verified tokens through TreeSearchBatchConfig. The infomation
+  // 1. Commit the verified tokens through BatchConfig. The infomation
   // of the committed tokens are stored in request.committed_tokens. Put the
   // information of the committed tokens into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
-  // TreeSearchBatchConfig.
+  // BatchConfig.
   assert(current_speculation_step == 0);
 
-  TreeSearchBatchConfig new_bc;
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   // Assume that only one small model is in use now
   new_bc.prompt_phase = true;
   std::copy(std::begin(request_available),
@@ -800,7 +819,7 @@ TreeSearchBatchConfig RequestManager::prepare_first_spec_batch_config() {
 }
 
 /***** Speculative Decoding Phase *****/
-TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
+BatchConfig RequestManager::prepare_next_spec_batch_config() {
   if (verbose) {
     std::cout
         << "\n############### prepare_next_spec_batch_config ###############\n";
@@ -808,7 +827,8 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   }
 
   // Prepare the next batch for existing requests
-  TreeSearchBatchConfig new_bc;
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   // We assume that only one small model is in use now
   new_bc.model_id = 0;
   std::copy(std::begin(request_available),
@@ -840,7 +860,7 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
-          token_tree.tree_layers.at(current_speculation_step);
+          token_tree.tree_layers.back();
       // Exclude the current layer from the token tree, because we want the
       // start index
       new_bc.requestsInfo[request_index].first_token_index_in_request =
@@ -868,14 +888,14 @@ TreeSearchBatchConfig RequestManager::prepare_next_spec_batch_config() {
   }
 
   if (verbose) {
-    std::cout << "prepare_next_batch_beam NEW batchconfig:" << std::endl;
+    std::cout << "prepare_next_spec_batch_config NEW batchconfig:" << std::endl;
     new_bc.print();
   }
   return new_bc;
 }
 
 /***** Verify Phase *****/
-TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
+BatchConfig RequestManager::prepare_verify_batch_config() {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout
@@ -883,20 +903,21 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
   }
   // This method does the following:
   // 1. Commit the verified tokens in the last iteration through the
-  // TreeVerifyBatchConfig. We can do this request by request.
+  // BatchConfig. We can do this request by request.
   // The information of the committed tokens is stored in
   // Request.llm_committed_tokens. Put the information of the committed tokens
-  // into TreeVerifyBatchConfig.committed_tokens.
+  // into BatchConfig.committed_tokens.
   // 2. Load the tokens on the token tree that are not yet pruned to
-  // TreeVerifyBatchConfig.tokensInfo. Be careful with the abs_depth etc.
+  // BatchConfig.tokensInfo. Be careful with the abs_depth etc.
   // (skip the pruned tokens).
   // 3. Create the causal mask for the large model based on the small model
   // causal mask (call create_llm_bitmask()).
-  // 4. Maintain TreeVerifyBatchConfig::RequestsInfo and all other fields of
-  // TreeSearchBatchConfig.
+  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
+  // BatchConfig.
   // Please refer to the implementation of prepare_next_spec_batch_config()
   // for more details.
-  TreeVerifyBatchConfig new_bc;
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
   std::copy(std::begin(request_available),
             std::end(request_available),
             std::begin(new_bc.request_available));
@@ -924,7 +945,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
         request.speculative_token_trees[0].tree_size;
 
     // Put the information of the committed tokens into
-    // TreeVerifyBatchConfig.committed_tokens.
+    // BatchConfig.committed_tokens.
     // Note here, we shouldn't put the last token in request.committed_tokens
     // into new_bc. Because the LLM don't have that token's KV cache.
     std::vector<Request::CommittedToken> &committed_tokens =
@@ -944,7 +965,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
     }
 
     // Load the tokens on the token tree that are not yet pruned to
-    // TreeVerifyBatchConfig.tokensInfo.
+    // BatchConfig.tokensInfo.
     TokenTree &token_tree = request.speculative_token_trees[0];
     int token_tree_index = 0;
     for (auto const &tree_layer : token_tree.tree_layers) {
@@ -959,7 +980,7 @@ TreeVerifyBatchConfig RequestManager::prepare_verify_batch_config() {
         }
       }
     }
-    assert(token_tree_index == token_tree.tree_size - 1);
+    assert(token_tree_index == token_tree.tree_size);
 
     // Create the causal mask for the large model based on the small model
     // causal mask.
@@ -1043,20 +1064,20 @@ bool RequestManager::update_llm_verify_results(
 }
 
 bool RequestManager::update_ssm_inference_results(
-    SsmInferenceResult const &ssm_inference_result) {
+    InferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
   assert(current_speculation_step >= 0 &&
          "The current speculation step should be no less than 0");
   current_speculation_step++;
 
-  int num_branches = TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+  int num_branches = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
   bool token_added_to_spec_tree = false;
 
   // Here we assume that the order of the tokens in the last
-  // TreeSearchBatchConfig and hence the last SsmInferenceResult is equal to
-  // the order of the request in the last TreeSearchBatchConfig
+  // BatchConfig and hence the last InferenceResult is equal to
+  // the order of the request in the last BatchConfig
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1076,13 +1097,17 @@ bool RequestManager::update_ssm_inference_results(
       // This means that the parent layer is empty
       continue;
     } else {
+      auto parent_layer_iter =
+          token_tree.tree_layers.end(); // The iterator after the last element
+      --parent_layer_iter;
       std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-          token_tree.tree_layers[current_speculation_step - 1];
+          *parent_layer_iter;
       int parent_pos = 0;
-      for (auto parent_it = parent_tree_layer.begin();
-           parent_it != parent_tree_layer.end();
-           parent_it++) {
-        if ((*parent_it)->pruned) {
+      //   for (auto &parent_it = parent_tree_layer.begin();
+      //        parent_it != parent_tree_layer.end();
+      //        parent_it++) {
+      for (auto parent_ptr : parent_tree_layer) {
+        if (parent_ptr->pruned) {
           // Parent token is pruned, we have to skip all its children
           // Because no token is pruned in the last layer during the small
           // model inference, the reason why some parents are pruned is that
@@ -1092,7 +1117,7 @@ bool RequestManager::update_ssm_inference_results(
         } else {
           // Parent token is not pruned
           for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-            float parent_log_prob = (*parent_it)->log_accumulated_prob;
+            float parent_log_prob = parent_ptr->log_accumulated_prob;
             token_added_to_spec_tree =
                 token_added_to_spec_tree ||
                 add_token_to_spec_token_tree(
@@ -1118,14 +1143,14 @@ bool RequestManager::update_ssm_inference_results(
 
   // Stop conditions
   return !token_added_to_spec_tree ||
-         current_speculation_step > TreeSearchBatchConfig::MAX_TREE_DEPTH;
+         current_speculation_step > BatchConfig::MAX_TREE_DEPTH;
 }
 
 /* --------- Bitmask Related Functions --------- */
 
 void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
-  // This method is called by load_pending_reqeust_to_batch when there is a new
-  // request to load into the batch
+  // This method is called by load_pending_reqeust_to_batch when there is a
+  // new request to load into the batch
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
 
@@ -1148,16 +1173,21 @@ void RequestManager::update_bitmask_prompt(RequestGuid guid,
   BatchConfig::BitMask &bitmask = request.causal_mask;
   // Clear because the prompt kernel doesn't use mask
   bitmask.clear_bitmask();
-  // No need to change non_tree_cache_size
   bitmask.tree_or_prompt_size = num_committed_tokens;
   bitmask.current_layer_size = num_committed_tokens;
+
+  // If the request just finishes the prefilling phase, we need to set the
+  // non_tree_cache_size to the size of the prompt
+  if (bitmask.non_tree_cache_size == 0) {
+    bitmask.non_tree_cache_size = request.tokens.size() - num_committed_tokens;
+  }
 }
 
 void RequestManager::init_bitmask_spec(RequestGuid guid) {
   // This method modifies the bitmask in place
-  // This method is called by the first call of update_ssm_inference_results in
-  // a speculative iteration
-  // CAUTION: You should still call append_bitmask() after this method
+  // This method is called by the first call of update_ssm_inference_results
+  // in a speculative iteration CAUTION: You should still call
+  // append_bitmask() after this method
   // 1. Clear the causal mask and add a root into it, because the tree is
   // currently empty but we have a root.
   // 2. Maintain all other fields.
@@ -1189,7 +1219,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
     return;
   }
   std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-      request.speculative_token_trees[0].tree_layers[current_speculation_step];
+      request.speculative_token_trees[0].tree_layers.back();
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
   int previous_tree_size = bitmask.tree_or_prompt_size;
@@ -1283,8 +1313,8 @@ void RequestManager::get_verify_results_greedy(
     committed_token_index++;
     // Don't add it to request.tokens because it has already been added.
 
-    // The position of the last accepted token in its tree layer (includeing the
-    // pruned tokens)
+    // The position of the last accepted token in its tree layer (includeing
+    // the pruned tokens)
     int last_accepted_token_layer_index = 0;
     // The index of the last accepted token in the entire tree (excluding the
     // pruned tokens)
@@ -1292,10 +1322,12 @@ void RequestManager::get_verify_results_greedy(
 
     int current_token_index = 1; // Because we skip the root
     int num_layers = token_tree.tree_layers.size();
-    for (int layer_index = 1; layer_index < num_layers; layer_index++) {
+    auto layer_it = token_tree.tree_layers.begin();
+    ++layer_it;
+    // for (int layer_index = 1; layer_index < num_layers; layer_index++) {
+    for (; layer_it != token_tree.tree_layers.end(); layer_it++) {
       // We skip the first layer
-      std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-          token_tree.tree_layers.at(layer_index);
+      std::list<std::shared_ptr<TokenTreeNode>> &tree_layer = *layer_it;
 
       bool token_accepted_this_layer = false;
       int current_token_layer_index = 0;
@@ -1535,37 +1567,54 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
         break;
       }
     }
-    runtime->begin_trace(ctx, 12346 /*trace_id*/);
-    InferenceResultFuture next_ir = batch_pipeline.back();
-    BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
-    if (request_manager_status == PREFILLING) {
-      if (prefill_model == LLM) {
-        FutureMap fm = im->inference(llm, 0, bcf);
-        assert(fm.get_future_map_domain().get_volume() == 1);
-        InferenceResultFuture irf = fm.get_future(0);
-        batch_pipeline.push(irf);
-      } else if (prefill_model == SSM) {
-        FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-        assert(fm.get_future_map_domain().get_volume() == 1);
-        InferenceResultFuture irf = fm.get_future(0);
-        batch_pipeline.push(irf);
-      } else {
-        assert(false && "Invalid prefill model");
-      }
-    } else if (request_manager_status == LLM_VERIFY) {
+    if ((request_manager_status == PREFILLING and prefill_model == LLM) or
+        request_manager_status == LLM_VERIFY) {
+      runtime->begin_trace(ctx, 12345 /*trace_id*/);
+      InferenceResultFuture next_ir = batch_pipeline.back();
+      BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
       FutureMap fm = im->inference(llm, 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
       InferenceResultFuture irf = fm.get_future(0);
       batch_pipeline.push(irf);
-    } else if (request_manager_status == SSM_SPEC) {
+      runtime->end_trace(ctx, 12345 /*trace_id*/);
+    } else if ((request_manager_status == PREFILLING and
+                prefill_model == SSM) or
+               request_manager_status == SSM_SPEC) {
+      runtime->begin_trace(ctx, 23456 /*trace_id*/);
+      InferenceResultFuture next_ir = batch_pipeline.back();
+      BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
       InferenceResultFuture irf = fm.get_future(0);
       batch_pipeline.push(irf);
+      runtime->end_trace(ctx, 23456 /*trace_id*/);
     } else {
       assert(false && "Invalid request manager status");
     }
-    runtime->end_trace(ctx, 12346 /*trace_id*/);
+    // runtime->begin_trace(ctx, 12345 /*trace_id*/);
+    // InferenceResultFuture next_ir = batch_pipeline.back();
+    // BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
+    // FutureMap fm;
+    // if (request_manager_status == PREFILLING) {
+    //   if (prefill_model == LLM) {
+    //     fm = im->inference(llm, 0, bcf);
+    //   } else if (prefill_model == SSM) {
+    //     fm = im->inference(get_ssm_model(0), 0, bcf);
+    //   } else {
+    //     assert(false && "Invalid prefill model");
+    //   }
+    // } else if (request_manager_status == LLM_VERIFY) {
+    //   fm = im->inference(llm, 0, bcf);
+    // } else if (request_manager_status == SSM_SPEC) {
+    //   fm = im->inference(get_ssm_model(0), 0, bcf);
+    // } else {
+    //   assert(false && "Invalid request manager status");
+    // }
+    // std::cout << "after inference" << std::endl;
+    // assert(fm.get_future_map_domain().get_volume() == 1);
+    // InferenceResultFuture irf = fm.get_future(0);
+    // batch_pipeline.push(irf);
+    // runtime->end_trace(ctx, 12345 /*trace_id*/);
   }
 }
 
@@ -1627,8 +1676,8 @@ void RequestManager::add_root_to_spec_token_tree(
   Request &request = all_requests[guid];
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();
-  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, -1, 0.0);
-  speculative_token_tree.tree_layers[0].push_back(node_ptr);
+  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
+  speculative_token_tree.tree_layers.front().push_back(node_ptr);
   speculative_token_tree.tree_size++;
   speculative_token_tree.tree_size_including_pruned++;
 }
@@ -1730,9 +1779,7 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
     auto node_ptr = std::make_shared<TokenTreeNode>(
         token_id, parent_pos, log_accumulated_prob);
     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
-    request.speculative_token_trees[0]
-        .tree_layers[current_speculation_step]
-        .push_back(node_ptr);
+    speculative_token_tree.tree_layers.back().push_back(node_ptr);
     speculative_token_tree.tree_size++;
     speculative_token_tree.tree_size_including_pruned++;
   }
@@ -1748,8 +1795,7 @@ void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
     // There are no tokens in the last layer
     return;
   }
-  auto &last_layer =
-      request.speculative_token_trees[0].tree_layers[current_speculation_step];
+  auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
   for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
     if ((*it)->pruned) {
       last_layer.erase(it);
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index 2dc74b018..c3e3dcdf0 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -91,35 +91,29 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
-    TreeSearchBatchConfig const *tree_search_batch_config =
-        static_cast<TreeSearchBatchConfig const *>(batch_config);
-
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
-                             &(tree_search_batch_config->causalMask),
+                             &(batch_config->causalMask),
                              sizeof(BatchConfig::causalMask),
                              hipMemcpyHostToDevice,
                              stream));
 
     total_copy_size += sizeof(BatchConfig::causalMask);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
-    TreeVerifyBatchConfig const *tree_batch_config =
-        static_cast<TreeVerifyBatchConfig const *>(batch_config);
-
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
-                             &(tree_batch_config->causalMask),
+                             &(batch_config->causalMask),
                              sizeof(BatchConfig::causalMask),
                              hipMemcpyHostToDevice,
                              stream));
     total_copy_size += sizeof(BatchConfig::causalMask);
     checkCUDA(hipMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                  total_copy_size,
-                             &(tree_batch_config->committed_tokens),
-                             sizeof(TreeVerifyBatchConfig::committed_tokens),
+                             &(batch_config->committed_tokens),
+                             sizeof(BatchConfig::committed_tokens),
                              hipMemcpyHostToDevice,
                              stream));
-    total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens);
+    total_copy_size += sizeof(BatchConfig::committed_tokens);
   }
 
   // add a size check
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 2f91c89bf..67f2c8713 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -106,34 +106,28 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
-    TreeSearchBatchConfig const *beam_batch_config =
-        static_cast<TreeSearchBatchConfig const *>(batch_config);
-
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(beam_batch_config->causalMask),
+        &(batch_config->causalMask),
         sizeof(BatchConfig::causalMask),
         cudaMemcpyHostToDevice,
         stream));
     total_copy_size += sizeof(BatchConfig::causalMask);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
-    TreeVerifyBatchConfig const *tree_batch_config =
-        static_cast<TreeVerifyBatchConfig const *>(batch_config);
-
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(tree_batch_config->causalMask),
+        &(batch_config->causalMask),
         sizeof(BatchConfig::causalMask),
         cudaMemcpyHostToDevice,
         stream));
     total_copy_size += sizeof(BatchConfig::causalMask);
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(tree_batch_config->committed_tokens),
-        sizeof(TreeVerifyBatchConfig::committed_tokens),
+        &(batch_config->committed_tokens),
+        sizeof(BatchConfig::committed_tokens),
         cudaMemcpyHostToDevice,
         stream));
-    total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens);
+    total_copy_size += sizeof(BatchConfig::committed_tokens);
   }
 
   // add a size check
diff --git a/src/runtime/tree_search_batch_config.cc b/src/runtime/tree_search_batch_config.cc
deleted file mode 100644
index fcc1d3a0c..000000000
--- a/src/runtime/tree_search_batch_config.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2023 CMU, Stanford, Facebook, LANL
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/batch_config.h"
-#include "flexflow/request_manager.h"
-#include "legion.h"
-#include <cassert>
-#include <climits>
-
-#define DEFAULT_BEAM_WIDTH 1
-#define DEFAULT_TARGET_ITERATIONS 3
-
-namespace FlexFlow {
-
-LegionRuntime::Logger::Category log_tree_search_bc("TreeSearchBatchConfig");
-
-TreeSearchBatchConfig::TreeSearchBatchConfig() : BatchConfig() {}
-
-TreeSearchBatchConfig::TreeSearchBatchConfig(int model_id)
-    : BatchConfig(), model_id(model_id) {
-  std::cout << "==================\n"
-            << "Register Batch Config with Model " << this->model_id
-            << std::endl;
-}
-
-/* Why do we need this? */
-TreeSearchBatchConfig::TreeSearchBatchConfig(TreeSearchBatchConfig const &other,
-                                             int model_id)
-    : BatchConfig(), model_id(model_id) {}
-
-TreeSearchBatchConfig::~TreeSearchBatchConfig() {}
-
-InferenceMode TreeSearchBatchConfig::get_mode() const {
-  return TREE_SEARCH_MODE;
-}
-
-std::ostream &
-    operator<<(std::ostream &os,
-               TreeSearchBatchConfig const &tree_search_batch_config) {
-  os << "@@@@@@@@@@@@@@ TreeSearchBatchConfig (mode "
-     << tree_search_batch_config.get_mode() << ") @@@@@@@@@@@@@@" << std::endl;
-  // Max values
-  os << "Max number of requests: "
-     << tree_search_batch_config.max_requests_per_batch() << std::endl;
-  os << "Max number of tokens: "
-     << tree_search_batch_config.max_tokens_per_batch() << std::endl;
-  os << "Max sequence length: "
-     << tree_search_batch_config.max_sequence_length() << std::endl;
-  // Current values
-  os << "Number of tokens: " << tree_search_batch_config.num_active_tokens()
-     << std::endl;
-  os << "Number of requests: " << tree_search_batch_config.num_active_requests()
-     << std::endl;
-  // Tree Search-specific
-  os << "Model ID: " << tree_search_batch_config.model_id << std::endl;
-  os << "Max tree depth: " << TreeSearchBatchConfig::MAX_TREE_DEPTH
-     << std::endl;
-  os << "Max num branch: "
-     << TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES << std::endl;
-
-  os << "Per-request info:\n";
-  for (int i = 0; i < tree_search_batch_config.max_requests_per_batch(); i++) {
-    if (!tree_search_batch_config.request_available[i]) {
-      os << "  Request " << i << ":\n";
-      os << "    First token depth in request: "
-         << tree_search_batch_config.requestsInfo[i]
-                .first_token_index_in_request
-         << std::endl;
-      os << "    First token offset in batch: "
-         << tree_search_batch_config.requestsInfo[i].first_token_offset_in_batch
-         << std::endl;
-      os << "    Number of tokens in batch: "
-         << tree_search_batch_config.requestsInfo[i].num_tokens_in_batch
-         << std::endl;
-      os << "    Request available: "
-         << tree_search_batch_config.request_available[i] << std::endl;
-    }
-  }
-
-  os << "Per-token info:\n";
-  for (int i = 0; i < tree_search_batch_config.num_tokens; i++) {
-    os << "  Token " << i << ":\n";
-    os << "    Absolute depth in request: "
-       << tree_search_batch_config.tokensInfo[i].abs_index_in_request
-       << std::endl;
-    os << "    Request index: "
-       << tree_search_batch_config.tokensInfo[i].request_index << std::endl;
-    os << "    Token id: " << tree_search_batch_config.tokensInfo[i].token_id
-       << std::endl;
-  }
-  os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl;
-  return os;
-}
-
-void TreeSearchBatchConfig::print() const {
-  std::cout << *this << std::endl;
-}
-
-void TreeSearchBatchConfig::save_to_file(std::string const &filename) const {
-  std::ofstream outputFile(filename);
-  if (outputFile.is_open()) {
-    outputFile << *this << std::endl;
-    outputFile.close();
-  } else {
-    std::cerr << "Error: Unable to open the batch config output file: "
-              << filename << std::endl;
-    assert(false);
-  }
-}
-
-}; // namespace FlexFlow
diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc
deleted file mode 100644
index 4b5fbcb63..000000000
--- a/src/runtime/tree_verify_batch_config.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2023 CMU, Stanford, Facebook, LANL
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/batch_config.h"
-#include "flexflow/request_manager.h"
-#include "legion.h"
-#include <cassert>
-#include <climits>
-
-namespace FlexFlow {
-
-LegionRuntime::Logger::Category log_tree_verify_bc("TreeVerifyBatchConfig");
-
-TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {}
-
-TreeVerifyBatchConfig::~TreeVerifyBatchConfig() {}
-
-InferenceMode TreeVerifyBatchConfig::get_mode() const {
-  return TREE_VERIFY_MODE;
-}
-
-std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) {
-  os << "@@@@@@@@@@@@@@ TreeVerifyBatchConfig (mode " << bc.get_mode()
-     << ") @@@@@@@@@@@@@@" << std::endl;
-  // Max values
-  os << "Max number of requests: " << bc.max_requests_per_batch() << std::endl;
-  os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl;
-  os << "Max sequence length: " << bc.max_sequence_length() << std::endl;
-  // Current values
-  os << "Number of tokens: " << bc.num_active_tokens() << std::endl;
-  os << "Number of requests: " << bc.num_active_requests() << std::endl;
-  os << "Number of tokens to commit: " << bc.num_tokens_to_commit << std::endl;
-
-  os << "Per-request info:\n";
-  for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-    if (!bc.request_available[i]) {
-      os << "  Request " << i << ":\n";
-      os << "    First token depth in request: "
-         << bc.requestsInfo[i].first_token_index_in_request << std::endl;
-      os << "    First token offset in batch: "
-         << bc.requestsInfo[i].first_token_offset_in_batch << std::endl;
-      os << "    Number of tokens in batch: "
-         << bc.requestsInfo[i].num_tokens_in_batch << std::endl;
-      os << "    Request available: " << bc.request_available[i] << std::endl;
-    }
-  }
-
-  os << "Per-token info:\n";
-  for (int i = 0; i < bc.num_tokens; i++) {
-    os << "  Token " << i << ":\n";
-    os << "    Absolute depth in request: "
-       << bc.tokensInfo[i].abs_index_in_request << std::endl;
-    os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
-    os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
-  }
-
-  os << "Tokens to commit info:\n";
-  for (int i = 0; i < bc.num_tokens_to_commit; i++) {
-    os << "  Token " << i << ":\n";
-    os << "    token_index: " << bc.committed_tokens[i].token_index
-       << std::endl;
-    os << "    request_index: " << bc.committed_tokens[i].request_index
-       << std::endl;
-    os << "    token_depth: " << bc.committed_tokens[i].token_depth
-       << std::endl;
-  }
-
-  os << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << std::endl;
-  return os;
-}
-
-void TreeVerifyBatchConfig::print() const {
-  std::cout << *this << std::endl;
-}
-
-void TreeVerifyBatchConfig::save_to_file(std::string const &filename) const {
-  std::ofstream outputFile(filename);
-  if (outputFile.is_open()) {
-    outputFile << *this << std::endl;
-    outputFile.close();
-  } else {
-    std::cerr << "Error: Unable to open the batch config output file: "
-              << filename << std::endl;
-    assert(false);
-  }
-}
-
-}; // namespace FlexFlow

From b542054bd55518d7da38a012d6a90cec394c447b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 7 May 2024 16:17:00 -0400
Subject: [PATCH 210/667] Fixed a bug.

---
 src/runtime/request_manager.cc | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 26beb52d8..eac48f643 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1097,11 +1097,8 @@ bool RequestManager::update_ssm_inference_results(
       // This means that the parent layer is empty
       continue;
     } else {
-      auto parent_layer_iter =
-          token_tree.tree_layers.end(); // The iterator after the last element
-      --parent_layer_iter;
       std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-          *parent_layer_iter;
+          token_tree.tree_layers.back();
       int parent_pos = 0;
       //   for (auto &parent_it = parent_tree_layer.begin();
       //        parent_it != parent_tree_layer.end();
@@ -1118,14 +1115,24 @@ bool RequestManager::update_ssm_inference_results(
           // Parent token is not pruned
           for (int child_idx = 0; child_idx < num_branches; child_idx++) {
             float parent_log_prob = parent_ptr->log_accumulated_prob;
+            std::cout << "Probability: "
+                      << ssm_inference_result.probs[result_index] << std::endl;
+            std::cout << "Log Probability: "
+                      << log(ssm_inference_result.probs[result_index])
+                      << std::endl;
+            assert(parent_log_prob != -std::numeric_limits<float>::infinity() &&
+                   "Parent log probability should not be -inf.");
+            assert(log(ssm_inference_result.probs[result_index]) !=
+                       -std::numeric_limits<float>::infinity() &&
+                   "Child log probability should not be -inf.");
             token_added_to_spec_tree =
                 token_added_to_spec_tree ||
                 add_token_to_spec_token_tree(
                     guid,
                     ssm_inference_result.token_ids[result_index],
+                    parent_pos,
                     log(ssm_inference_result.probs[result_index]) +
-                        parent_log_prob,
-                    parent_pos);
+                        parent_log_prob);
             result_index++;
           }
         }
@@ -1689,6 +1696,12 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   // This method assumes only one small model is used for speculation
   // This method is called by update_ssm_inference_results()
 
+  if (verbose) {
+    std::cout << "add_token_to_spec_token_tree: guid=" << guid
+              << " token_id=" << token_id << " parent_pos=" << parent_pos
+              << " log_accumulated_prob=" << log_accumulated_prob << std::endl;
+  }
+
   // This is called after the first small model inference
   assert(current_speculation_step >= 1 &&
          "The current speculation step should be no less than 1");
@@ -1777,7 +1790,7 @@ bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
   if (add_new_node) {
     // Add the new node to the pool and the last layer of the speculation tree
     auto node_ptr = std::make_shared<TokenTreeNode>(
-        token_id, parent_pos, log_accumulated_prob);
+        token_id, log_accumulated_prob, parent_pos);
     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
     speculative_token_tree.tree_layers.back().push_back(node_ptr);
     speculative_token_tree.tree_size++;

From 8876cb7cd498af25bdc94e20ac64ee91297440d4 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 7 May 2024 17:40:39 -0400
Subject: [PATCH 211/667] Fixed bug in arg_topk kernel.

---
 src/ops/arg_topk.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 93444becc..491b255be 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -333,6 +333,7 @@ __device__ void mergeShards(int num_shards,
     // top_k_values[last_k] = max_element.value;
     int shard_index = max_element.index;
     top_k_indices[last_k] = entries[shard_index].index;
+    top_k_values[last_k] = static_cast<float>(max_element.value);
   }
 }
 

From a288b43fbd6912fa1a8dab0f7e6fd4246819abe5 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Wed, 8 May 2024 13:35:51 -0400
Subject: [PATCH 212/667] chore: add GumbelTopk entrances

---
 include/flexflow/ffconst.h             | 1 +
 include/flexflow/operator_params.h     | 2 ++
 include/flexflow/substitution_loader.h | 1 +
 src/runtime/ffconst_utils.cc           | 2 ++
 src/runtime/graph.cc                   | 4 ++++
 src/runtime/model.cc                   | 5 +++++
 src/runtime/operator_params.cc         | 3 +++
 7 files changed, 18 insertions(+)

diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index b24adc080..f713e4592 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -137,6 +137,7 @@ enum OperatorType {
   OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape
   OP_SIZE,  // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size
   OP_TOPK,  // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK
+  OP_GUMBEL_TOPK,
   OP_ARG_TOPK,
   OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where
   OP_CEIL,  // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil
diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h
index e87408438..766d4a582 100644
--- a/include/flexflow/operator_params.h
+++ b/include/flexflow/operator_params.h
@@ -20,6 +20,7 @@
 #include "flexflow/ops/flat_params.h"
 #include "flexflow/ops/gather_params.h"
 #include "flexflow/ops/groupby_params.h"
+#include "flexflow/ops/gumbel_topk_params.h"
 #include "flexflow/ops/inc_multihead_self_attention_params.h"
 #include "flexflow/ops/layer_norm_params.h"
 #include "flexflow/ops/linear_params.h"
@@ -79,6 +80,7 @@ using OperatorParameters = mp::variant<AggregateParams,
                                        ReshapeParams,
                                        SplitParams,
                                        TopKParams,
+                                       GumbelTopKParams,
                                        ArgTopKParams,
                                        SamplingParams,
                                        ArgMaxParams,
diff --git a/include/flexflow/substitution_loader.h b/include/flexflow/substitution_loader.h
index e0c252ffd..e7367c5bb 100644
--- a/include/flexflow/substitution_loader.h
+++ b/include/flexflow/substitution_loader.h
@@ -103,6 +103,7 @@ NLOHMANN_JSON_SERIALIZE_ENUM(
      {OP_SHAPE, "OP_SHAPE"},
      {OP_SIZE, "OP_SIZE"},
      {OP_TOPK, "OP_TOPK"},
+     {OP_GUMBEL_TOPK, "OP_GUMBEL_TOPK"},
      {OP_WHERE, "OP_WHERE"},
      {OP_CEIL, "OP_CEIL"},
      {OP_CAST, "OP_CAST"},
diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc
index b274e547c..36c68c836 100644
--- a/src/runtime/ffconst_utils.cc
+++ b/src/runtime/ffconst_utils.cc
@@ -114,6 +114,8 @@ std::string get_operator_type_name(OperatorType type) {
       return "Size";
     case OP_TOPK:
       return "TopK";
+    case OP_GUMBEL_TOPK:
+      return "GumbelTopK";
     case OP_ARG_TOPK:
       return "ArgTopK";
     // case OP_BEAM_TOPK:
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 11a9bf363..64a577132 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2970,6 +2970,10 @@ void FFModel::deserialize_graph_optimal_view(
         node = TopK::deserialize(*this, dez, inputs, num_inputs);
         break;
       }
+      case OP_GUMBEL_TOPK: {
+        node = GumbelTopK::deserialize(*this, dez, inputs, num_inputs);
+        break;
+      }
       case OP_ARG_TOPK: {
         node = ArgTopK::deserialize(*this, dez, inputs, num_inputs);
         break;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 29af1fffa..6b508709c 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -3241,6 +3241,11 @@ Op *FFModel::create_operator_from_layer(
       operators.push_back(op);
       return op;
     }
+    case OP_GUMBEL_TOPK: {
+      Op *op = GumbelTopK::create_operator_from_layer(*this, layer, inputs);
+      operators.push_back(op);
+      return op;
+    }
     case OP_ARG_TOPK: {
       Op *op = ArgTopK::create_operator_from_layer(*this, layer, inputs);
       operators.push_back(op);
diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc
index 442d09254..33e945774 100644
--- a/src/runtime/operator_params.cc
+++ b/src/runtime/operator_params.cc
@@ -19,6 +19,7 @@
 #include "flexflow/ops/flat.h"
 #include "flexflow/ops/gather.h"
 #include "flexflow/ops/groupby.h"
+#include "flexflow/ops/gumbel_topk.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/layer_norm.h"
 #include "flexflow/ops/linear.h"
@@ -129,6 +130,8 @@ tl::optional<OperatorParameters> get_op_parameters(Op const *op) {
       return ((Split *)op)->get_params();
     case OP_TOPK:
       return ((TopK *)op)->get_params();
+    case OP_GUMBEL_TOPK:
+      return ((GumbelTopK *)op)->get_params();
     case OP_GROUP_BY:
       return ((Group_by *)op)->get_params();
     case OP_AGGREGATE:

From a85c7a903e1a8db70ac1d1779a2ae7fb5d49618a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 8 May 2024 19:11:31 -0400
Subject: [PATCH 213/667] Changed max_sequence_length for debug.

---
 inference/spec_infer/spec_infer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 06a5b1d36..7df39e94c 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -276,7 +276,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool verbose = false;
   int max_requests_per_batch = 16;
   int max_tokens_per_batch = 256;
-  int max_sequence_length = 1024;
+  int max_sequence_length = 256;
   int max_spec_tree_token_num = 23;
   int expansion_degree = 3;
   RequestManager::DecodingMode decoding_mode =

From 7f67a777cb53ec3285640a2b5c5698430632d856 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 8 May 2024 19:11:58 -0400
Subject: [PATCH 214/667] 1. Re-implemented the logic of adding tokens to the
 speculative token tree. Combined add_token_to_spec_token_tree and
 prune_last_layer_of_spec_token_trees into add_tokens_to_spec_token_tree. 2.
 Removed field tree_size_including_pruned from TokenTree because we no longer
 need it. 3. Added token commitment during LLM prefilling. Each prefilling
 batch commits the tokens in the last batch. 4. Added waits to make
 serve_spec_infer synchronous. 5. Fixed other minor bugs.

---
 include/flexflow/request_manager.h |  26 +-
 src/runtime/request_manager.cc     | 665 ++++++++++++++++++-----------
 2 files changed, 443 insertions(+), 248 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 6d65b0cde..80ce1ca9f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -136,7 +136,15 @@ class TokenTreeNode {
         parent_pos(parent_pos) {}
 };
 
-// A comparator for shared_ptr<TokenTreeNode>
+// A comparator for std::shared_ptr<TokenTreeNode>
+struct CompareSharedTokenTreeNodePtr {
+  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
+                  std::shared_ptr<TokenTreeNode> const &rhs) const {
+    return lhs->log_accumulated_prob < rhs->log_accumulated_prob;
+  }
+};
+
+// A comparator for std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
 struct CompareSharedTokenTreeNodePtrRequestGuidPair {
   bool operator()(std::pair<std::shared_ptr<TokenTreeNode>,
                             BatchConfig::RequestGuid> const &lhs,
@@ -152,7 +160,6 @@ class TokenTree {
   // The numebr of tokens in the tree that are not pruned
   int tree_size = 0;
   // The numebr of tokens in the tree including the pruned ones
-  int tree_size_including_pruned = 0;
 
   void add_layer() {
     tree_layers.emplace_back();
@@ -161,10 +168,9 @@ class TokenTree {
   void clear() {
     tree_layers.clear();
     tree_size = 0;
-    tree_size_including_pruned = 0;
   }
 
-  TokenTree() : tree_size(0), tree_size_including_pruned(0) {}
+  TokenTree() : tree_size(0) {}
 };
 
 class RequestManager {
@@ -372,11 +378,13 @@ class RequestManager {
   void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
                                    BatchConfig::TokenId token_id);
-  bool add_token_to_spec_token_tree(RequestGuid guid,
-                                    BatchConfig::TokenId token_id,
-                                    int parent_pos,
-                                    float log_accumulated_prob);
-  void prune_last_layer_of_spec_token_tree(RequestGuid guid);
+  bool add_tokens_to_spec_token_tree(
+      InferenceResult const &ssm_inference_result);
+  //   bool add_token_to_spec_token_tree(RequestGuid guid,
+  //                                     BatchConfig::TokenId token_id,
+  //                                     int parent_pos,
+  //                                     float log_accumulated_prob);
+  //   bool prune_last_layer_of_spec_token_trees();
   /* ---------- Spec Decoding Helper Functions ---------- */
 };
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index eac48f643..9e3cb3304 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -514,7 +514,19 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
+  int committed_token_offset = prefill_request->llm_cache_size;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
+
+  if (decoding_mode == SPECULATIVE_DECODING) {
+    // Add the committed tokens to the token tree
+    for (int i = 0; i < prefill_request->num_tokens_in_batch; i++) {
+      prefill_request->committed_tokens.push_back(Request::CommittedToken{
+          i,
+          committed_token_offset + i,
+          prefill_request->tokens[i + committed_token_offset]});
+    }
+  }
+
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
@@ -652,6 +664,16 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     bc.num_tokens++;
   }
 
+  // Committed tokens
+  for (auto const &committed_token : prefill_request->committed_tokens) {
+    bc.committed_tokens[bc.num_tokens_to_commit].token_index =
+        committed_token.from_index;
+    bc.committed_tokens[bc.num_tokens_to_commit].request_index = request_index;
+    bc.committed_tokens[bc.num_tokens_to_commit].token_depth =
+        committed_token.to_index;
+    bc.num_tokens_to_commit++;
+  }
+
   return bc;
 }
 
@@ -793,16 +815,29 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
     new_bc.requestsInfo[request_index].first_token_index_in_request =
-        request.tokens.size() - committed_tokens.size();
+        request.ssm_cache_size;
+    // We don't directly use committed_tokens.size() here because there is a
+    // case where committed_tokens.size() != request.tokens.size() -
+    // request.ssm_cache_size, that's when the LLM prefilling is just finished
     new_bc.requestsInfo[request_index].num_tokens_in_batch =
-        committed_tokens.size();
+        request.tokens.size() - request.ssm_cache_size;
+
+    request.first_token_offset_in_batch = new_bc.num_tokens;
+    request.num_tokens_in_batch =
+        request.tokens.size() - request.ssm_cache_size;
 
     // Store committed tokens to tokensInfo
-    for (auto const &committed_token : committed_tokens) {
+    int start_offset = committed_tokens.size() - request.tokens.size() +
+                       request.ssm_cache_size;
+    assert(start_offset >= 0 && "Invalid start offset.");
+    for (int committed_token_index = start_offset;
+         committed_token_index < committed_tokens.size();
+         committed_token_index++) {
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
       new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-          committed_token.to_index;
-      new_bc.tokensInfo[new_bc.num_tokens].token_id = committed_token.token_id;
+          committed_tokens[committed_token_index].to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].token_id =
+          committed_tokens[committed_token_index].token_id;
       new_bc.num_tokens++;
     }
 
@@ -856,7 +891,9 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() + token_tree.tree_size_including_pruned;
+          request.causal_mask.non_tree_cache_size +
+          request.causal_mask.tree_or_prompt_size -
+          request.causal_mask.current_layer_size;
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
@@ -864,10 +901,11 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
       // Exclude the current layer from the token tree, because we want the
       // start index
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() - 1 + token_tree.tree_size_including_pruned -
-          current_layer.size();
+          request.causal_mask.non_tree_cache_size +
+          request.causal_mask.tree_or_prompt_size -
+          request.causal_mask.current_layer_size;
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
-          current_layer.size();
+          request.causal_mask.current_layer_size;
 
       int child_index = 0;
       for (auto const &node_ptr : current_layer) {
@@ -941,8 +979,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
         request.tokens.size() - 1; // Exclude the last token
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
-    new_bc.requestsInfo[request_index].num_tokens_in_batch =
-        request.speculative_token_trees[0].tree_size;
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
 
     // Put the information of the committed tokens into
     // BatchConfig.committed_tokens.
@@ -973,7 +1010,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
         if (tree_node->pruned == false) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              request.tokens.size() + token_tree_index;
+              request.tokens.size() - 1 + token_tree_index;
           new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
           new_bc.num_tokens++;
           token_tree_index++;
@@ -981,6 +1018,10 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       }
     }
     assert(token_tree_index == token_tree.tree_size);
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
+
+    request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
+    request.num_tokens_in_batch = token_tree_index;
 
     // Create the causal mask for the large model based on the small model
     // causal mask.
@@ -1058,6 +1099,12 @@ bool RequestManager::update_llm_verify_results(
     }
   }
 
+  // Clear the token tree node pool
+  token_tree_node_pool = std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
+      CompareSharedTokenTreeNodePtrRequestGuidPair>();
+
   // Some requests may be completed after appending the verified tokens.
   // If there is a request completed, return true.
   return request_completed;
@@ -1073,11 +1120,13 @@ bool RequestManager::update_ssm_inference_results(
 
   int num_branches = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
-  bool token_added_to_spec_tree = false;
 
   // Here we assume that the order of the tokens in the last
   // BatchConfig and hence the last InferenceResult is equal to
   // the order of the request in the last BatchConfig
+  bool all_request_last_layer_empty =
+      add_tokens_to_spec_token_tree(ssm_inference_result);
+
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1089,58 +1138,60 @@ bool RequestManager::update_ssm_inference_results(
     assert(request.status == Request::RUNNING);
 
     if (current_speculation_step == 1) {
-      request.ssm_cache_size += request.committed_tokens.size() - 1;
+      request.ssm_cache_size = request.tokens.size();
     }
 
-    TokenTree &token_tree = request.speculative_token_trees[0];
-    if (token_tree.tree_layers.size() < current_speculation_step) {
-      // This means that the parent layer is empty
-      continue;
-    } else {
-      std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-          token_tree.tree_layers.back();
-      int parent_pos = 0;
-      //   for (auto &parent_it = parent_tree_layer.begin();
-      //        parent_it != parent_tree_layer.end();
-      //        parent_it++) {
-      for (auto parent_ptr : parent_tree_layer) {
-        if (parent_ptr->pruned) {
-          // Parent token is pruned, we have to skip all its children
-          // Because no token is pruned in the last layer during the small
-          // model inference, the reason why some parents are pruned is that
-          // adding tokens to the new layer of the tree may result in some
-          // node being pruned in internal layers.
-          result_index += num_branches;
-        } else {
-          // Parent token is not pruned
-          for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-            float parent_log_prob = parent_ptr->log_accumulated_prob;
-            std::cout << "Probability: "
-                      << ssm_inference_result.probs[result_index] << std::endl;
-            std::cout << "Log Probability: "
-                      << log(ssm_inference_result.probs[result_index])
-                      << std::endl;
-            assert(parent_log_prob != -std::numeric_limits<float>::infinity() &&
-                   "Parent log probability should not be -inf.");
-            assert(log(ssm_inference_result.probs[result_index]) !=
-                       -std::numeric_limits<float>::infinity() &&
-                   "Child log probability should not be -inf.");
-            token_added_to_spec_tree =
-                token_added_to_spec_tree ||
-                add_token_to_spec_token_tree(
-                    guid,
-                    ssm_inference_result.token_ids[result_index],
-                    parent_pos,
-                    log(ssm_inference_result.probs[result_index]) +
-                        parent_log_prob);
-            result_index++;
-          }
-        }
-        parent_pos++;
-      }
-    }
+    // TokenTree &token_tree = request.speculative_token_trees[0];
+    // if (token_tree.tree_layers.size() < current_speculation_step) {
+    //   // This means that the parent layer is empty
+    //   continue;
+    // } else {
+    //   std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
+    //       token_tree.tree_layers.back();
+    //   int parent_pos = 0;
+    //   //   for (auto &parent_it = parent_tree_layer.begin();
+    //   //        parent_it != parent_tree_layer.end();
+    //   //        parent_it++) {
+    //   for (auto parent_ptr : parent_tree_layer) {
+    //     if (parent_ptr->pruned) {
+    //       // Parent token is pruned, we have to skip all its children
+    //       // Because no token is pruned in the last layer during the small
+    //       // model inference, the reason why some parents are pruned is that
+    //       // adding tokens to the new layer of the tree may result in some
+    //       // node being pruned in internal layers.
+    //       result_index += num_branches;
+    //     } else {
+    //       // Parent token is not pruned
+    //       for (int child_idx = 0; child_idx < num_branches; child_idx++) {
+    //         float parent_log_prob = parent_ptr->log_accumulated_prob;
+    //         std::cout << "Probability: "
+    //                   << ssm_inference_result.probs[result_index] <<
+    //                   std::endl;
+    //         std::cout << "Log Probability: "
+    //                   << log(ssm_inference_result.probs[result_index])
+    //                   << std::endl;
+    //         assert(parent_log_prob != -std::numeric_limits<float>::infinity()
+    //         &&
+    //                "Parent log probability should not be -inf.");
+    //         assert(log(ssm_inference_result.probs[result_index]) !=
+    //                    -std::numeric_limits<float>::infinity() &&
+    //                "Child log probability should not be -inf.");
+    //         add_token_to_spec_token_tree(
+    //             guid,
+    //             ssm_inference_result.token_ids[result_index],
+    //             parent_pos,
+    //             log(ssm_inference_result.probs[result_index]) +
+    //                 parent_log_prob);
+    //         result_index++;
+    //       }
+    //     }
+    //     parent_pos++;
+    //   }
+    // }
 
-    prune_last_layer_of_spec_token_tree(guid);
+    // bool last_layer_empty = prune_last_layer_of_spec_token_tree(guid);
+    // all_request_last_layer_empty =
+    //     all_request_last_layer_empty && last_layer_empty;
 
     if (current_speculation_step == 1) {
       init_bitmask_spec(guid);
@@ -1149,7 +1200,7 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  return !token_added_to_spec_tree ||
+  return all_request_last_layer_empty ||
          current_speculation_step > BatchConfig::MAX_TREE_DEPTH;
 }
 
@@ -1314,42 +1365,38 @@ void RequestManager::get_verify_results_greedy(
     TokenTree &token_tree = request.speculative_token_trees[0];
     // First add the root to the committed tokens
     request.committed_tokens.push_back(Request::CommittedToken(
-        llm_result_offset,
-        committed_token_index,
-        llm_verify_result.token_ids[llm_result_offset]));
+        llm_result_offset, committed_token_index, request.tokens.back()));
     committed_token_index++;
     // Don't add it to request.tokens because it has already been added.
 
     // The position of the last accepted token in its tree layer (includeing
     // the pruned tokens)
-    int last_accepted_token_layer_index = 0;
+    int last_accepted_token_index_in_layer = 0;
     // The index of the last accepted token in the entire tree (excluding the
     // pruned tokens)
     int last_accepted_token_index = 0;
 
     int current_token_index = 1; // Because we skip the root
-    int num_layers = token_tree.tree_layers.size();
     auto layer_it = token_tree.tree_layers.begin();
     ++layer_it;
-    // for (int layer_index = 1; layer_index < num_layers; layer_index++) {
     for (; layer_it != token_tree.tree_layers.end(); layer_it++) {
       // We skip the first layer
       std::list<std::shared_ptr<TokenTreeNode>> &tree_layer = *layer_it;
 
       bool token_accepted_this_layer = false;
-      int current_token_layer_index = 0;
+      int current_token_index_in_layer = 0;
 
       for (auto const &node_ptr : tree_layer) {
         if (node_ptr->pruned) {
-          current_token_layer_index++;
+          current_token_index_in_layer++;
           continue;
         }
-        if ((node_ptr->parent_pos != last_accepted_token_layer_index) ||
+        if ((node_ptr->parent_pos != last_accepted_token_index_in_layer) ||
             token_accepted_this_layer) {
           // The token's parent is not accepted, or there is already another
           // token accepted in this layer
           current_token_index++;
-          current_token_layer_index++;
+          current_token_index_in_layer++;
           continue;
         } else {
           // The token's parent is accepted, and no token has been accepted in
@@ -1369,10 +1416,10 @@ void RequestManager::get_verify_results_greedy(
 
             token_accepted_this_layer = true;
             last_accepted_token_index = current_token_index;
-            last_accepted_token_layer_index = current_token_layer_index;
+            last_accepted_token_index_in_layer = current_token_index_in_layer;
             committed_token_index++;
             current_token_index++;
-            current_token_layer_index++;
+            current_token_index_in_layer++;
           }
         }
       }
@@ -1395,7 +1442,17 @@ void RequestManager::get_verify_results_greedy(
         break;
       }
     }
-    llm_result_offset += token_tree.tree_size;
+    llm_result_offset += request.num_tokens_in_batch;
+
+    if (verbose) {
+      std::cout << "Request " << request.guid << " committed tokens: ";
+      for (auto const &committed_token : request.committed_tokens) {
+        std::cout << committed_token.token_id << " ";
+      }
+      std::cout << std::endl;
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Output sequence: " << output << std::endl;
+    }
   }
 }
 
@@ -1555,73 +1612,35 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     InferenceResult ir;
     last_irf = Future::from_value<InferenceResult>(ir);
   }
-  std::queue<InferenceResultFuture> batch_pipeline;
-  { batch_pipeline.push(last_irf); }
+
+  request_manager_status = PREFILLING;
+  prefill_model = SSM;
 
   while (!is_background_server_terminated()) {
+    last_irf.get_void_result();
+    BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
+    bcf.get_void_result();
 
-    if (batch_pipeline.size() >= 4) {
-      // Block here to avoid launching too many batches
-      auto const &ir = batch_pipeline.front();
-      ir.get_void_result();
-    }
-    // deque finished batches
-    while (batch_pipeline.size() > 1) {
-      auto const &ir = batch_pipeline.front();
-      if (ir.is_ready()) {
-        batch_pipeline.pop();
-      } else {
-        break;
-      }
-    }
     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
         request_manager_status == LLM_VERIFY) {
+      std::cout << "Branch 1" << std::endl;
       runtime->begin_trace(ctx, 12345 /*trace_id*/);
-      InferenceResultFuture next_ir = batch_pipeline.back();
-      BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
       FutureMap fm = im->inference(llm, 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
-      InferenceResultFuture irf = fm.get_future(0);
-      batch_pipeline.push(irf);
+      last_irf = fm.get_future(0);
       runtime->end_trace(ctx, 12345 /*trace_id*/);
     } else if ((request_manager_status == PREFILLING and
                 prefill_model == SSM) or
                request_manager_status == SSM_SPEC) {
+      std::cout << "Branch 2" << std::endl;
       runtime->begin_trace(ctx, 23456 /*trace_id*/);
-      InferenceResultFuture next_ir = batch_pipeline.back();
-      BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
       assert(fm.get_future_map_domain().get_volume() == 1);
-      InferenceResultFuture irf = fm.get_future(0);
-      batch_pipeline.push(irf);
+      last_irf = fm.get_future(0);
       runtime->end_trace(ctx, 23456 /*trace_id*/);
     } else {
       assert(false && "Invalid request manager status");
     }
-    // runtime->begin_trace(ctx, 12345 /*trace_id*/);
-    // InferenceResultFuture next_ir = batch_pipeline.back();
-    // BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
-    // FutureMap fm;
-    // if (request_manager_status == PREFILLING) {
-    //   if (prefill_model == LLM) {
-    //     fm = im->inference(llm, 0, bcf);
-    //   } else if (prefill_model == SSM) {
-    //     fm = im->inference(get_ssm_model(0), 0, bcf);
-    //   } else {
-    //     assert(false && "Invalid prefill model");
-    //   }
-    // } else if (request_manager_status == LLM_VERIFY) {
-    //   fm = im->inference(llm, 0, bcf);
-    // } else if (request_manager_status == SSM_SPEC) {
-    //   fm = im->inference(get_ssm_model(0), 0, bcf);
-    // } else {
-    //   assert(false && "Invalid request manager status");
-    // }
-    // std::cout << "after inference" << std::endl;
-    // assert(fm.get_future_map_domain().get_volume() == 1);
-    // InferenceResultFuture irf = fm.get_future(0);
-    // batch_pipeline.push(irf);
-    // runtime->end_trace(ctx, 12345 /*trace_id*/);
   }
 }
 
@@ -1684,138 +1703,306 @@ void RequestManager::add_root_to_spec_token_tree(
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();
   auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
+  token_tree_node_pool.push(std::make_pair(node_ptr, guid));
   speculative_token_tree.tree_layers.front().push_back(node_ptr);
   speculative_token_tree.tree_size++;
-  speculative_token_tree.tree_size_including_pruned++;
 }
 
-bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
-                                                  BatchConfig::TokenId token_id,
-                                                  int parent_pos,
-                                                  float log_accumulated_prob) {
-  // This method assumes only one small model is used for speculation
-  // This method is called by update_ssm_inference_results()
-
-  if (verbose) {
-    std::cout << "add_token_to_spec_token_tree: guid=" << guid
-              << " token_id=" << token_id << " parent_pos=" << parent_pos
-              << " log_accumulated_prob=" << log_accumulated_prob << std::endl;
-  }
-
-  // This is called after the first small model inference
-  assert(current_speculation_step >= 1 &&
-         "The current speculation step should be no less than 1");
-
-  Request &request = all_requests[guid];
-  TokenTree &speculative_token_tree = request.speculative_token_trees[0];
+bool RequestManager::add_tokens_to_spec_token_tree(
+    InferenceResult const &ssm_inference_result) {
 
-  // Make sure there are enough layers in the speculation tree
-  if (speculative_token_tree.tree_layers.size() == current_speculation_step) {
-    // When adding the first token, we need to add a new layer
-    speculative_token_tree.add_layer();
-  } else {
-    // To add a token, the tree depth is either the same as the current
-    // speculation step or one more than the current speculation step.
-    assert(speculative_token_tree.tree_layers.size() ==
-               current_speculation_step + 1 &&
-           "Invalid token tree depth");
-  }
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-  bool remove_min_node = false;
-  bool add_new_node = true;
+    int parent_num = request.num_tokens_in_batch;
+    if (parent_num == 0) {
+      // The request has no committed tokens, we don't need to add tokens to the
+      // token tree
+      continue;
+    }
+    int result_offset = request.first_token_offset_in_batch *
+                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    int result_num = parent_num * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    int current_tree_size = request.causal_mask.tree_or_prompt_size;
+    int empty_slots_on_tree = BatchConfig::MAX_SPEC_TREE_TOKEN_NUM -
+                              current_tree_size; // The number of empty slots
+
+    if (empty_slots_on_tree == 0) {
+      // The token tree is full, we don't need to add tokens to it
+      continue;
+    }
 
-  std::shared_ptr<TokenTreeNode> min_node_ptr = nullptr;
-  RequestGuid min_node_guid = -1;
-  if (token_tree_node_pool.size() > 0) {
-    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
-        min_node_pair_in_pool = token_tree_node_pool.top();
-    min_node_ptr = min_node_pair_in_pool.first;
-    min_node_guid = min_node_pair_in_pool.second;
-  }
+    bool token_pool_full =
+        token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS;
+
+    TokenTree &spec_token_tree = request.speculative_token_trees[0];
+    std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
+        spec_token_tree.tree_layers.back();
+    std::set<std::shared_ptr<TokenTreeNode>, CompareSharedTokenTreeNodePtr>
+        tokens;
+    int parent_pos = 0;
+    for (auto const &parent_ptr : last_layer) {
+      for (int child_pos = 0;
+           child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+           child_pos++) {
+        int result_idx =
+            result_offset +
+            parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES + child_pos;
+        float log_prob = log(ssm_inference_result.probs[result_idx]);
+        float log_accumulated_prob =
+            log_prob + parent_ptr->log_accumulated_prob;
+
+        std::cout << "Probability: " << ssm_inference_result.probs[result_idx]
+                  << std::endl;
+        std::cout << "Log Probability: " << log_prob << std::endl;
+        assert(log_prob != -std::numeric_limits<float>::infinity() &&
+               "Child log probability should not be -inf.");
+
+        if (tokens.size() == empty_slots_on_tree and
+            log_accumulated_prob <= (*tokens.begin())->log_accumulated_prob) {
+          // The token tree is full, and the new token has a lower joint
+          // probability than the minimum node in the pool, we don't need to add
+          // the new token and the following tokens belong to the same parent
+          // to the tree, because the tokens are sorted by their probability
+          break;
+        } else if (token_pool_full and
+                   log_accumulated_prob <=
+                       token_tree_node_pool.top().first->log_accumulated_prob) {
+          // The token tree is not full, but the token pool is full, and the new
+          // token has a lower joint probability than the minimum node in the
+          // pool, we don't need to add the new token and the following tokens
+          // belong to the same parent to the tree, because the tokens are
+          // sorted by their probability
+          break;
+        } else {
+          std::shared_ptr<TokenTreeNode> node_ptr =
+              std::make_shared<TokenTreeNode>(
+                  ssm_inference_result.token_ids[result_idx],
+                  log_accumulated_prob,
+                  parent_pos);
+          if (tokens.size() == empty_slots_on_tree and
+              log_accumulated_prob > (*tokens.begin())->log_accumulated_prob) {
+            // The token tree is full, and the new token has a higher joint
+            // probability than the minimum node in the pool, we need to remove
+            // the minimum node from the pool and add the new token to the tree
+            tokens.erase(tokens.begin());
+          }
+          tokens.insert(node_ptr);
+        }
+      }
+      parent_pos++;
+    }
 
-  // We maintain the size of the token tree node pool to not exceed
-  //  BatchConfig::MAX_NUM_TOKENS
-  if (token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS) {
-    // The pool is full, check if the new node has a higher joint probability
-    // than the minimum node in the pool.
+    // Now add all tokens in the set to the token tree, in descending order of
+    // their joint probability
+    spec_token_tree.add_layer();
+    for (auto token_it = tokens.crbegin(); token_it != tokens.crend();
+         token_it++) {
+      if (token_pool_full and
+          token_tree_node_pool.top().first->log_accumulated_prob >=
+              (*token_it)->log_accumulated_prob) {
+        break;
+      } else if (token_pool_full) {
+        token_tree_node_pool.top().first->pruned = true;
+        token_tree_node_pool.pop();
+      }
 
-    if (log_accumulated_prob < min_node_ptr->log_accumulated_prob) {
-      // Insertion failed
-      add_new_node = false;
-    } else {
-      // Remove the minimum node from the pool, and set its pruned field to
-      // true
-      remove_min_node = true;
+      token_tree_node_pool.push(std::make_pair((*token_it), guid));
+      spec_token_tree.tree_layers.back().push_back((*token_it));
+      spec_token_tree.tree_size++;
     }
-  } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {
-    assert(false && "The size of the token tree node pool should not exceed "
-                    "BatchConfig::MAX_NUM_TOKENS");
   }
-  // Do nothing if the pool is not full
-
-  // The request's token tree size should not exceed
-  // BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
-  // The judgement is done here to avoid the case where the tree is full but a
-  // node is pruned.
-  if (speculative_token_tree.tree_size ==
-      BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
-    if (remove_min_node && guid == min_node_guid) {
-      // The minimum node in the pool is pruned, and it's in the same request
-      // with the new node. Only in this case we can add the new node.
-      // Because remove_min_node is true means that the new node has a higher
-      // joint probability than the minimum node in the pool.
-      add_new_node = true;
-    } else {
-      // Otherwise, we cannot add the new node, and we don't need to expel the
-      // minimum node from the pool.
-      add_new_node = false;
-      remove_min_node = false;
-    }
-  } else if (speculative_token_tree.tree_size >
-             BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
-    assert(false && "The size of the token tree should not exceed "
-                    "BatchConfig::MAX_SPEC_TREE_TOKEN_NUM");
-  }
-
-  assert(!(remove_min_node && !add_new_node) &&
-         "The minimum node should be removed only when the new node is added");
 
-  if (remove_min_node) {
-    // Remove the minimum node from the pool, and set its pruned field to true
-    min_node_ptr->pruned = true;
-    token_tree_node_pool.pop();
-    all_requests[min_node_guid].speculative_token_trees[0].tree_size--;
-  }
+  bool all_request_last_layer_empty = true;
 
-  if (add_new_node) {
-    // Add the new node to the pool and the last layer of the speculation tree
-    auto node_ptr = std::make_shared<TokenTreeNode>(
-        token_id, log_accumulated_prob, parent_pos);
-    token_tree_node_pool.push(std::make_pair(node_ptr, guid));
-    speculative_token_tree.tree_layers.back().push_back(node_ptr);
-    speculative_token_tree.tree_size++;
-    speculative_token_tree.tree_size_including_pruned++;
-  }
-  return add_new_node;
-}
+  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    TokenTree &spec_token_tree = request.speculative_token_trees[0];
 
-void RequestManager::prune_last_layer_of_spec_token_tree(RequestGuid guid) {
-  // This method assumes only one small model is used for speculation
-  Request &request = all_requests[guid];
+    if (spec_token_tree.tree_layers.size() <= current_speculation_step) {
+      // This request has no token added in this layer, skip it
+      continue;
+    }
 
-  if (request.speculative_token_trees[0].tree_layers.size() <=
-      current_speculation_step) {
-    // There are no tokens in the last layer
-    return;
-  }
-  auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
-  for (auto it = last_layer.begin(); it != last_layer.end(); ++it) {
-    if ((*it)->pruned) {
-      last_layer.erase(it);
-      request.speculative_token_trees[0].tree_size--;
-      request.speculative_token_trees[0].tree_size_including_pruned--;
+    std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
+        request.speculative_token_trees[0].tree_layers.back();
+    for (auto it = last_layer.begin(); it != last_layer.end();) {
+      if ((*it)->pruned) {
+        it = last_layer.erase(it);
+        spec_token_tree.tree_size--;
+      } else {
+        ++it;
+      }
     }
+    all_request_last_layer_empty &= last_layer.empty();
   }
+  return all_request_last_layer_empty;
 }
+
+// bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
+//                                                   BatchConfig::TokenId
+//                                                   token_id, int parent_pos,
+//                                                   float log_accumulated_prob)
+//                                                   {
+//   // This method assumes only one small model is used for speculation
+//   // This method is called by update_ssm_inference_results()
+
+//   if (verbose) {
+//     std::cout << "add_token_to_spec_token_tree: guid=" << guid
+//               << " token_id=" << token_id << " parent_pos=" << parent_pos
+//               << " log_accumulated_prob=" << log_accumulated_prob <<
+//               std::endl;
+//   }
+
+//   // This is called after the first small model inference
+//   assert(current_speculation_step >= 1 &&
+//          "The current speculation step should be no less than 1");
+
+//   Request &request = all_requests[guid];
+//   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
+
+//   // Make sure there are enough layers in the speculation tree
+//   if (speculative_token_tree.tree_layers.size() == current_speculation_step)
+//   {
+//     // When adding the first token, we need to add a new layer
+//     speculative_token_tree.add_layer();
+//   } else {
+//     // To add a token, the tree depth is either the same as the current
+//     // speculation step or one more than the current speculation step.
+//     assert(speculative_token_tree.tree_layers.size() ==
+//                current_speculation_step + 1 &&
+//            "Invalid token tree depth");
+//   }
+
+//   bool remove_min_node = false;
+//   bool add_new_node = true;
+
+//   std::shared_ptr<TokenTreeNode> min_node_ptr = nullptr;
+//   RequestGuid min_node_guid = -1;
+//   if (token_tree_node_pool.size() > 0) {
+//     std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
+//         min_node_pair_in_pool = token_tree_node_pool.top();
+//     min_node_ptr = min_node_pair_in_pool.first;
+//     min_node_guid = min_node_pair_in_pool.second;
+//   }
+
+//   // We maintain the size of the token tree node pool to not exceed
+//   //  BatchConfig::MAX_NUM_TOKENS
+//   if (token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS) {
+//     // The pool is full, check if the new node has a higher joint probability
+//     // than the minimum node in the pool.
+
+//     if (log_accumulated_prob < min_node_ptr->log_accumulated_prob) {
+//       // Insertion failed
+//       add_new_node = false;
+//     } else {
+//       // Remove the minimum node from the pool, and set its pruned field to
+//       // true
+//       remove_min_node = true;
+//     }
+//   } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {
+//     assert(false && "The size of the token tree node pool should not exceed "
+//                     "BatchConfig::MAX_NUM_TOKENS");
+//   }
+//   // Do nothing if the pool is not full
+
+//   // The request's token tree size should not exceed
+//   // BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
+//   // The judgement is done here to avoid the case where the tree is full but
+//   a
+//   // node is pruned.
+//   if (speculative_token_tree.tree_size ==
+//       BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
+//     if (remove_min_node && guid == min_node_guid) {
+//       // The minimum node in the pool is pruned, and it's in the same request
+//       // with the new node. Only in this case we can add the new node.
+//       // Because remove_min_node is true means that the new node has a higher
+//       // joint probability than the minimum node in the pool.
+//       add_new_node = true;
+//     } else {
+//       // Otherwise, we cannot add the new node, and we don't need to expel
+//       the
+//       // minimum node from the pool.
+//       add_new_node = false;
+//       remove_min_node = false;
+//     }
+//   } else if (speculative_token_tree.tree_size >
+//              BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
+//     assert(false && "The size of the token tree should not exceed "
+//                     "BatchConfig::MAX_SPEC_TREE_TOKEN_NUM");
+//   }
+
+//   assert(!(remove_min_node && !add_new_node) &&
+//          "The minimum node should be removed only when the new node is
+//          added");
+
+//   if (remove_min_node) {
+//     // Remove the minimum node from the pool, and set its pruned field to
+//     true min_node_ptr->pruned = true; token_tree_node_pool.pop();
+//     all_requests[min_node_guid].speculative_token_trees[0].tree_size--;
+//   }
+
+//   if (add_new_node) {
+//     // Add the new node to the pool and the last layer of the speculation
+//     tree auto node_ptr = std::make_shared<TokenTreeNode>(
+//         token_id, log_accumulated_prob, parent_pos);
+//     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
+//     speculative_token_tree.tree_layers.back().push_back(node_ptr);
+//     speculative_token_tree.tree_size++;
+//     speculative_token_tree.tree_size_including_pruned++;
+//   }
+//   return add_new_node;
+// }
+
+// bool RequestManager::prune_last_layer_of_spec_token_trees() {
+//   // Returns true if the last layers of the token tree of all requests are
+//   empty for (int request_idx = 0; request_idx <
+//   BatchConfig::MAX_NUM_REQUESTS;
+//        request_idx++) {
+//     RequestGuid guid = request_idx;
+//     if (all_requests[guid].status != Request::RUNNING) {
+//       continue;
+//     }
+//     if (prune_last_layer_of_spec_token_tree(guid)) {
+//       return true;
+//     }
+//   }
+//   Request &request = all_requests[guid];
+
+//   if (request.speculative_token_trees[0].tree_layers.size() <=
+//       current_speculation_step) {
+//     // There are no tokens in the last layer
+//     return true;
+//   }
+//   auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
+//   for (auto it = last_layer.begin(); it != last_layer.end();) {
+//     if ((*it)->pruned) {
+//       it = last_layer.erase(it);
+//       request.speculative_token_trees[0].tree_size--;
+//       request.speculative_token_trees[0].tree_size_including_pruned--;
+//     } else {
+//       ++it;
+//     }
+//   }
+
+//   if (last_layer.empty()) {
+//     return true;
+//   }
+//   return false;
+// }
 /* --------- Request Token Tree Related Functions --------- */
 }; // namespace FlexFlow

From f3aa13e9f393dda7d735b8bdaecbff2a79905e9d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 8 May 2024 19:28:52 -0400
Subject: [PATCH 215/667] Fixed a bug.

---
 src/runtime/request_manager.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9e3cb3304..4526374ac 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -517,6 +517,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   int committed_token_offset = prefill_request->llm_cache_size;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
 
+  prefill_request->committed_tokens.clear();
   if (decoding_mode == SPECULATIVE_DECODING) {
     // Add the committed tokens to the token tree
     for (int i = 0; i < prefill_request->num_tokens_in_batch; i++) {
@@ -964,7 +965,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
   for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
        ++request_index) {
     if (!request_available[request_index]) {
-      new_bc.request_available[request_index] = false;
       continue;
     }
     int guid = guid_of_requests[request_index];
@@ -992,14 +992,14 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
          committed_token_index++) {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
+      new_bc.committed_tokens[committed_token_index].request_index =
           request_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
+      new_bc.committed_tokens[committed_token_index].token_index =
           committed_token.from_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+      new_bc.committed_tokens[committed_token_index].token_depth =
           committed_token.to_index;
-      new_bc.num_tokens_to_commit++;
     }
+    new_bc.num_tokens_to_commit = committed_tokens.size() - 1;
 
     // Load the tokens on the token tree that are not yet pruned to
     // BatchConfig.tokensInfo.

From 227595a741814bd0245bcecf7d70a095e08c7da5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 00:19:42 -0400
Subject: [PATCH 216/667] Fixed bitmask related bug.

---
 .../inc_multihead_self_attention_utils.cuh    |  4 +--
 src/ops/spec_inc_multihead_self_attention.cu  | 18 +++++-----
 src/ops/tree_inc_multihead_self_attention.cu  | 36 +++++++++----------
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 546d5e9a9..99e033c20 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -523,5 +523,5 @@ struct threads_per_value_t {
 #define test_bit(bit_mask, idx, pos)                                           \
   (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
-} // namespace FlexFlow
-#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file
+#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index f5de35663..41e8bc38a 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -175,9 +175,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
         // todo add alobi here
         // bool const mask = ti_circ >= totalCacheSize;
         bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                           !test_bit(bitmask->bit_mask,
-                                     ti - bitmask->non_tree_cache_size,
-                                     query_token));
+                           (!test_bit(bitmask->bit_mask,
+                                      query_token,
+                                      ti - bitmask->non_tree_cache_size)));
         // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
         //   (1 << query_token))));
 
@@ -230,9 +230,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     for (int ti = first_step + tidx; ti < totalCacheSize;
          ti += THREADS_PER_BLOCK) {
       bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                         !test_bit(bitmask->bit_mask,
-                                   ti - bitmask->non_tree_cache_size,
-                                   query_token));
+                         (!test_bit(bitmask->bit_mask,
+                                    query_token,
+                                    ti - bitmask->non_tree_cache_size)));
       // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
       //   (1 << query_token))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
@@ -279,9 +279,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
 
         bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                           !test_bit(bitmask->bit_mask,
-                                     ti - bitmask->non_tree_cache_size,
-                                     query_token));
+                           (!test_bit(bitmask->bit_mask,
+                                      query_token,
+                                      ti - bitmask->non_tree_cache_size)));
         // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
         //   (1 << query_token))));
         float logit = mask ? 0.0f : qk_smem[ti - first_step];
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 3d0235a6b..fe3b47af2 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -170,12 +170,12 @@ __global__ void compute_attention_kernel_fused_kernel(
       float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
 
       if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
-        bool const mask = prompt_phase
-                              ? (qi + q_start < ti)
-                              : (ti >= bitmask->non_tree_cache_size &&
-                                 !test_bit(bitmask->bit_mask,
-                                           ti - bitmask->non_tree_cache_size,
-                                           qi));
+        bool const mask =
+            prompt_phase ? (qi + q_start < ti)
+                         : (ti >= bitmask->non_tree_cache_size &&
+                            (!test_bit(bitmask->bit_mask,
+                                       qi,
+                                       ti - bitmask->non_tree_cache_size)));
         // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
         //    (1 << qi))));
 
@@ -231,12 +231,12 @@ __global__ void compute_attention_kernel_fused_kernel(
 
     float exp_sum = 0.f;
     for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      bool const mask =
-          prompt_phase
-              ? (q_start + qi < ti)
-              : (ti >= bitmask->non_tree_cache_size &&
-                 !test_bit(
-                     bitmask->bit_mask, ti - bitmask->non_tree_cache_size, qi));
+      bool const mask = prompt_phase
+                            ? (q_start + qi < ti)
+                            : (ti >= bitmask->non_tree_cache_size &&
+                               (!test_bit(bitmask->bit_mask,
+                                          qi,
+                                          ti - bitmask->non_tree_cache_size)));
       // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
       //    (1 << qi))));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
@@ -283,12 +283,12 @@ __global__ void compute_attention_kernel_fused_kernel(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
 
         if (ti < tlength) {
-          bool const mask = prompt_phase
-                                ? (q_start + qi < ti)
-                                : (ti >= bitmask->non_tree_cache_size &&
-                                   !test_bit(bitmask->bit_mask,
-                                             ti - bitmask->non_tree_cache_size,
-                                             qi));
+          bool const mask =
+              prompt_phase ? (q_start + qi < ti)
+                           : (ti >= bitmask->non_tree_cache_size &&
+                              (!test_bit(bitmask->bit_mask,
+                                         qi,
+                                         ti - bitmask->non_tree_cache_size)));
           // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
           //   (1 << qi))));
           float logit = mask ? 0.0f : qk_smem[ti - first_step];

From 7cfd7d295f163bb8be790ac2bc30ee6beefa6a82 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 00:55:59 -0400
Subject: [PATCH 217/667] Added debug output for bitmask.

---
 include/flexflow/batch_config.h |  2 +-
 src/runtime/batch_config.cc     | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index d9a6e716a..30903271a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -87,6 +87,7 @@ class BatchConfig {
   };
 
   class BitMask {
+  public:
     class Bitset {
     public:
       Bitset() : bits{0} {}
@@ -116,7 +117,6 @@ class BatchConfig {
       uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 64];
     };
 
-  public:
     Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM];
     // the number of generated tokens before the speculation tree (excluding the
     // prompt tokens)
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index d4d9cf0fe..0bf2d1ea8 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -103,6 +103,15 @@ int BatchConfig::max_spec_tree_token_num() {
   return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
 }
 
+// Overloading the << operator for the Bitset class
+std::ostream &operator<<(std::ostream &os,
+                         BatchConfig::BitMask::Bitset const &bitset) {
+  for (size_t i = 0; i < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; i++) {
+    os << (bitset.test_bit(i) ? '1' : '0');
+  }
+  return os;
+}
+
 std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode()
      << ") @@@@@@@@@@@@@@" << std::endl;
@@ -185,6 +194,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
            << std::endl;
         os << "    Current layer size: " << bc.causalMask[i].current_layer_size
            << std::endl;
+        os << "    Bit mask: " << std::endl;
+        for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; j++) {
+          os << "      " << bc.causalMask[i].bit_mask[j] << std::endl;
+        }
       }
     }
   }

From 69bbee8118d0a82d7f6efda51ef50b24858e3082 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 00:57:36 -0400
Subject: [PATCH 218/667] Fixed some bugs.

---
 src/runtime/request_manager.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4526374ac..6e3d1916b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -631,7 +631,11 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
-  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  if (decoding_mode == INCREMENTAL_DECODING) {
+    bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  } else if (decoding_mode == SPECULATIVE_DECODING) {
+    bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
+  }
   bc.prompt_phase = true;
   std::copy(std::begin(request_available),
             std::end(request_available),
@@ -1095,7 +1099,7 @@ bool RequestManager::update_llm_verify_results(
       request_completed = true;
       request_complete_clean_up(request_index);
     } else {
-      update_bitmask_prompt(guid, request.committed_tokens.size());
+      update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
     }
   }
 
@@ -1360,7 +1364,7 @@ void RequestManager::get_verify_results_greedy(
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    int committed_token_index = request.tokens.size();
+    int committed_token_index = request.tokens.size() - 1;
 
     TokenTree &token_tree = request.speculative_token_trees[0];
     // First add the root to the committed tokens
@@ -1418,9 +1422,9 @@ void RequestManager::get_verify_results_greedy(
             last_accepted_token_index = current_token_index;
             last_accepted_token_index_in_layer = current_token_index_in_layer;
             committed_token_index++;
-            current_token_index++;
-            current_token_index_in_layer++;
           }
+          current_token_index++;
+          current_token_index_in_layer++;
         }
       }
       if (!token_accepted_this_layer) {
@@ -1759,7 +1763,9 @@ bool RequestManager::add_tokens_to_spec_token_tree(
         float log_accumulated_prob =
             log_prob + parent_ptr->log_accumulated_prob;
 
-        std::cout << "Probability: " << ssm_inference_result.probs[result_idx]
+        std::cout << "Probability at result index" << result_idx << ": "
+                  << ssm_inference_result.probs[result_idx] << std::endl;
+        std::cout << "Token id: " << ssm_inference_result.token_ids[result_idx]
                   << std::endl;
         std::cout << "Log Probability: " << log_prob << std::endl;
         assert(log_prob != -std::numeric_limits<float>::infinity() &&

From 7c4c06e097e205f1c1d2dc637ed9d3749ac8433c Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 04:38:13 -0400
Subject: [PATCH 219/667] feat: add gumbel_topk, but totally copy from arg_topk

---
 include/flexflow/ops/gumbel_topk.h        | 110 +++++
 include/flexflow/ops/gumbel_topk_params.h |  29 ++
 src/ops/gumbel_topk.cc                    | 510 ++++++++++++++++++++++
 3 files changed, 649 insertions(+)
 create mode 100644 include/flexflow/ops/gumbel_topk.h
 create mode 100644 include/flexflow/ops/gumbel_topk_params.h
 create mode 100644 src/ops/gumbel_topk.cc

diff --git a/include/flexflow/ops/gumbel_topk.h b/include/flexflow/ops/gumbel_topk.h
new file mode 100644
index 000000000..3898383da
--- /dev/null
+++ b/include/flexflow/ops/gumbel_topk.h
@@ -0,0 +1,110 @@
+#ifndef _FLEXFLOW_GUMBEL_TOPK_H_
+#define _FLEXFLOW_GUMBEL_TOPK_H_
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/node.h"
+#include "flexflow/ops/gumbel_topk_params.h"
+
+namespace FlexFlow {
+
+class GumbelTopKMeta : public OpMeta {
+public:
+  GumbelTopKMeta(FFHandler handle, Op const *op);
+  bool sorted;
+  int k;
+  bool speculative_decoding;
+};
+
+class GumbelTopK : public Op {
+public:
+  using Params = GumbelTopKParams;
+  using Input = ParallelTensor;
+  GumbelTopK(FFModel &model,
+          LayerID const &layer_guid,
+          ParallelTensor const input,
+          int k,
+          bool sorted,
+          bool speculative_decoding,
+          char const *name);
+  GumbelTopK(FFModel &model,
+          LayerID const &layer_guid,
+          GumbelTopK const &other,
+          ParallelTensor const input);
+  GumbelTopK(FFModel &model,
+          Params const &params,
+          Input const input,
+          char const *name = nullptr);
+  void init(FFModel const &) override;
+  void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+                      MachineView const *mv = nullptr) override;
+  void forward(FFModel const &) override;
+  void backward(FFModel const &) override;
+  Legion::FutureMap inference(FFModel const &,
+                              BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+                              MachineView const *mv = nullptr) override;
+  void print_layer(FFModel const &model) override {
+    assert(0);
+  }
+  static Op *
+      create_operator_from_layer(FFModel &model,
+                                 Layer const *layer,
+                                 std::vector<ParallelTensor> const &inputs);
+
+  static OpMeta *init_task(Legion::Task const *task,
+                           std::vector<Legion::PhysicalRegion> const &regions,
+                           Legion::Context ctx,
+                           Legion::Runtime *runtime);
+  static InferenceResult
+      inference_task(Legion::Task const *task,
+                     std::vector<Legion::PhysicalRegion> const &regions,
+                     Legion::Context ctx,
+                     Legion::Runtime *runtime);
+  static InferenceResult inference_speculative_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  void serialize(Legion::Serializer &s) const override;
+  static PCG::Node deserialize(FFModel &ff,
+                               Legion::Deserializer &d,
+                               ParallelTensor inputs[],
+                               int num_inputs);
+  Op *materialize(FFModel &ff,
+                  ParallelTensor inputs[],
+                  int num_inputs) const override;
+  bool measure_operator_cost(Simulator *sim,
+                             MachineView const &pc,
+                             CostMetrics &cost_metrics) const override;
+  template <typename DT>
+  static void forward_kernel(GumbelTopKMeta const *m,
+                             DT const *input_ptr,
+                             float *output_ptr,
+                             int *indices_ptr,
+                             size_t batch_size,
+                             int length,
+                             int k,
+                             bool sorted,
+                             BatchConfig const *bc,
+                             ffStream_t stream);
+  static void forward_kernel_wrapper(GumbelTopKMeta const *m,
+                                     GenericTensorAccessorR const &input,
+                                     GenericTensorAccessorW const &prob,
+                                     GenericTensorAccessorW const &indices,
+                                     int batch_size,
+                                     BatchConfig const *bc);
+  Params get_params() const;
+
+public:
+  int k;
+  bool sorted;
+  bool speculative_decoding;
+};
+
+}; // namespace FlexFlow
+
+#endif
diff --git a/include/flexflow/ops/gumbel_topk_params.h b/include/flexflow/ops/gumbel_topk_params.h
new file mode 100644
index 000000000..480e7b9cc
--- /dev/null
+++ b/include/flexflow/ops/gumbel_topk_params.h
@@ -0,0 +1,29 @@
+#ifndef _FLEXFLOW_GUMBEL_TOPK_PARAMS_H
+#define _FLEXFLOW_GUMBEL_TOPK_PARAMS_H
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "flexflow/parallel_tensor.h"
+
+namespace FlexFlow {
+
+struct GumbelTopKParams {
+  LayerID layer_guid;
+  int k;
+  bool sorted;
+  bool speculative_decoding;
+  char name[MAX_OPNAME];
+  bool is_valid(ParallelTensorShape const &) const;
+};
+bool operator==(GumbelTopKParams const &, GumbelTopKParams const &);
+
+} // namespace FlexFlow
+
+namespace std {
+template <>
+struct hash<FlexFlow::GumbelTopKParams> {
+  size_t operator()(FlexFlow::GumbelTopKParams const &) const;
+};
+} // namespace std
+
+#endif // _FLEXFLOW_GUMBEL_TOPK_PARAMS_H
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
new file mode 100644
index 000000000..5f8656061
--- /dev/null
+++ b/src/ops/gumbel_topk.cc
@@ -0,0 +1,510 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/gumbel_topk.h"
+#include "flexflow/model.h"
+#include "flexflow/utils/hash_utils.h"
+#include "legion/legion_utilities.h"
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "flexflow/utils/cuda_helper.h"
+#else
+#include "flexflow/utils/hip_helper.h"
+#endif
+
+namespace FlexFlow {
+// declare Legion names
+using Legion::ArgumentMap;
+using Legion::Context;
+using Legion::coord_t;
+using Legion::Domain;
+using Legion::Future;
+using Legion::FutureMap;
+using Legion::IndexLauncher;
+using Legion::InlineLauncher;
+using Legion::Machine;
+using Legion::Memory;
+using Legion::PhysicalRegion;
+using Legion::Predicate;
+using Legion::Rect;
+using Legion::RegionRequirement;
+using Legion::Runtime;
+using Legion::Task;
+using Legion::TaskArgument;
+using Legion::TaskLauncher;
+using PCG::Node;
+
+// For an input tensor, computes the top k entries in each row
+// (resp. vector along the last dimension). Thus,
+// values.shape = indices.shape = input.shape[:-1] + [k]
+Tensor FFModel::gumbel_top_k(Tensor const input,
+                          int k,
+                          bool sorted,
+                          bool speculative_decoding,
+                          char const *name) {
+  Layer *li = new Layer(this,
+                        OP_ARG_TOPK,
+                        input->data_type,
+                        name,
+                        1 /*inputs*/,
+                        0 /*weights*/,
+                        speculative_decoding ? 2 : 1 /*outputs*/,
+                        input);
+  {
+    int numdims = input->num_dims;
+    int dims[MAX_TENSOR_DIM];
+    for (int i = 0; i < numdims; i++) {
+      dims[i] = input->dims[i];
+    }
+    dims[0] = k;
+    // li->outputs[0] = create_tensor_legion_ordering(
+    //     numdims, dims, input->data_type, li, 0, true /*create_grad*/);
+    li->outputs[0] = create_tensor_legion_ordering(
+        numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
+    if (speculative_decoding) {
+      li->outputs[1] = create_tensor_legion_ordering(
+          numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
+    }
+  }
+  li->add_int_property("k", k);
+  li->add_int_property("sorted", sorted);
+  li->add_int_property("speculative_decoding", speculative_decoding);
+  layers.push_back(li);
+  // outputs[0] = li->outputs[0];
+  // outputs[1] = li->outputs[1];
+  return li->outputs[0];
+}
+
+Op *GumbelTopK::create_operator_from_layer(
+    FFModel &model,
+    Layer const *layer,
+    std::vector<ParallelTensor> const &inputs) {
+  long long value;
+  layer->get_int_property("k", value);
+  int k = value;
+  layer->get_int_property("sorted", value);
+  bool sorted = (bool)value;
+  layer->get_int_property("speculative_decoding", value);
+  bool speculative_decoding = (bool)value;
+
+  return new GumbelTopK(model,
+                     layer->layer_guid,
+                     inputs[0],
+                     k,
+                     sorted,
+                     speculative_decoding,
+                     layer->name);
+}
+
+GumbelTopKParams GumbelTopK::get_params() const {
+  GumbelTopKParams params;
+  params.k = this->k;
+  params.sorted = this->sorted;
+  params.speculative_decoding = this->speculative_decoding;
+  if (this->name != nullptr) {
+    strcpy(params.name, this->name);
+  }
+  return params;
+}
+
+bool GumbelTopKParams::is_valid(ParallelTensorShape const &) const {
+  // topk is always valid
+  return true;
+}
+
+bool operator==(GumbelTopKParams const &lhs, GumbelTopKParams const &rhs) {
+  return lhs.k == rhs.k && lhs.sorted == rhs.sorted &&
+         lhs.speculative_decoding == rhs.speculative_decoding;
+}
+
+GumbelTopK::GumbelTopK(FFModel &model,
+                 LayerID const &_layer_guid,
+                 ParallelTensor const _input,
+                 int _k,
+                 bool _sorted,
+                 bool _speculative_decoding,
+                 char const *name)
+    : Op(model,
+         OP_ARG_TOPK,
+         _input->data_type,
+         name,
+         1 /*inputs*/,
+         0 /*weights*/,
+         _speculative_decoding ? 2 : 1 /*outputs*/,
+         _input),
+      k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) {
+  // overwrite layer_guid
+  layer_guid = _layer_guid;
+  int numdim = inputs[0]->num_dims;
+  ParallelDim dims[MAX_TENSOR_DIM];
+  for (int i = 0; i < numdim; i++) {
+    dims[i] = inputs[0]->dims[i];
+  }
+
+  dims[0].size = k;
+  assert(inputs[0]->dims[0].degree == 1);
+  assert(inputs[0]->dims[0].parallel_idx == -1);
+
+  outputs[0] = model.create_parallel_tensor_legion_ordering(
+      numdim, dims, DT_INT32, this, 0 /*owner_idx*/);
+  if (_speculative_decoding) {
+    outputs[1] = model.create_parallel_tensor_legion_ordering(
+        numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
+  }
+}
+
+GumbelTopK::GumbelTopK(FFModel &model,
+                 LayerID const &layer_guid,
+                 GumbelTopK const &other,
+                 ParallelTensor const input)
+    : GumbelTopK(model,
+              layer_guid,
+              input,
+              other.k,
+              other.sorted,
+              other.speculative_decoding,
+              other.name) {}
+
+GumbelTopK::GumbelTopK(FFModel &model,
+                 GumbelTopKParams const &params,
+                 ParallelTensor const input,
+                 char const *name)
+    : GumbelTopK(model,
+              params.layer_guid,
+              input,
+              params.k,
+              params.sorted,
+              params.speculative_decoding,
+              params.name) {}
+
+void GumbelTopK::init_inference(FFModel const &ff,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
+  assert(check_output_input_weight_same_parallel_is());
+  parallel_is = batch_outputs[0]->parallel_is;
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
+  size_t machine_view_hash = view->hash();
+  set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]);
+  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+                         parallel_is,
+                         TaskArgument(this, sizeof(GumbelTopK)),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         machine_view_hash);
+  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    READ_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_inputs[0]->region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_outputs[0]->region));
+  launcher.add_field(1, FID_DATA);
+  //   launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
+  //                                                     0 /*projection id*/,
+  //                                                     WRITE_ONLY,
+  //                                                     EXCLUSIVE,
+  //                                                     batch_outputs[1]->region));
+  //   launcher.add_field(2, FID_DATA);
+  FutureMap fm = runtime->execute_index_space(ctx, launcher);
+  fm.wait_all_results();
+  set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
+}
+
+void GumbelTopK::init(FFModel const &ff) {
+  assert(check_output_input_weight_same_parallel_is());
+  parallel_is = outputs[0]->parallel_is;
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  set_argumentmap_for_init(ff, argmap);
+  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+                         parallel_is,
+                         TaskArgument(this, sizeof(GumbelTopK)),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         outputs[0]->machine_view.hash());
+  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    READ_ONLY,
+                                                    EXCLUSIVE,
+                                                    inputs[0]->region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    outputs[0]->region));
+  launcher.add_field(1, FID_DATA);
+  //   launcher.add_region_requirement(RegionRequirement(outputs[1]->part,
+  //                                                     0 /*projection id*/,
+  //                                                     WRITE_ONLY,
+  //                                                     EXCLUSIVE,
+  //                                                     outputs[1]->region));
+  //   launcher.add_field(2, FID_DATA);
+  FutureMap fm = runtime->execute_index_space(ctx, launcher);
+  fm.wait_all_results();
+  set_opmeta_from_futuremap(ff, fm);
+}
+
+OpMeta *GumbelTopK::init_task(Task const *task,
+                           std::vector<PhysicalRegion> const &regions,
+                           Context ctx,
+                           Runtime *runtime) {
+  GumbelTopK *topk = (GumbelTopK *)task->args;
+  FFHandler handle = *((FFHandler *)task->local_args);
+  GumbelTopKMeta *m = new GumbelTopKMeta(handle, topk);
+  m->profiling = topk->profiling;
+  m->inference_debugging = topk->inference_debugging;
+  m->sorted = topk->sorted;
+  m->k = topk->k;
+  std::strcpy(m->op_name, topk->name);
+  m->layer_guid = topk->layer_guid;
+  m->speculative_decoding = topk->speculative_decoding;
+  return m;
+}
+
+void GumbelTopK::forward(FFModel const &ff) {
+  // GumbelTopK does not support forward
+  assert(false);
+}
+
+FutureMap GumbelTopK::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  parallel_is = batch_outputs[0]->parallel_is;
+  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
+  set_argumentmap_for_inference(ff, argmap, batch_outputs[0]);
+  size_t machine_view_hash = view->hash();
+  /* std::cout << "GumbelTopK op machine_view: " << *(MachineView const *)mv
+            << std::endl; */
+  if (speculative_decoding) {
+    IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+                           parallel_is,
+                           TaskArgument(nullptr, 0),
+                           argmap,
+                           Predicate::TRUE_PRED,
+                           false /*must*/,
+                           0 /*mapper_id*/,
+                           machine_view_hash);
+    launcher.add_future(bc);
+    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                      0 /*projection id*/,
+                                                      READ_ONLY,
+                                                      EXCLUSIVE,
+                                                      batch_inputs[0]->region));
+    launcher.add_field(0, FID_DATA);
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[0]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[0]->region));
+    launcher.add_field(1, FID_DATA);
+
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[1]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[1]->region));
+    launcher.add_field(2, FID_DATA);
+    return runtime->execute_index_space(ctx, launcher);
+
+  } else {
+    IndexLauncher launcher(ARG_TOPK_INF_TASK_ID,
+                           parallel_is,
+                           TaskArgument(nullptr, 0),
+                           argmap,
+                           Predicate::TRUE_PRED,
+                           false /*must*/,
+                           0 /*mapper_id*/,
+                           machine_view_hash);
+    launcher.add_future(bc);
+    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                      0 /*projection id*/,
+                                                      READ_ONLY,
+                                                      EXCLUSIVE,
+                                                      batch_inputs[0]->region));
+    launcher.add_field(0, FID_DATA);
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[0]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[0]->region));
+    launcher.add_field(1, FID_DATA);
+    return runtime->execute_index_space(ctx, launcher);
+  }
+}
+
+InferenceResult
+    GumbelTopK::inference_task(Task const *task,
+                            std::vector<PhysicalRegion> const &regions,
+                            Context ctx,
+                            Runtime *runtime) {
+  assert(regions.size() == 2);
+  assert(task->regions.size() == 2);
+  // const GumbelTopK* topk = (const GumbelTopK*) task->args;
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  if (bc->num_tokens == 0) {
+    // Directly return for empty batch config
+    InferenceResult ir;
+    return ir;
+  }
+  GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args);
+
+  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW probs;
+
+  int batch_size = bc->num_active_tokens();
+  GumbelTopK::forward_kernel_wrapper(
+      m, input, probs, indices, batch_size, nullptr);
+
+  if (m->inference_debugging) {
+    assert(task->index_point.get_dim() == 1);
+    int shard_id = task->index_point.point_data[0];
+    GumbelTopK::save_inference_tensors_to_file(
+        m, shard_id, bc, {input}, {}, {indices});
+  }
+
+  InferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size);
+  return ir;
+}
+
+InferenceResult GumbelTopK::inference_speculative_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  assert(regions.size() == 3);
+  assert(task->regions.size() == 3);
+  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  if (bc.num_active_tokens() == 0) {
+    // Directly return for empty batch config
+    InferenceResult ir;
+    return ir;
+  }
+  GumbelTopKMeta *m = *((GumbelTopKMeta **)task->local_args);
+
+  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO(
+      DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
+
+  int batch_size = bc.num_active_tokens();
+  GumbelTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
+
+  InferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
+  download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
+  return ir;
+}
+
+void GumbelTopK::backward(FFModel const &ff) {
+  // GumbelTopK does not support backward
+  assert(false);
+}
+
+void GumbelTopK::serialize(Legion::Serializer &sez) const {
+  sez.serialize(this->layer_guid.id);
+  sez.serialize(this->layer_guid.transformer_layer_id);
+  sez.serialize(this->layer_guid.model_id);
+  sez.serialize(this->k);
+  sez.serialize(this->sorted);
+  sez.serialize(this->speculative_decoding);
+  sez.serialize(strlen(this->name));
+  sez.serialize(this->name, strlen(this->name));
+}
+
+Node GumbelTopK::deserialize(FFModel &ff,
+                          Legion::Deserializer &dez,
+                          ParallelTensor inputs[],
+                          int num_inputs) {
+  assert(num_inputs == 1);
+  size_t id, transformer_layer_id, deserialized_model_id;
+  dez.deserialize(id);
+  dez.deserialize(transformer_layer_id);
+  dez.deserialize(deserialized_model_id);
+  LayerID layer_guid(id, transformer_layer_id, deserialized_model_id);
+  int k;
+  bool sorted;
+  bool speculative_decoding;
+  dez.deserialize(k);
+  dez.deserialize(sorted);
+  dez.deserialize(speculative_decoding);
+  size_t name_len;
+  char name[MAX_OPNAME] = {0};
+  dez.deserialize(name_len);
+  dez.deserialize(name, name_len);
+  GumbelTopKParams params;
+  params.layer_guid = layer_guid;
+  params.k = k;
+  params.sorted = sorted;
+  params.speculative_decoding = speculative_decoding;
+  strcpy(params.name, name);
+  return ff.get_or_create_node<GumbelTopK>(inputs[0], params);
+}
+
+Op *GumbelTopK::materialize(FFModel &ff,
+                         ParallelTensor inputs[],
+                         int num_inputs) const {
+  GumbelTopKParams params = get_params();
+  return new GumbelTopK(ff, params, inputs[0], this->name);
+}
+
+bool GumbelTopK::measure_operator_cost(Simulator *sim,
+                                    MachineView const &mv,
+                                    CostMetrics &cost_metrics) const {
+  return false;
+}
+
+}; // namespace FlexFlow
+
+namespace std {
+size_t hash<FlexFlow::GumbelTopKParams>::operator()(
+    FlexFlow::GumbelTopKParams const &params) const {
+  size_t key = 0;
+  hash_combine(key, params.layer_guid.id);
+  hash_combine(key, params.k);
+  hash_combine(key, params.sorted);
+  hash_combine(key, params.speculative_decoding);
+  return key;
+}
+}; // namespace std

From b06d717f2417958afe466acd672b9a75d6c690bc Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 04:41:33 -0400
Subject: [PATCH 220/667] chore: add GUMBEL_TOPK TaskIDs

---
 include/flexflow/model.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index d455e4f5b..b6bc51adc 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -145,6 +145,9 @@ enum TaskIDs {
   TOPK_INIT_TASK_ID,
   TOPK_FWD_TASK_ID,
   TOPK_BWD_TASK_ID,
+  GUMBEL_TOPK_INIT_TASK_ID,
+  GUMBEL_TOPK_INF_TASK_ID,
+  GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID,
   ARG_TOPK_INIT_TASK_ID,
   ARG_TOPK_INF_TASK_ID,
   ARG_TOPK_INF_SPECULATIVE_TASK_ID,

From a5d68dad94957b913bc256a072eb32c79252f389 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 04:42:08 -0400
Subject: [PATCH 221/667] chore: minor rename

---
 src/ops/gumbel_topk.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 5f8656061..861e64ded 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -54,7 +54,7 @@ Tensor FFModel::gumbel_top_k(Tensor const input,
                           bool speculative_decoding,
                           char const *name) {
   Layer *li = new Layer(this,
-                        OP_ARG_TOPK,
+                        OP_GUMBEL_TOPK,
                         input->data_type,
                         name,
                         1 /*inputs*/,
@@ -136,7 +136,7 @@ GumbelTopK::GumbelTopK(FFModel &model,
                  bool _speculative_decoding,
                  char const *name)
     : Op(model,
-         OP_ARG_TOPK,
+         OP_GUMBEL_TOPK,
          _input->data_type,
          name,
          1 /*inputs*/,
@@ -200,7 +200,7 @@ void GumbelTopK::init_inference(FFModel const &ff,
   MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
   size_t machine_view_hash = view->hash();
   set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]);
-  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+  IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID,
                          parallel_is,
                          TaskArgument(this, sizeof(GumbelTopK)),
                          argmap,
@@ -238,7 +238,7 @@ void GumbelTopK::init(FFModel const &ff) {
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
   set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+  IndexLauncher launcher(GUMBEL_TOPK_INIT_TASK_ID,
                          parallel_is,
                          TaskArgument(this, sizeof(GumbelTopK)),
                          argmap,
@@ -307,7 +307,7 @@ FutureMap GumbelTopK::inference(
   /* std::cout << "GumbelTopK op machine_view: " << *(MachineView const *)mv
             << std::endl; */
   if (speculative_decoding) {
-    IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+    IndexLauncher launcher(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID,
                            parallel_is,
                            TaskArgument(nullptr, 0),
                            argmap,
@@ -340,7 +340,7 @@ FutureMap GumbelTopK::inference(
     return runtime->execute_index_space(ctx, launcher);
 
   } else {
-    IndexLauncher launcher(ARG_TOPK_INF_TASK_ID,
+    IndexLauncher launcher(GUMBEL_TOPK_INF_TASK_ID,
                            parallel_is,
                            TaskArgument(nullptr, 0),
                            argmap,

From 08cb70543f2284ec03383bf07bd90205959a89d4 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 05:27:33 -0400
Subject: [PATCH 222/667] chore: FFModel::gumbel_top_k

---
 include/flexflow/model.h |  6 ++++++
 src/ops/gumbel_topk.cc   | 19 ++++++++++---------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index b6bc51adc..7db864def 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -679,6 +679,12 @@ class FFModel {
              int k,
              bool sorted,
              char const *name = NULL);
+  Tensor gumbel_top_k(Tensor const input,
+                   // Tensor *outputs,
+                   int k,
+                   bool sorted,
+                   bool speculative_decoding,
+                   char const *name = NULL);
   Tensor arg_top_k(Tensor const input,
                    // Tensor *outputs,
                    int k,
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 861e64ded..53020dd11 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -46,8 +46,8 @@ using Legion::TaskLauncher;
 using PCG::Node;
 
 // For an input tensor, computes the top k entries in each row
-// (resp. vector along the last dimension). Thus,
-// values.shape = indices.shape = input.shape[:-1] + [k]
+// (resp. vector along the last dimension) using Gumbel trick (https://arxiv.org/abs/1903.06059). 
+// Thus, values.shape = indices.shape = input.shape[:-1] + [k]
 Tensor FFModel::gumbel_top_k(Tensor const input,
                           int k,
                           bool sorted,
@@ -57,9 +57,9 @@ Tensor FFModel::gumbel_top_k(Tensor const input,
                         OP_GUMBEL_TOPK,
                         input->data_type,
                         name,
-                        1 /*inputs*/,
-                        0 /*weights*/,
-                        speculative_decoding ? 2 : 1 /*outputs*/,
+                        1,
+                        0,
+                        speculative_decoding ? 3 : 1 /*outputs*/,
                         input);
   {
     int numdims = input->num_dims;
@@ -68,21 +68,22 @@ Tensor FFModel::gumbel_top_k(Tensor const input,
       dims[i] = input->dims[i];
     }
     dims[0] = k;
-    // li->outputs[0] = create_tensor_legion_ordering(
-    //     numdims, dims, input->data_type, li, 0, true /*create_grad*/);
+    // token_ids
     li->outputs[0] = create_tensor_legion_ordering(
         numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
     if (speculative_decoding) {
+      // log_probs
       li->outputs[1] = create_tensor_legion_ordering(
           numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
+      // perturbed_log_probs
+      li->outputs[2] = create_tensor_legion_ordering(
+          numdims, dims, DT_FLOAT, li, 2, false /*create_grad*/);
     }
   }
   li->add_int_property("k", k);
   li->add_int_property("sorted", sorted);
   li->add_int_property("speculative_decoding", speculative_decoding);
   layers.push_back(li);
-  // outputs[0] = li->outputs[0];
-  // outputs[1] = li->outputs[1];
   return li->outputs[0];
 }
 

From 20bf729719f1edaeccb8163c7a8f99acb3401d87 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 05:43:17 -0400
Subject: [PATCH 223/667] chore: fix model.cc

---
 src/runtime/model.cc | 52 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 6b508709c..b68e12a39 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -44,6 +44,7 @@
 #include "flexflow/ops/fused.h"
 #include "flexflow/ops/gather.h"
 #include "flexflow/ops/groupby.h"
+#include "flexflow/ops/gumbel_topk.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/layer_norm.h"
 #include "flexflow/ops/linear.h"
@@ -3329,7 +3330,7 @@ void FFModel::create_operators_from_layers() {
     if (config.computationMode == COMP_MODE_INFERENCE &&
         config.tensor_parallelism_degree > 1 &&
         (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX ||
-         l->op_type == OP_ARGMAX)) {
+         l->op_type == OP_ARGMAX || l->op_type == OP_GUMBEL_TOPK)) {
       std::vector<ParallelTensor> partitioned_inputs;
       assert(inputs.size() == 1);
       Combine *comb = new Combine(*this,
@@ -5949,6 +5950,55 @@ void register_flexflow_internal_tasks(Runtime *runtime,
       runtime->register_task_variant<TopK::backward_task>(registrar);
     }
   }
+  // GumbelTopk task
+  {
+    TaskVariantRegistrar registrar(GUMBEL_TOPK_INIT_TASK_ID, "GumbelTopK Init");
+    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+    registrar.set_leaf();
+    if (pre_register) {
+      Runtime::preregister_task_variant<OpMeta *, GumbelTopK::init_task>(
+          registrar, "GumbelTopK Init Task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<OpMeta *, GumbelTopK::init_task>(registrar);
+    }
+  }
+  {
+    TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_TASK_ID, "GumbelTopK Inference");
+    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+    registrar.set_leaf();
+    if (pre_register) {
+      Runtime::preregister_task_variant<InferenceResult,
+                                        GumbelTopK::inference_task>(
+          registrar, "GumbelTopK Inference Task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<InferenceResult, GumbelTopK::inference_task>(
+          registrar);
+    }
+  }
+  {
+    TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_SPECULATIVE_TASK_ID,
+                                   "GumbelTopK Speculative Inference");
+    registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
+    registrar.set_leaf();
+    if (pre_register) {
+      Runtime::preregister_task_variant<InferenceResult,
+                                        GumbelTopK::inference_speculative_task>(
+          registrar, "GumbelTopK Speculative Inference Task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<InferenceResult,
+                                     GumbelTopK::inference_speculative_task>(
+          registrar);
+    }
+  }
   // ArgTopk task
   {
     TaskVariantRegistrar registrar(ARG_TOPK_INIT_TASK_ID, "ArgTopK Init");

From 2d6fd61b155e7d3e22c92a517533b1a11c27c2ed Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 07:39:58 -0400
Subject: [PATCH 224/667] feat: add gumbel_topk cuda, but copy from argtopk

---
 src/ops/gumbel_topk.cu | 525 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 525 insertions(+)
 create mode 100644 src/ops/gumbel_topk.cu

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
new file mode 100644
index 000000000..491b255be
--- /dev/null
+++ b/src/ops/gumbel_topk.cu
@@ -0,0 +1,525 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/arg_topk.h"
+#include "flexflow/utils/cuda_helper.h"
+
+namespace FlexFlow {
+// declare Legion names
+using Legion::coord_t;
+
+enum class HeapType { kMinHeap, kMaxHeap };
+enum class PreferIndices { kLower, kHigher };
+
+template <typename T>
+struct Entry {
+  int index;
+  T value;
+};
+
+template <typename T>
+struct LinearData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index];
+  }
+
+  __device__ int get_index(int i) const {
+    return data[i].index;
+  }
+  __device__ T get_value(int i) const {
+    return data[i].value;
+  }
+
+  Entry *const data;
+};
+
+template <typename T>
+struct IndirectLinearData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index];
+  }
+
+  __device__ int get_index(int i) const {
+    return backing_data[data[i].index].index;
+  }
+  __device__ T get_value(int i) const {
+    return data[i].value;
+  }
+
+  Entry *const data;
+  Entry *const backing_data;
+};
+
+template <typename T>
+struct StridedData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index * blockDim.x + threadIdx.x];
+  }
+
+  __device__ int get_index(int i) const {
+    return (*this)[i].index;
+  }
+  __device__ T get_value(int i) const {
+    return (*this)[i].value;
+  }
+
+  Entry *const data;
+};
+
+// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
+template <HeapType heapType,
+          PreferIndices preferIndices,
+          template <typename>
+          class Data,
+          typename T>
+struct IndexedHeap {
+  typedef typename Data<T>::Entry Entry;
+  Data<T> const data;
+  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
+
+  __device__ bool is_above(int left, int right) {
+    T left_value = data.get_value(left);
+    T right_value = data.get_value(right);
+    if (left_value == right_value) {
+      if (preferIndices == PreferIndices::kLower) {
+        return data.get_index(left) < data.get_index(right);
+      } else {
+        return data.get_index(left) > data.get_index(right);
+      }
+    }
+    if (heapType == HeapType::kMinHeap) {
+      return left_value < right_value;
+    } else {
+      return left_value > right_value;
+    }
+  }
+
+  __device__ void assign(int i, Entry const &entry) {
+    data[i] = entry;
+  }
+
+  __device__ void push_up(int i) {
+    int child = i;
+    int parent;
+    for (; child > 0; child = parent) {
+      parent = (child - 1) / 2;
+      if (!is_above(child, parent)) {
+        // Heap property satisfied.
+        break;
+      }
+      swap(child, parent);
+    }
+  }
+
+  __device__ void swap(int a, int b) {
+    auto tmp = data[b];
+    data[b] = data[a];
+    data[a] = tmp;
+  }
+
+  __device__ void push_root_down(int k) {
+    push_down(0, k);
+  }
+
+  // MAX-HEAPIFY in Cormen
+  __device__ void push_down(int node, int k) {
+    while (true) {
+      int const left = 2 * node + 1;
+      int const right = left + 1;
+      int smallest = node;
+      if (left < k && is_above(left, smallest)) {
+        smallest = left;
+      }
+      if (right < k && is_above(right, smallest)) {
+        smallest = right;
+      }
+      if (smallest == node) {
+        break;
+      }
+      swap(smallest, node);
+      node = smallest;
+    }
+  }
+
+  // BUILD-MAX-HEAPIFY in Cormen
+  __device__ void build(int k) {
+    for (int node = (k - 1) / 2; node >= 0; node--) {
+      push_down(node, k);
+    }
+  }
+
+  // HEAP-EXTRACT-MAX in Cormen
+  __device__ void remove_root(int k) {
+    data[0] = data[k - 1];
+    push_root_down(k - 1);
+  }
+
+  // in-place HEAPSORT in Cormen
+  // This method destroys the heap property.
+  __device__ void sort(int k) {
+    for (int slot = k - 1; slot > 0; slot--) {
+      // This is like remove_root but we insert the element at the end.
+      swap(slot, 0);
+      // Heap is now an element smaller.
+      push_root_down(/*k=*/slot);
+    }
+  }
+
+  __device__ void replace_root(Entry const &entry, int k) {
+    data[0] = entry;
+    push_root_down(k);
+  }
+
+  __device__ Entry const &root() {
+    return data[0];
+  }
+};
+
+template <HeapType heapType,
+          PreferIndices preferIndices,
+          template <typename>
+          class Data,
+          typename T>
+__device__ IndexedHeap<heapType, preferIndices, Data, T>
+    make_indexed_heap(typename Data<T>::Entry *data) {
+  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
+}
+
+// heapArgTopK walks over [input, input+length) with `step_size` stride starting
+// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
+// using `Accessor` to access elements in `heap_entries`. If sorted=true, the
+// elements will be sorted at the end.
+template <typename T, template <typename> class Data = LinearData>
+__device__ void heapArgTopK(T const *__restrict__ input,
+                            int length,
+                            int k,
+                            Entry<T> *__restrict__ heap_entries,
+                            bool sorted = false,
+                            int start_index = 0,
+                            int step_size = 1) {
+  assert(k <= length);
+
+  auto heap =
+      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
+          heap_entries);
+
+  int heap_end_index = start_index + k * step_size;
+  if (heap_end_index > length) {
+    heap_end_index = length;
+  }
+  // Initialize the min-heap.
+  for (int index = start_index, slot = 0; index < heap_end_index;
+       index += step_size, slot++) {
+    heap.assign(slot, {index, input[index]});
+  }
+
+  heap.build(k);
+
+  // Now iterate over the remaining items.
+  // If an item is smaller than the min element, it is not amongst the top k.
+  // Otherwise, replace the min element with it and push upwards.
+  for (int index = heap_end_index; index < length; index += step_size) {
+    // We prefer elements with lower indices. This is given here.
+    // Later elements automatically have higher indices, so can be discarded.
+    if (input[index] > heap.root().value) {
+      // This element should replace the min.
+      heap.replace_root({index, input[index]}, k);
+    }
+  }
+
+  // Sort if wanted.
+  if (sorted) {
+    heap.sort(k);
+  }
+}
+
+// mergeShards performs a top-k merge on `num_shards` many sorted streams that
+// are sorted and stored in `entries` in a strided way:
+// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
+// The overall top k elements are written to `top_k_values` and their indices
+// to top_k_indices.
+// `top_k_heap` is used as temporary storage for the merge heap.
+template <typename T>
+__device__ void mergeShards(int num_shards,
+                            int k,
+                            Entry<T> *__restrict__ entries,
+                            Entry<T> *__restrict__ top_k_heap,
+                            float *top_k_values,
+                            int *top_k_indices,
+                            bool speculative_decoding) {
+  // If k < num_shards, we can use a min-heap with k elements to get the top k
+  // of the sorted blocks.
+  // If k > num_shards, we can initialize a min-heap with the top element from
+  // each sorted block.
+  int const heap_size = k < num_shards ? k : num_shards;
+
+  // Min-heap part.
+  {
+    auto min_heap = IndexedHeap<HeapType::kMinHeap,
+                                PreferIndices::kHigher,
+                                IndirectLinearData,
+                                T>{IndirectLinearData<T>{top_k_heap, entries}};
+    // Initialize the heap as a min-heap.
+    for (int slot = 0; slot < heap_size; slot++) {
+      min_heap.assign(slot, {slot, entries[slot].value});
+    }
+    min_heap.build(heap_size);
+
+    // Now perform top k with the remaining shards (if num_shards > heap_size).
+    for (int shard = heap_size; shard < num_shards; shard++) {
+      auto const entry = entries[shard];
+      auto const root = min_heap.root();
+      if (entry.value < root.value) {
+        continue;
+      }
+      if (entry.value == root.value &&
+          entry.index > entries[root.index].index) {
+        continue;
+      }
+      // This element should replace the min.
+      min_heap.replace_root({shard, entry.value}, heap_size);
+    }
+  }
+
+  // Max-part.
+  {
+    // Turn the min-heap into a max-heap in-place.
+    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
+                                PreferIndices::kLower,
+                                IndirectLinearData,
+                                T>{IndirectLinearData<T>{top_k_heap, entries}};
+    // Heapify into a max heap.
+    max_heap.build(heap_size);
+
+    // Now extract the minimum k-1 times.
+    // k is treated specially.
+    int const last_k = k - 1;
+    for (int rank = 0; rank < last_k; rank++) {
+      Entry<T> const &max_element = max_heap.root();
+      if (speculative_decoding) {
+        assert(top_k_values != nullptr);
+        top_k_values[rank] = static_cast<float>(max_element.value);
+      }
+
+      int shard_index = max_element.index;
+      top_k_indices[rank] = entries[shard_index].index;
+      int next_shard_index = shard_index + num_shards;
+      // For rank < k-1, each top k heap still contains at least 1 element,
+      // so we can draw a replacement.
+      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
+                            heap_size);
+    }
+
+    // rank == last_k.
+    Entry<T> const &max_element = max_heap.root();
+    // top_k_values[last_k] = max_element.value;
+    int shard_index = max_element.index;
+    top_k_indices[last_k] = entries[shard_index].index;
+    top_k_values[last_k] = static_cast<float>(max_element.value);
+  }
+}
+
+template <typename T>
+__global__ void arg_topk_forward_kernel(T const *__restrict__ input,
+                                        size_t shared_memory_size,
+                                        int length,
+                                        int k,
+                                        bool sorted,
+                                        float *__restrict__ output,
+                                        int *__restrict__ indices,
+                                        bool speculative_decoding) {
+  __shared__ char shared_memory[48 << 10];
+  int const batch_index = blockIdx.x;
+  T const *batch_input = input + batch_index * length;
+  int const thread_index = threadIdx.x;
+  int const thread_count = blockDim.x;
+  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
+  heapArgTopK<T, StridedData>(
+      batch_input, length, k, shared_entries, true, thread_index, thread_count);
+  __syncthreads();
+  if (thread_index == 0) {
+    int const offset = batch_index * k;
+    auto batch_output = output + offset;
+    auto batch_indices = indices + offset;
+    Entry<T> *top_k_heap = shared_entries + thread_count * k;
+    mergeShards(thread_count,
+                k,
+                shared_entries,
+                top_k_heap,
+                batch_output,
+                batch_indices,
+                speculative_decoding);
+  }
+}
+
+/*static*/
+template <typename DT>
+void ArgTopK::forward_kernel(
+    ArgTopKMeta const *m,
+    DT const *input_ptr,
+    float *output_ptr,
+    int *indices_ptr,
+    size_t batch_size,
+    int length,
+    int k,
+    bool sorted,
+    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
+    cudaStream_t stream) {
+  // Adopted from TensorFlow's ArgTopK implementation
+  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
+  int num_shards = 0;
+  {
+    constexpr auto shared_memory_size = 48 << 10;
+    auto const heap_size = k * sizeof(Entry<DT>);
+    // shared_memory_size = (num_shards + 1) * heap_size <=>
+    num_shards = shared_memory_size / heap_size - 1;
+    assert(num_shards > 0);
+    if (num_shards > CUDA_NUM_THREADS) {
+      num_shards = CUDA_NUM_THREADS;
+    }
+  }
+  // We are limited by the amount of shared memory we have per block.
+  size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
+  // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
+  size_t num_blocks = batch_size;
+
+  // all requests share the same number of branches
+  if (m->speculative_decoding) {
+    assert(bc->num_active_requests() >= 0);
+    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+    num_shards = k;
+    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        input_ptr,
+        shared_memory_size,
+        length,
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
+        sorted,
+        output_ptr,
+        indices_ptr,
+        m->speculative_decoding);
+  } else {
+
+    assert(num_shards >= (size_t)k);
+    num_shards = k;
+    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        input_ptr,
+        shared_memory_size,
+        length,
+        k,
+        sorted,
+        nullptr,
+        indices_ptr,
+        false);
+  }
+}
+
+/*static*/
+void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
+                                     GenericTensorAccessorR const &input,
+                                     // float *output_ptr,
+                                     GenericTensorAccessorW const &probs,
+                                     GenericTensorAccessorW const &indices,
+                                     int batch_size,
+                                     BatchConfig const *bc) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+
+  // Domain in1_domain = runtime->get_index_space_domain(
+  //     ctx, task->regions[0].region.get_index_space());
+  //   Domain out1_domain = runtime->get_index_space_domain(
+  //       ctx, task->regions[1].region.get_index_space());
+  // Domain out2_domain = runtime->get_index_space_domain(
+  //     ctx, task->regions[1].region.get_index_space());
+  int numdims = input.domain.get_dim();
+  assert(indices.domain.get_dim() == numdims);
+
+  int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+  // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1;
+  int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1;
+
+  // assert(out1_domain == out2_domain);
+  for (int i = 1; i < input.domain.get_dim(); i++) {
+    assert(input.domain.lo()[i] == indices.domain.lo()[i]);
+    assert(input.domain.hi()[i] == indices.domain.hi()[i]);
+  }
+  // float const *in_ptr = helperGetTensorPointerRO<float>(
+  //     regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  //   float *value_ptr = helperGetTensorPointerWO<float>(
+  //       regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  // int *index_ptr = helperGetTensorPointerWO<int>(
+  //    regions[1], task->regions[1], FID_DATA, ctx, runtime);
+
+  int length = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+  int k = indices.domain.hi()[0] - indices.domain.lo()[0] +
+          1; /*TODO: This prints to 5*/
+
+  // batch_size = input.domain.get_volume() / length;
+  // assert(indices.domain.get_volume() / k == batch_size);
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+
+  if (input.data_type == DT_HALF) {
+    ArgTopK::forward_kernel(m,
+                            input.get_half_ptr(),
+                            m->speculative_decoding ? probs.get_float_ptr()
+                                                    : nullptr,
+                            indices.get_int32_ptr(),
+                            batch_size,
+                            length,
+                            k,
+                            m->sorted,
+                            m->speculative_decoding ? bc : nullptr,
+                            stream);
+  } else if (input.data_type == DT_FLOAT) {
+    ArgTopK::forward_kernel(m,
+                            input.get_float_ptr(),
+                            m->speculative_decoding ? probs.get_float_ptr()
+                                                    : nullptr,
+                            indices.get_int32_ptr(),
+                            batch_size,
+                            length,
+                            k,
+                            m->sorted,
+                            m->speculative_decoding ? bc : nullptr,
+                            stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    printf("[ArgTopK] forward time = %.2lfms\n", elapsed);
+  }
+}
+
+ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op)
+    : OpMeta(handler, op) {}
+
+}; // namespace FlexFlow

From be3a5221b793e7f6c046bb34434f76383a49e8a3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 11:45:16 -0400
Subject: [PATCH 225/667] Fixed a bug in verification.

---
 src/runtime/request_manager.cc | 36 ++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6e3d1916b..6d429baae 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1385,7 +1385,7 @@ void RequestManager::get_verify_results_greedy(
     ++layer_it;
     for (; layer_it != token_tree.tree_layers.end(); layer_it++) {
       // We skip the first layer
-      std::list<std::shared_ptr<TokenTreeNode>> &tree_layer = *layer_it;
+      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
 
       bool token_accepted_this_layer = false;
       int current_token_index_in_layer = 0;
@@ -1414,8 +1414,10 @@ void RequestManager::get_verify_results_greedy(
             // from_index: the index of the token in the tree (excluding the
             // pruned tokens)
             // to_index: the committed token index in the request
-            request.committed_tokens.push_back(Request::CommittedToken(
-                current_token_index, committed_token_index, node_ptr->id));
+            request.committed_tokens.push_back(
+                Request::CommittedToken(llm_result_offset + current_token_index,
+                                        committed_token_index,
+                                        node_ptr->id));
             request.tokens.push_back(node_ptr->id);
 
             token_accepted_this_layer = true;
@@ -1429,23 +1431,23 @@ void RequestManager::get_verify_results_greedy(
       }
       if (!token_accepted_this_layer) {
         // No token is accepted in this layer, we should stop the traversal
-        // However, we have to add the last sampled token as a correction from
-        // the LLM
-
-        // from_index: since this token is not in the token tree, the llm
-        // doesn't have its KV cache, so the from_index should be a place
-        // holder, which is -1
-        request.committed_tokens.push_back(Request::CommittedToken(
-            -1,
-            committed_token_index,
-            llm_verify_result
-                .token_ids[llm_result_offset + last_accepted_token_index]));
-        request.tokens.push_back(
-            llm_verify_result
-                .token_ids[llm_result_offset + last_accepted_token_index]);
         break;
       }
     }
+
+    // Add the last token (that is not verified by the LLM)
+    // from_index: since this token is not in the token tree, the llm
+    // doesn't have its KV cache, so the from_index should be a place
+    // holder, which is -1
+    request.committed_tokens.push_back(Request::CommittedToken(
+        -1,
+        committed_token_index,
+        llm_verify_result
+            .token_ids[llm_result_offset + last_accepted_token_index]));
+    request.tokens.push_back(
+        llm_verify_result
+            .token_ids[llm_result_offset + last_accepted_token_index]);
+
     llm_result_offset += request.num_tokens_in_batch;
 
     if (verbose) {

From 025ebbb6d30ec1ff117be9696100c7f85276d389 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 14:42:47 -0400
Subject: [PATCH 226/667] feat: more gumbel_topk adaption

---
 include/flexflow/ops/gumbel_topk.h |  6 ++-
 src/ops/gumbel_topk.cc             | 62 +++++++++++++++++-------------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/include/flexflow/ops/gumbel_topk.h b/include/flexflow/ops/gumbel_topk.h
index 3898383da..129b9e83b 100644
--- a/include/flexflow/ops/gumbel_topk.h
+++ b/include/flexflow/ops/gumbel_topk.h
@@ -83,7 +83,8 @@ class GumbelTopK : public Op {
   template <typename DT>
   static void forward_kernel(GumbelTopKMeta const *m,
                              DT const *input_ptr,
-                             float *output_ptr,
+                             float *log_probs_ptr,
+                             float *perturbed_log_probs_ptr,
                              int *indices_ptr,
                              size_t batch_size,
                              int length,
@@ -93,7 +94,8 @@ class GumbelTopK : public Op {
                              ffStream_t stream);
   static void forward_kernel_wrapper(GumbelTopKMeta const *m,
                                      GenericTensorAccessorR const &input,
-                                     GenericTensorAccessorW const &prob,
+                                     GenericTensorAccessorW const &log_probs,
+                                     GenericTensorAccessorW const &perturbed_log_probs,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
                                      BatchConfig const *bc);
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 53020dd11..0623569e8 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -120,7 +120,7 @@ GumbelTopKParams GumbelTopK::get_params() const {
 }
 
 bool GumbelTopKParams::is_valid(ParallelTensorShape const &) const {
-  // topk is always valid
+  // gumbel_topk is always valid
   return true;
 }
 
@@ -142,7 +142,7 @@ GumbelTopK::GumbelTopK(FFModel &model,
          name,
          1 /*inputs*/,
          0 /*weights*/,
-         _speculative_decoding ? 2 : 1 /*outputs*/,
+         _speculative_decoding ? 3 : 1 /*outputs*/,
          _input),
       k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) {
   // overwrite layer_guid
@@ -162,6 +162,8 @@ GumbelTopK::GumbelTopK(FFModel &model,
   if (_speculative_decoding) {
     outputs[1] = model.create_parallel_tensor_legion_ordering(
         numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
+    outputs[2] = model.create_parallel_tensor_legion_ordering(
+        numdim, dims, DT_FLOAT, this, 2 /*owner_idx*/);
   }
 }
 
@@ -226,7 +228,7 @@ void GumbelTopK::init_inference(FFModel const &ff,
   //                                                     WRITE_ONLY,
   //                                                     EXCLUSIVE,
   //                                                     batch_outputs[1]->region));
-  //   launcher.add_field(2, FID_DATA);
+  launcher.add_field(2, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -259,12 +261,6 @@ void GumbelTopK::init(FFModel const &ff) {
                                                     EXCLUSIVE,
                                                     outputs[0]->region));
   launcher.add_field(1, FID_DATA);
-  //   launcher.add_region_requirement(RegionRequirement(outputs[1]->part,
-  //                                                     0 /*projection id*/,
-  //                                                     WRITE_ONLY,
-  //                                                     EXCLUSIVE,
-  //                                                     outputs[1]->region));
-  //   launcher.add_field(2, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap(ff, fm);
@@ -274,16 +270,16 @@ OpMeta *GumbelTopK::init_task(Task const *task,
                            std::vector<PhysicalRegion> const &regions,
                            Context ctx,
                            Runtime *runtime) {
-  GumbelTopK *topk = (GumbelTopK *)task->args;
+  GumbelTopK *gumbel_topk = (GumbelTopK *)task->args;
   FFHandler handle = *((FFHandler *)task->local_args);
-  GumbelTopKMeta *m = new GumbelTopKMeta(handle, topk);
-  m->profiling = topk->profiling;
-  m->inference_debugging = topk->inference_debugging;
-  m->sorted = topk->sorted;
-  m->k = topk->k;
-  std::strcpy(m->op_name, topk->name);
-  m->layer_guid = topk->layer_guid;
-  m->speculative_decoding = topk->speculative_decoding;
+  GumbelTopKMeta *m = new GumbelTopKMeta(handle, gumbel_topk);
+  m->profiling = gumbel_topk->profiling;
+  m->inference_debugging = gumbel_topk->inference_debugging;
+  m->sorted = gumbel_topk->sorted;
+  m->k = gumbel_topk->k;
+  std::strcpy(m->op_name, gumbel_topk->name);
+  m->layer_guid = gumbel_topk->layer_guid;
+  m->speculative_decoding = gumbel_topk->speculative_decoding;
   return m;
 }
 
@@ -323,6 +319,7 @@ FutureMap GumbelTopK::inference(
                                                       EXCLUSIVE,
                                                       batch_inputs[0]->region));
     launcher.add_field(0, FID_DATA);
+
     launcher.add_region_requirement(
         RegionRequirement(batch_outputs[0]->part,
                           0 /*projection id*/,
@@ -338,8 +335,16 @@ FutureMap GumbelTopK::inference(
                           EXCLUSIVE,
                           batch_outputs[1]->region));
     launcher.add_field(2, FID_DATA);
-    return runtime->execute_index_space(ctx, launcher);
 
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[2]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[2]->region));
+    launcher.add_field(3, FID_DATA);
+
+    return runtime->execute_index_space(ctx, launcher);
   } else {
     IndexLauncher launcher(GUMBEL_TOPK_INF_TASK_ID,
                            parallel_is,
@@ -363,6 +368,7 @@ FutureMap GumbelTopK::inference(
                           EXCLUSIVE,
                           batch_outputs[0]->region));
     launcher.add_field(1, FID_DATA);
+
     return runtime->execute_index_space(ctx, launcher);
   }
 }
@@ -387,11 +393,12 @@ InferenceResult
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
       DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW probs;
+  GenericTensorAccessorW log_probs;
+  GenericTensorAccessorW perturbed_log_probs;
 
   int batch_size = bc->num_active_tokens();
   GumbelTopK::forward_kernel_wrapper(
-      m, input, probs, indices, batch_size, nullptr);
+      m, input, log_probs, perturbed_log_probs, indices, batch_size, nullptr);
 
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
@@ -411,8 +418,8 @@ InferenceResult GumbelTopK::inference_speculative_task(
     std::vector<PhysicalRegion> const &regions,
     Context ctx,
     Runtime *runtime) {
-  assert(regions.size() == 3);
-  assert(task->regions.size() == 3);
+  assert(regions.size() == 4);
+  assert(task->regions.size() == 4);
   BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
   if (bc.num_active_tokens() == 0) {
     // Directly return for empty batch config
@@ -425,16 +432,19 @@ InferenceResult GumbelTopK::inference_speculative_task(
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
       DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO(
+  GenericTensorAccessorW log_probs = helperGetGenericTensorAccessorWO(
       DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW perturbed_log_probs = helperGetGenericTensorAccessorWO(
+      DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime);
 
   int batch_size = bc.num_active_tokens();
-  GumbelTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
+  GumbelTopK::forward_kernel_wrapper(m, input, log_probs, perturbed_log_probs, indices, batch_size, &bc);
 
   InferenceResult ir;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
-  download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
+  download_tensor<float>(log_probs.get_float_ptr(), ir.probs, batch_size * m->k);
+  download_tensor<float>(perturbed_log_probs.get_float_ptr(), ir.topk_logits, batch_size * m->k);
   return ir;
 }
 

From 790c7b9a5fa90a4fa6a5d44ed7863987443caea5 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Thu, 9 May 2024 14:43:22 -0400
Subject: [PATCH 227/667] feat: implemented GumbelTopK cuda kernel

---
 src/ops/gumbel_topk.cu | 176 ++++++++++++++++++++++++-----------------
 1 file changed, 104 insertions(+), 72 deletions(-)

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 491b255be..f0dd412ef 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "flexflow/ops/arg_topk.h"
+#include "flexflow/ops/gumbel_topk.h"
 #include "flexflow/utils/cuda_helper.h"
 
 namespace FlexFlow {
@@ -24,17 +24,18 @@ enum class HeapType { kMinHeap, kMaxHeap };
 enum class PreferIndices { kLower, kHigher };
 
 template <typename T>
-struct Entry {
+struct GumbelEntry {
   int index;
   T value;
+  T perturbed_value;
 };
 
 template <typename T>
 struct LinearData {
-  typedef Entry<T> Entry;
+  typedef GumbelEntry<T> GumbelEntry;
 
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
+  __device__ GumbelEntry &operator[](std::size_t i) const {
+    return data[i];
   }
 
   __device__ int get_index(int i) const {
@@ -43,16 +44,19 @@ struct LinearData {
   __device__ T get_value(int i) const {
     return data[i].value;
   }
+  __device__ T get_perturbed_value(int i) const {
+    return data[i].perturbed_value;
+  }
 
-  Entry *const data;
+  GumbelEntry *const data;
 };
 
 template <typename T>
 struct IndirectLinearData {
-  typedef Entry<T> Entry;
+  typedef GumbelEntry<T> GumbelEntry;
 
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
+  __device__ GumbelEntry &operator[](std::size_t i) const {
+    return data[i];
   }
 
   __device__ int get_index(int i) const {
@@ -61,17 +65,20 @@ struct IndirectLinearData {
   __device__ T get_value(int i) const {
     return data[i].value;
   }
+  __device__ T get_perturbed_value(int i) const {
+    return data[i].perturbed_value;
+  }
 
-  Entry *const data;
-  Entry *const backing_data;
+  GumbelEntry *const data;
+  GumbelEntry *const backing_data;
 };
 
 template <typename T>
 struct StridedData {
-  typedef Entry<T> Entry;
+  typedef GumbelEntry<T> GumbelEntry;
 
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index * blockDim.x + threadIdx.x];
+  __device__ GumbelEntry &operator[](std::size_t i) const {
+    return data[i * blockDim.x + threadIdx.x];
   }
 
   __device__ int get_index(int i) const {
@@ -80,25 +87,28 @@ struct StridedData {
   __device__ T get_value(int i) const {
     return (*this)[i].value;
   }
+  __device__ T get_perturbed_value(int i) const {
+    return (*this)[i].perturbed_value;
+  }
 
-  Entry *const data;
+  GumbelEntry *const data;
 };
 
-// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
+// A heap of GumbelEntry<T> that can either work as a min-heap or as a max-heap.
 template <HeapType heapType,
           PreferIndices preferIndices,
           template <typename>
           class Data,
           typename T>
 struct IndexedHeap {
-  typedef typename Data<T>::Entry Entry;
+  typedef typename Data<T>::GumbelEntry GumbelEntry;
   Data<T> const data;
   __device__ IndexedHeap(Data<T> const &d) : data(d) {}
 
   __device__ bool is_above(int left, int right) {
-    T left_value = data.get_value(left);
-    T right_value = data.get_value(right);
-    if (left_value == right_value) {
+    T left_perturbed_value = data.get_perturbed_value(left);
+    T right_perturbed_value = data.get_perturbed_value(right);
+    if (left_perturbed_value == right_perturbed_value) {
       if (preferIndices == PreferIndices::kLower) {
         return data.get_index(left) < data.get_index(right);
       } else {
@@ -106,13 +116,13 @@ struct IndexedHeap {
       }
     }
     if (heapType == HeapType::kMinHeap) {
-      return left_value < right_value;
+      return left_perturbed_value < right_perturbed_value;
     } else {
-      return left_value > right_value;
+      return left_perturbed_value > right_perturbed_value;
     }
   }
 
-  __device__ void assign(int i, Entry const &entry) {
+  __device__ void assign(int i, GumbelEntry const &entry) {
     data[i] = entry;
   }
 
@@ -183,12 +193,12 @@ struct IndexedHeap {
     }
   }
 
-  __device__ void replace_root(Entry const &entry, int k) {
+  __device__ void replace_root(GumbelEntry const &entry, int k) {
     data[0] = entry;
     push_root_down(k);
   }
 
-  __device__ Entry const &root() {
+  __device__ GumbelEntry const &root() {
     return data[0];
   }
 };
@@ -199,24 +209,27 @@ template <HeapType heapType,
           class Data,
           typename T>
 __device__ IndexedHeap<heapType, preferIndices, Data, T>
-    make_indexed_heap(typename Data<T>::Entry *data) {
+    make_indexed_heap(typename Data<T>::GumbelEntry *data) {
   return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
 }
 
-// heapArgTopK walks over [input, input+length) with `step_size` stride starting
+// heapGumbelTopK walks over [input, input+length) with `step_size` stride starting
 // at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
 // using `Accessor` to access elements in `heap_entries`. If sorted=true, the
 // elements will be sorted at the end.
+// NOTE that it applies Gumbel trick on `input`, which is,
+// input -> log(input) - log(-log(U)), where U is a uniform random number in (0, 1).
 template <typename T, template <typename> class Data = LinearData>
-__device__ void heapArgTopK(T const *__restrict__ input,
+__device__ void heapGumbelTopK(T const *__restrict__ input,
                             int length,
                             int k,
-                            Entry<T> *__restrict__ heap_entries,
+                            GumbelEntry<T> *__restrict__ heap_entries,
                             bool sorted = false,
                             int start_index = 0,
                             int step_size = 1) {
   assert(k <= length);
 
+  // TODO: apply uniform random
   auto heap =
       make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
           heap_entries);
@@ -228,7 +241,9 @@ __device__ void heapArgTopK(T const *__restrict__ input,
   // Initialize the min-heap.
   for (int index = start_index, slot = 0; index < heap_end_index;
        index += step_size, slot++) {
-    heap.assign(slot, {index, input[index]});
+    T value = log(input[index]);
+    T perturbed_value = value - log(-log(1.0f * (index + 1) / length));
+    heap.assign(slot, {index, value, perturbed_value});
   }
 
   heap.build(k);
@@ -239,9 +254,11 @@ __device__ void heapArgTopK(T const *__restrict__ input,
   for (int index = heap_end_index; index < length; index += step_size) {
     // We prefer elements with lower indices. This is given here.
     // Later elements automatically have higher indices, so can be discarded.
-    if (input[index] > heap.root().value) {
+    T value = log(input[index]);
+    T perturbed_value = value - log(-log(1.0f * (index + 1) / length));
+    if (perturbed_value > heap.root().perturbed_value) {
       // This element should replace the min.
-      heap.replace_root({index, input[index]}, k);
+      heap.replace_root({index, value, perturbed_value}, k);
     }
   }
 
@@ -254,15 +271,16 @@ __device__ void heapArgTopK(T const *__restrict__ input,
 // mergeShards performs a top-k merge on `num_shards` many sorted streams that
 // are sorted and stored in `entries` in a strided way:
 // |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
-// The overall top k elements are written to `top_k_values` and their indices
-// to top_k_indices.
+// The overall top k elements are written to `top_k_values` and `top_k_perturbed_values`,
+// and their indices to `top_k_indices`.
 // `top_k_heap` is used as temporary storage for the merge heap.
 template <typename T>
 __device__ void mergeShards(int num_shards,
                             int k,
-                            Entry<T> *__restrict__ entries,
-                            Entry<T> *__restrict__ top_k_heap,
+                            GumbelEntry<T> *__restrict__ entries,
+                            GumbelEntry<T> *__restrict__ top_k_heap,
                             float *top_k_values,
+                            float *top_k_perturbed_values,
                             int *top_k_indices,
                             bool speculative_decoding) {
   // If k < num_shards, we can use a min-heap with k elements to get the top k
@@ -279,7 +297,7 @@ __device__ void mergeShards(int num_shards,
                                 T>{IndirectLinearData<T>{top_k_heap, entries}};
     // Initialize the heap as a min-heap.
     for (int slot = 0; slot < heap_size; slot++) {
-      min_heap.assign(slot, {slot, entries[slot].value});
+      min_heap.assign(slot, {slot, entries[slot].value, entries[slot].perturbed_value});
     }
     min_heap.build(heap_size);
 
@@ -287,15 +305,15 @@ __device__ void mergeShards(int num_shards,
     for (int shard = heap_size; shard < num_shards; shard++) {
       auto const entry = entries[shard];
       auto const root = min_heap.root();
-      if (entry.value < root.value) {
+      if (entry.perturbed_value < root.perturbed_value) {
         continue;
       }
-      if (entry.value == root.value &&
+      if (entry.perturbed_value == root.perturbed_value &&
           entry.index > entries[root.index].index) {
         continue;
       }
       // This element should replace the min.
-      min_heap.replace_root({shard, entry.value}, heap_size);
+      min_heap.replace_root({shard, entry.value, entry.perturbed_value}, heap_size);
     }
   }
 
@@ -313,58 +331,64 @@ __device__ void mergeShards(int num_shards,
     // k is treated specially.
     int const last_k = k - 1;
     for (int rank = 0; rank < last_k; rank++) {
-      Entry<T> const &max_element = max_heap.root();
+      GumbelEntry<T> const &max_element = max_heap.root();
+      int shard_index = max_element.index;
+      top_k_indices[rank] = entries[shard_index].index;
       if (speculative_decoding) {
         assert(top_k_values != nullptr);
         top_k_values[rank] = static_cast<float>(max_element.value);
+        top_k_perturbed_values[rank] = static_cast<float>(max_element.perturbed_value);
       }
-
-      int shard_index = max_element.index;
-      top_k_indices[rank] = entries[shard_index].index;
       int next_shard_index = shard_index + num_shards;
       // For rank < k-1, each top k heap still contains at least 1 element,
       // so we can draw a replacement.
-      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
+      max_heap.replace_root({next_shard_index, entries[next_shard_index].value, entries[next_shard_index].perturbed_value},
                             heap_size);
     }
 
     // rank == last_k.
-    Entry<T> const &max_element = max_heap.root();
-    // top_k_values[last_k] = max_element.value;
+    GumbelEntry<T> const &max_element = max_heap.root();
     int shard_index = max_element.index;
     top_k_indices[last_k] = entries[shard_index].index;
-    top_k_values[last_k] = static_cast<float>(max_element.value);
+    if (speculative_decoding) {
+      assert(top_k_values != nullptr);
+      top_k_values[last_k] = static_cast<float>(max_element.value);
+      top_k_perturbed_values[last_k] = static_cast<float>(max_element.perturbed_value);
+    }
   }
 }
 
 template <typename T>
-__global__ void arg_topk_forward_kernel(T const *__restrict__ input,
+__global__ void gumbel_topk_forward_kernel(T const *__restrict__ input,
                                         size_t shared_memory_size,
                                         int length,
                                         int k,
                                         bool sorted,
-                                        float *__restrict__ output,
+                                        float *__restrict__ log_probs_ptr,
+                                        float *__restrict__ perturbed_log_probs_ptr,
                                         int *__restrict__ indices,
                                         bool speculative_decoding) {
-  __shared__ char shared_memory[48 << 10];
+  __shared__ char shared_memory[48 << 10]; // block-wise shared memory
   int const batch_index = blockIdx.x;
   T const *batch_input = input + batch_index * length;
   int const thread_index = threadIdx.x;
   int const thread_count = blockDim.x;
-  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
-  heapArgTopK<T, StridedData>(
+  GumbelEntry<T> *shared_entries = (GumbelEntry<T> *)shared_memory;
+  heapGumbelTopK<T, StridedData>(
       batch_input, length, k, shared_entries, true, thread_index, thread_count);
   __syncthreads();
   if (thread_index == 0) {
     int const offset = batch_index * k;
-    auto batch_output = output + offset;
+    auto batch_log_probs_ptr = log_probs_ptr + offset;
+    auto batch_perturbed_log_probs_ptr = perturbed_log_probs_ptr + offset;
     auto batch_indices = indices + offset;
-    Entry<T> *top_k_heap = shared_entries + thread_count * k;
+    GumbelEntry<T> *top_k_heap = shared_entries + thread_count * k;
     mergeShards(thread_count,
                 k,
                 shared_entries,
                 top_k_heap,
-                batch_output,
+                batch_log_probs_ptr
+                batch_perturbed_log_probs_ptr,
                 batch_indices,
                 speculative_decoding);
   }
@@ -372,10 +396,11 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input,
 
 /*static*/
 template <typename DT>
-void ArgTopK::forward_kernel(
-    ArgTopKMeta const *m,
+void GumbelTopK::forward_kernel(
+    GumbelTopKMeta const *m,
     DT const *input_ptr,
-    float *output_ptr,
+    float *log_probs_ptr,
+    float *perturbed_log_probs_ptr,
     int *indices_ptr,
     size_t batch_size,
     int length,
@@ -388,7 +413,7 @@ void ArgTopK::forward_kernel(
   int num_shards = 0;
   {
     constexpr auto shared_memory_size = 48 << 10;
-    auto const heap_size = k * sizeof(Entry<DT>);
+    auto const heap_size = k * sizeof(GumbelEntry<DT>);
     // shared_memory_size = (num_shards + 1) * heap_size <=>
     num_shards = shared_memory_size / heap_size - 1;
     assert(num_shards > 0);
@@ -397,7 +422,7 @@ void ArgTopK::forward_kernel(
     }
   }
   // We are limited by the amount of shared memory we have per block.
-  size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
+  size_t shared_memory_size = (num_shards + 1) * k * sizeof(GumbelEntry<DT>);
   // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
   size_t num_blocks = batch_size;
 
@@ -406,36 +431,39 @@ void ArgTopK::forward_kernel(
     assert(bc->num_active_requests() >= 0);
     assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+    gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
-        output_ptr,
+        log_probs_ptr,
+        perturbed_log_probs_ptr
         indices_ptr,
         m->speculative_decoding);
   } else {
 
     assert(num_shards >= (size_t)k);
     num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+    gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         input_ptr,
         shared_memory_size,
         length,
         k,
         sorted,
         nullptr,
+        nullptr,
         indices_ptr,
         false);
   }
 }
 
 /*static*/
-void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
+void GumbelTopK::forward_kernel_wrapper(GumbelTopKMeta const *m,
                                      GenericTensorAccessorR const &input,
                                      // float *output_ptr,
-                                     GenericTensorAccessorW const &probs,
+                                     GenericTensorAccessorW const &log_probs,
+                                     GenericTensorAccessorW const &perturbed_log_probs,
                                      GenericTensorAccessorW const &indices,
                                      int batch_size,
                                      BatchConfig const *bc) {
@@ -481,9 +509,11 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
   }
 
   if (input.data_type == DT_HALF) {
-    ArgTopK::forward_kernel(m,
+    GumbelTopK::forward_kernel(m,
                             input.get_half_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
+                            m->speculative_decoding ? log_probs.get_float_ptr()
+                                                    : nullptr,
+                            m->speculative_decoding ? perturbed_log_probs.get_float_ptr()
                                                     : nullptr,
                             indices.get_int32_ptr(),
                             batch_size,
@@ -493,9 +523,11 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                             m->speculative_decoding ? bc : nullptr,
                             stream);
   } else if (input.data_type == DT_FLOAT) {
-    ArgTopK::forward_kernel(m,
+    GumbelTopK::forward_kernel(m,
                             input.get_float_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
+                            m->speculative_decoding ? log_probs.get_float_ptr()
+                                                    : nullptr,
+                            m->speculative_decoding ? perturbed_log_probs.get_float_ptr()
                                                     : nullptr,
                             indices.get_int32_ptr(),
                             batch_size,
@@ -515,11 +547,11 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
     checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
     cudaEventDestroy(t_start);
     cudaEventDestroy(t_end);
-    printf("[ArgTopK] forward time = %.2lfms\n", elapsed);
+    printf("[GumbelTopK] forward time = %.2lfms\n", elapsed);
   }
 }
 
-ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op)
+GumbelTopKMeta::GumbelTopKMeta(FFHandler handler, Op const *op)
     : OpMeta(handler, op) {}
 
 }; // namespace FlexFlow

From 2518d9b431512ed3c57ce9ea7127011e2a4b6022 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 15:41:15 -0400
Subject: [PATCH 228/667] Fixed some bugs and removed unused code.

---
 src/runtime/request_manager.cc | 168 +++------------------------------
 1 file changed, 13 insertions(+), 155 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6d429baae..f63aa792f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1383,7 +1383,8 @@ void RequestManager::get_verify_results_greedy(
     int current_token_index = 1; // Because we skip the root
     auto layer_it = token_tree.tree_layers.begin();
     ++layer_it;
-    for (; layer_it != token_tree.tree_layers.end(); layer_it++) {
+    for (int layer_index = 1; layer_index < token_tree.tree_layers.size();
+         layer_index++) {
       // We skip the first layer
       std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
 
@@ -1433,6 +1434,7 @@ void RequestManager::get_verify_results_greedy(
         // No token is accepted in this layer, we should stop the traversal
         break;
       }
+      ++layer_it;
     }
 
     // Add the last token (that is not verified by the LLM)
@@ -1453,7 +1455,8 @@ void RequestManager::get_verify_results_greedy(
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
       for (auto const &committed_token : request.committed_tokens) {
-        std::cout << committed_token.token_id << " ";
+        std::cout << committed_token.token_id << " ("
+                  << tokenizer_->Decode({committed_token.token_id}) << ") ";
       }
       std::cout << std::endl;
       std::string output = this->tokenizer_->Decode(request.tokens);
@@ -1735,7 +1738,6 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     }
     int result_offset = request.first_token_offset_in_batch *
                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-    int result_num = parent_num * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     int current_tree_size = request.causal_mask.tree_or_prompt_size;
     int empty_slots_on_tree = BatchConfig::MAX_SPEC_TREE_TOKEN_NUM -
                               current_tree_size; // The number of empty slots
@@ -1755,6 +1757,9 @@ bool RequestManager::add_tokens_to_spec_token_tree(
         tokens;
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
+      if (parent_ptr->pruned) {
+        continue;
+      }
       for (int child_pos = 0;
            child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
            child_pos++) {
@@ -1847,7 +1852,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     }
 
     std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
-        request.speculative_token_trees[0].tree_layers.back();
+        spec_token_tree.tree_layers.back();
     for (auto it = last_layer.begin(); it != last_layer.end();) {
       if ((*it)->pruned) {
         it = last_layer.erase(it);
@@ -1857,160 +1862,13 @@ bool RequestManager::add_tokens_to_spec_token_tree(
       }
     }
     all_request_last_layer_empty &= last_layer.empty();
+
+    if (last_layer.empty()) {
+      spec_token_tree.tree_layers.pop_back();
+    }
   }
   return all_request_last_layer_empty;
 }
 
-// bool RequestManager::add_token_to_spec_token_tree(RequestGuid guid,
-//                                                   BatchConfig::TokenId
-//                                                   token_id, int parent_pos,
-//                                                   float log_accumulated_prob)
-//                                                   {
-//   // This method assumes only one small model is used for speculation
-//   // This method is called by update_ssm_inference_results()
-
-//   if (verbose) {
-//     std::cout << "add_token_to_spec_token_tree: guid=" << guid
-//               << " token_id=" << token_id << " parent_pos=" << parent_pos
-//               << " log_accumulated_prob=" << log_accumulated_prob <<
-//               std::endl;
-//   }
-
-//   // This is called after the first small model inference
-//   assert(current_speculation_step >= 1 &&
-//          "The current speculation step should be no less than 1");
-
-//   Request &request = all_requests[guid];
-//   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
-
-//   // Make sure there are enough layers in the speculation tree
-//   if (speculative_token_tree.tree_layers.size() == current_speculation_step)
-//   {
-//     // When adding the first token, we need to add a new layer
-//     speculative_token_tree.add_layer();
-//   } else {
-//     // To add a token, the tree depth is either the same as the current
-//     // speculation step or one more than the current speculation step.
-//     assert(speculative_token_tree.tree_layers.size() ==
-//                current_speculation_step + 1 &&
-//            "Invalid token tree depth");
-//   }
-
-//   bool remove_min_node = false;
-//   bool add_new_node = true;
-
-//   std::shared_ptr<TokenTreeNode> min_node_ptr = nullptr;
-//   RequestGuid min_node_guid = -1;
-//   if (token_tree_node_pool.size() > 0) {
-//     std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
-//         min_node_pair_in_pool = token_tree_node_pool.top();
-//     min_node_ptr = min_node_pair_in_pool.first;
-//     min_node_guid = min_node_pair_in_pool.second;
-//   }
-
-//   // We maintain the size of the token tree node pool to not exceed
-//   //  BatchConfig::MAX_NUM_TOKENS
-//   if (token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS) {
-//     // The pool is full, check if the new node has a higher joint probability
-//     // than the minimum node in the pool.
-
-//     if (log_accumulated_prob < min_node_ptr->log_accumulated_prob) {
-//       // Insertion failed
-//       add_new_node = false;
-//     } else {
-//       // Remove the minimum node from the pool, and set its pruned field to
-//       // true
-//       remove_min_node = true;
-//     }
-//   } else if (token_tree_node_pool.size() > BatchConfig::MAX_NUM_TOKENS) {
-//     assert(false && "The size of the token tree node pool should not exceed "
-//                     "BatchConfig::MAX_NUM_TOKENS");
-//   }
-//   // Do nothing if the pool is not full
-
-//   // The request's token tree size should not exceed
-//   // BatchConfig::MAX_SPEC_TREE_TOKEN_NUM
-//   // The judgement is done here to avoid the case where the tree is full but
-//   a
-//   // node is pruned.
-//   if (speculative_token_tree.tree_size ==
-//       BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
-//     if (remove_min_node && guid == min_node_guid) {
-//       // The minimum node in the pool is pruned, and it's in the same request
-//       // with the new node. Only in this case we can add the new node.
-//       // Because remove_min_node is true means that the new node has a higher
-//       // joint probability than the minimum node in the pool.
-//       add_new_node = true;
-//     } else {
-//       // Otherwise, we cannot add the new node, and we don't need to expel
-//       the
-//       // minimum node from the pool.
-//       add_new_node = false;
-//       remove_min_node = false;
-//     }
-//   } else if (speculative_token_tree.tree_size >
-//              BatchConfig::MAX_SPEC_TREE_TOKEN_NUM) {
-//     assert(false && "The size of the token tree should not exceed "
-//                     "BatchConfig::MAX_SPEC_TREE_TOKEN_NUM");
-//   }
-
-//   assert(!(remove_min_node && !add_new_node) &&
-//          "The minimum node should be removed only when the new node is
-//          added");
-
-//   if (remove_min_node) {
-//     // Remove the minimum node from the pool, and set its pruned field to
-//     true min_node_ptr->pruned = true; token_tree_node_pool.pop();
-//     all_requests[min_node_guid].speculative_token_trees[0].tree_size--;
-//   }
-
-//   if (add_new_node) {
-//     // Add the new node to the pool and the last layer of the speculation
-//     tree auto node_ptr = std::make_shared<TokenTreeNode>(
-//         token_id, log_accumulated_prob, parent_pos);
-//     token_tree_node_pool.push(std::make_pair(node_ptr, guid));
-//     speculative_token_tree.tree_layers.back().push_back(node_ptr);
-//     speculative_token_tree.tree_size++;
-//     speculative_token_tree.tree_size_including_pruned++;
-//   }
-//   return add_new_node;
-// }
-
-// bool RequestManager::prune_last_layer_of_spec_token_trees() {
-//   // Returns true if the last layers of the token tree of all requests are
-//   empty for (int request_idx = 0; request_idx <
-//   BatchConfig::MAX_NUM_REQUESTS;
-//        request_idx++) {
-//     RequestGuid guid = request_idx;
-//     if (all_requests[guid].status != Request::RUNNING) {
-//       continue;
-//     }
-//     if (prune_last_layer_of_spec_token_tree(guid)) {
-//       return true;
-//     }
-//   }
-//   Request &request = all_requests[guid];
-
-//   if (request.speculative_token_trees[0].tree_layers.size() <=
-//       current_speculation_step) {
-//     // There are no tokens in the last layer
-//     return true;
-//   }
-//   auto &last_layer = request.speculative_token_trees[0].tree_layers.back();
-//   for (auto it = last_layer.begin(); it != last_layer.end();) {
-//     if ((*it)->pruned) {
-//       it = last_layer.erase(it);
-//       request.speculative_token_trees[0].tree_size--;
-//       request.speculative_token_trees[0].tree_size_including_pruned--;
-//     } else {
-//       ++it;
-//     }
-//   }
-
-//   if (last_layer.empty()) {
-//     return true;
-//   }
-//   return false;
-// }
 /* --------- Request Token Tree Related Functions --------- */
 }; // namespace FlexFlow

From 7ebff89555af1084320978fd039b7073d013fa35 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 15:53:28 -0400
Subject: [PATCH 229/667] Added -ll:cpu 4 to the test commands.

---
 tests/inference/cpp_inference_tests.sh | 64 +++++++++++++-------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh
index 8beea5599..eda43a0e2 100755
--- a/tests/inference/cpp_inference_tests.sh
+++ b/tests/inference/cpp_inference_tests.sh
@@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}"
 ###############################################################################################
 
 # LLAMA
-../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
+../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
 # LLAMA (half precision)
-../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4
+../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4
 
 # OPT
-../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4
+../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4
 # OPT (half precision)
-../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4
+../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4
 
 # Tensor parallelism tests
 if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
     # LLAMA
-    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
     # LLAMA (half precision)
-    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
     
     # OPT
-    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
     # OPT (half precision)
-    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
 fi
 
 ###############################################################################################
@@ -37,63 +37,63 @@ fi
 ###############################################################################################
 
 # LLAMA (small model)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4
 
 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1
 
 # LLAMA (small model, half precision)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4
 
 # LLAMA (big model)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4
 # LLAMA (big model, half precision)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4
 
 # OPT (small model)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4
 # OPT (small model, half precision)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4
 
 # OPT (big model)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4
 # OPT (big model, half precision)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4
 
 # Falcon (full precision)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4
 # Falcon (half precision)
-# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4
+# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4
 
 # # StarCoder (full precision)
-# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4
+# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4
 # # StarCoder (half precision)
-# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4
+# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4
 
 # Tensor parallelism tests
 if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then
     # LLAMA (small model)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # LLAMA (small model, half precision)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
 
     # LLAMA (big model)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
     # LLAMA (big model, half precision)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
 
     # OPT (small model)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
     # OPT (small model, half precision)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4
 
     # OPT (big model)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
     # OPT (big model, half precision)
-    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
+    ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2
 fi
 
 ###############################################################################################

From 4f3ac8ce4e58b7a48b2f4cf710241c5e24623028 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 16:04:19 -0400
Subject: [PATCH 230/667] Added some output.

---
 src/runtime/request_manager.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f63aa792f..ebe38ed97 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -679,6 +679,10 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     bc.num_tokens_to_commit++;
   }
 
+  if (verbose) {
+    std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
+    bc.print();
+  }
   return bc;
 }
 
@@ -729,6 +733,10 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     bc.num_tokens++;
   }
 
+  if (verbose) {
+    std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl;
+    bc.print();
+  }
   return bc;
 }
 
@@ -774,6 +782,10 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.num_tokens++;
   }
 
+  if (verbose) {
+    std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl;
+    bc.print();
+  }
   return bc;
 }
 /* ----- Speculative Inference Specific functions ----- */

From 3eb83f056cfe6506086ec0380b3f5d4d61304276 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 19:54:09 -0400
Subject: [PATCH 231/667] Used function calls to replace BatchConfig constants.

---
 src/runtime/request_manager.cc | 36 ++++++++++++++++------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ebe38ed97..07830e828 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -339,7 +339,7 @@ size_t RequestManager::get_num_processed_requests() {
 
 int RequestManager::get_num_active_requests() {
   int count = 0;
-  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+  for (int i = 0; i < get_max_requests_per_batch(); i++) {
     if (guid_of_requests[i] != INVALID_GUID) {
       count++;
     }
@@ -348,7 +348,7 @@ int RequestManager::get_num_active_requests() {
 }
 
 int RequestManager::get_empty_request_index() {
-  for (int i = 0; i < BatchConfig::MAX_NUM_REQUESTS; i++) {
+  for (int i = 0; i < get_max_requests_per_batch(); i++) {
     if (guid_of_requests[i] == INVALID_GUID) {
       return i;
     }
@@ -552,7 +552,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   bool request_completed = false;
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable
@@ -648,7 +648,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   bc.requestsInfo[request_index].first_token_index_in_request =
       prefill_request->llm_cache_size;
   bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      BatchConfig::MAX_NUM_TOKENS,
+      get_max_tokens_per_batch(),
       (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
 
   prefill_request->first_token_offset_in_batch = 0;
@@ -712,7 +712,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   bc.requestsInfo[request_index].first_token_index_in_request =
       prefill_request->ssm_cache_size;
   bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      BatchConfig::MAX_NUM_TOKENS,
+      get_max_tokens_per_batch(),
       (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
 
   prefill_request->first_token_offset_in_batch = 0;
@@ -757,7 +757,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
             std::begin(bc.request_available));
   bc.num_available_requests = num_available_requests;
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        request_index++) {
     if (!request_available[request_index]) {
       continue;
@@ -813,7 +813,7 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
             std::begin(new_bc.request_available));
   new_bc.num_available_requests = num_available_requests;
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       continue;
@@ -888,7 +888,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
             std::begin(new_bc.request_available));
   new_bc.num_available_requests = num_available_requests;
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       continue;
@@ -978,7 +978,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
             std::begin(new_bc.request_available));
   new_bc.num_available_requests = num_available_requests;
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       continue;
@@ -1068,7 +1068,7 @@ bool RequestManager::update_llm_verify_results(
 
   // Update llm_cache_size with the last committed_tokens, and clear
   // committed_tokens
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable
@@ -1088,7 +1088,7 @@ bool RequestManager::update_llm_verify_results(
   bool request_completed = false;
 
   // Iterate over the requests
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable
@@ -1143,7 +1143,7 @@ bool RequestManager::update_ssm_inference_results(
   bool all_request_last_layer_empty =
       add_tokens_to_spec_token_tree(ssm_inference_result);
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable
@@ -1367,7 +1367,7 @@ void RequestManager::get_verify_results_greedy(
   int llm_result_offset = 0;
   // This function maintain the generated token list of the request and the
   // committed tokens.
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       continue;
@@ -1395,8 +1395,7 @@ void RequestManager::get_verify_results_greedy(
     int current_token_index = 1; // Because we skip the root
     auto layer_it = token_tree.tree_layers.begin();
     ++layer_it;
-    for (int layer_index = 1; layer_index < token_tree.tree_layers.size();
-         layer_index++) {
+    for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
       // We skip the first layer
       std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
 
@@ -1446,7 +1445,6 @@ void RequestManager::get_verify_results_greedy(
         // No token is accepted in this layer, we should stop the traversal
         break;
       }
-      ++layer_it;
     }
 
     // Add the last token (that is not verified by the LLM)
@@ -1732,7 +1730,7 @@ void RequestManager::add_root_to_spec_token_tree(
 bool RequestManager::add_tokens_to_spec_token_tree(
     InferenceResult const &ssm_inference_result) {
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable
@@ -1760,7 +1758,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     }
 
     bool token_pool_full =
-        token_tree_node_pool.size() == BatchConfig::MAX_NUM_TOKENS;
+        token_tree_node_pool.size() == get_max_tokens_per_batch();
 
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
@@ -1847,7 +1845,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
 
   bool all_request_last_layer_empty = true;
 
-  for (int request_index = 0; request_index < BatchConfig::MAX_NUM_REQUESTS;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
       // Request in this slot is unavailable

From 60618f84fb4d8f435bc52648d7bb1201337729a9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 19:56:43 -0400
Subject: [PATCH 232/667] Removed unused code.

---
 src/runtime/batch_config.cc | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 0bf2d1ea8..9938a8356 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -25,13 +25,6 @@ LegionRuntime::Logger::Category log_bc("BatchConfig");
 using Legion::Future;
 using Legion::Memory;
 
-// BatchConfig::BatchConfig() : model_id(0), inference_mode(INC_DECODING_MODE) {
-//   std::fill(std::begin(request_available), std::end(request_available), 0);
-//   // Don't need to initialize requestInfo ,tokensInfo, causalMask and
-//   // committed_tokens here because they initialize themselves.
-//   // Other fields are already initialized to proper value.
-// }
-
 BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_)
     : model_id(model_id_), inference_mode(inference_mode_) {
   std::fill(std::begin(request_available), std::end(request_available), 0);
@@ -40,26 +33,6 @@ BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_)
   // Other fields are already initialized to proper value.
 }
 
-/*static*/
-// BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future)
-// {
-//   BatchConfig const *bc = static_cast<BatchConfig const *>(
-//       Future(future).get_buffer(Memory::SYSTEM_MEM));
-//   // Check future size
-//   if (bc->get_mode() == INC_DECODING_MODE) {
-//     assert(Future(future).get_untyped_size() == sizeof(BatchConfig));
-//   } else if (bc->get_mode() == TREE_SEARCH_MODE) {
-//     assert(Future(future).get_untyped_size() ==
-//     sizeof(TreeSearchBatchConfig));
-//   } else if (bc->get_mode() == TREE_VERIFY_MODE) {
-//     assert(Future(future).get_untyped_size() ==
-//     sizeof(TreeVerifyBatchConfig));
-//   } else {
-//     assert(false && "Unsupported inference mode");
-//   }
-//   return bc;
-// }
-
 /*static*/
 BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) {
   return static_cast<BatchConfig const *>(

From ac950f74bec428df2532ae1d791a8d848cd35940 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 21:46:55 -0400
Subject: [PATCH 233/667] Removed unused code.

---
 src/runtime/request_manager.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 07830e828..4c0684994 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -951,7 +951,6 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
 
 /***** Verify Phase *****/
 BatchConfig RequestManager::prepare_verify_batch_config() {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   if (verbose) {
     std::cout
         << "\n############### prepare_verify_batch_config ###############\n";

From 7631e3a5471e17afad0456872497e956da250615 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 9 May 2024 21:48:10 -0400
Subject: [PATCH 234/667] For debug purpose, modified some instruction and
 settings.

---
 inference/spec_infer/spec_infer.cc     | 4 ++--
 tests/inference/cpp_inference_tests.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 7df39e94c..6fe8e7619 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -274,10 +274,10 @@ void FlexFlow::top_level_task(Task const *task,
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
-  int max_requests_per_batch = 16;
+  int max_requests_per_batch = 8;
   int max_tokens_per_batch = 256;
   int max_sequence_length = 256;
-  int max_spec_tree_token_num = 23;
+  int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::SPECULATIVE_DECODING;
diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh
index eda43a0e2..c9e4b3621 100755
--- a/tests/inference/cpp_inference_tests.sh
+++ b/tests/inference/cpp_inference_tests.sh
@@ -10,7 +10,7 @@ cd "${BASH_SOURCE[0]%/*}"
 ###############################################################################################
 
 # LLAMA
-../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
+../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4
 # LLAMA (half precision)
 ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4
 

From 2fbc7bf6d431482362d2996561502ce789220b5d Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 04:07:14 -0400
Subject: [PATCH 235/667] feat: add GumbelTopK cuda implementation

---
 include/flexflow/ops/gumbel_topk.h | 20 +++++++++++-
 src/ops/gumbel_topk.cc             |  7 +++-
 src/ops/gumbel_topk.cu             | 52 ++++++++++++++++++++++++------
 3 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/include/flexflow/ops/gumbel_topk.h b/include/flexflow/ops/gumbel_topk.h
index 129b9e83b..454c8e4fd 100644
--- a/include/flexflow/ops/gumbel_topk.h
+++ b/include/flexflow/ops/gumbel_topk.h
@@ -5,15 +5,33 @@
 #include "flexflow/model.h"
 #include "flexflow/node.h"
 #include "flexflow/ops/gumbel_topk_params.h"
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include <curand.h>
+#include <curand_kernel.h>
+#elif defined(FF_USE_HIP_ROCM)
+#include <hiprand/hiprand.h>
+#include <hiprand/hiprand_kernel.h>
+#endif
+#include "flexflow/utils/memory_allocator.h"
 
 namespace FlexFlow {
 
 class GumbelTopKMeta : public OpMeta {
 public:
-  GumbelTopKMeta(FFHandler handle, Op const *op);
   bool sorted;
   int k;
   bool speculative_decoding;
+  Realm::RegionInstance reserveInst;
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+  curandState *state;
+  int state_max_length;
+#elif defined(FF_USE_HIP_ROCM)
+  hiprandState *state;
+#endif
+  GumbelTopKMeta(FFHandler handle,
+                 Op const *op,
+                 MemoryAllocator &gpu_mem_allocator);
+  ~GumbelTopKMeta(void);
 };
 
 class GumbelTopK : public Op {
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 0623569e8..7a59131ad 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -272,7 +272,12 @@ OpMeta *GumbelTopK::init_task(Task const *task,
                            Runtime *runtime) {
   GumbelTopK *gumbel_topk = (GumbelTopK *)task->args;
   FFHandler handle = *((FFHandler *)task->local_args);
-  GumbelTopKMeta *m = new GumbelTopKMeta(handle, gumbel_topk);
+  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
+                       .only_kind(Memory::GPU_FB_MEM)
+                       .best_affinity_to(task->target_proc)
+                       .first();
+  MemoryAllocator gpu_mem_allocator(gpu_mem);
+  GumbelTopKMeta *m = new GumbelTopKMeta(handle, gumbel_topk, gpu_mem_allocator);
   m->profiling = gumbel_topk->profiling;
   m->inference_debugging = gumbel_topk->inference_debugging;
   m->sorted = gumbel_topk->sorted;
diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index f0dd412ef..a9929f578 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -213,6 +213,13 @@ __device__ IndexedHeap<heapType, preferIndices, Data, T>
   return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
 }
 
+__global__ void
+    init_random_kernel(curandState *state, int batch_size, long rand) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
+    curand_init(rand, i, 0, &state[i]);
+  }
+}
+
 // heapGumbelTopK walks over [input, input+length) with `step_size` stride starting
 // at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
 // using `Accessor` to access elements in `heap_entries`. If sorted=true, the
@@ -220,7 +227,8 @@ __device__ IndexedHeap<heapType, preferIndices, Data, T>
 // NOTE that it applies Gumbel trick on `input`, which is,
 // input -> log(input) - log(-log(U)), where U is a uniform random number in (0, 1).
 template <typename T, template <typename> class Data = LinearData>
-__device__ void heapGumbelTopK(T const *__restrict__ input,
+__device__ void heapGumbelTopK(const curandState state,
+                            T const *__restrict__ input,
                             int length,
                             int k,
                             GumbelEntry<T> *__restrict__ heap_entries,
@@ -229,7 +237,6 @@ __device__ void heapGumbelTopK(T const *__restrict__ input,
                             int step_size = 1) {
   assert(k <= length);
 
-  // TODO: apply uniform random
   auto heap =
       make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
           heap_entries);
@@ -242,7 +249,7 @@ __device__ void heapGumbelTopK(T const *__restrict__ input,
   for (int index = start_index, slot = 0; index < heap_end_index;
        index += step_size, slot++) {
     T value = log(input[index]);
-    T perturbed_value = value - log(-log(1.0f * (index + 1) / length));
+    T perturbed_value = value - log(-log(curand_uniform(state)));
     heap.assign(slot, {index, value, perturbed_value});
   }
 
@@ -255,7 +262,7 @@ __device__ void heapGumbelTopK(T const *__restrict__ input,
     // We prefer elements with lower indices. This is given here.
     // Later elements automatically have higher indices, so can be discarded.
     T value = log(input[index]);
-    T perturbed_value = value - log(-log(1.0f * (index + 1) / length));
+    T perturbed_value = value - log(-log(curand_uniform(state)));
     if (perturbed_value > heap.root().perturbed_value) {
       // This element should replace the min.
       heap.replace_root({index, value, perturbed_value}, k);
@@ -359,7 +366,8 @@ __device__ void mergeShards(int num_shards,
 }
 
 template <typename T>
-__global__ void gumbel_topk_forward_kernel(T const *__restrict__ input,
+__global__ void gumbel_topk_forward_kernel(curandState *state,
+                                        T const *__restrict__ input,
                                         size_t shared_memory_size,
                                         int length,
                                         int k,
@@ -375,7 +383,7 @@ __global__ void gumbel_topk_forward_kernel(T const *__restrict__ input,
   int const thread_count = blockDim.x;
   GumbelEntry<T> *shared_entries = (GumbelEntry<T> *)shared_memory;
   heapGumbelTopK<T, StridedData>(
-      batch_input, length, k, shared_entries, true, thread_index, thread_count);
+      state[thread_index + batch_index * thread_count], batch_input, length, k, shared_entries, true, thread_index, thread_count);
   __syncthreads();
   if (thread_index == 0) {
     int const offset = batch_index * k;
@@ -431,7 +439,15 @@ void GumbelTopK::forward_kernel(
     assert(bc->num_active_requests() >= 0);
     assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
+
+    size_t state_length = batch_size * num_shards;
+    init_random_kernel<<<GET_BLOCKS(state_length),
+                        min(CUDA_NUM_THREADS, state_length),
+                        0,
+                      stream>>>(m->state, state_length, rand());
+
     gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        m->state,
         input_ptr,
         shared_memory_size,
         length,
@@ -442,10 +458,17 @@ void GumbelTopK::forward_kernel(
         indices_ptr,
         m->speculative_decoding);
   } else {
-
     assert(num_shards >= (size_t)k);
     num_shards = k;
+    
+    size_t state_length = batch_size * num_shards;
+    init_random_kernel<<<GET_BLOCKS(state_length),
+                        min(CUDA_NUM_THREADS, state_length),
+                        0,
+                      stream>>>(m->state, state_length, rand());
+
     gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        m->state,
         input_ptr,
         shared_memory_size,
         length,
@@ -551,7 +574,18 @@ void GumbelTopK::forward_kernel_wrapper(GumbelTopKMeta const *m,
   }
 }
 
-GumbelTopKMeta::GumbelTopKMeta(FFHandler handler, Op const *op)
-    : OpMeta(handler, op) {}
+GumbelTopKMeta::GumbelTopKMeta(FFHandler handler,
+                               Op const *op,
+                               MemoryAllocator &gpu_mem_allocator)
+    : OpMeta(handler, op) {
+  state_max_length = BatchConfig::MAX_NUM_TOKENS * max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS);
+  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(curandState) * state_max_length);
+  state = gpu_mem_allocator.allocate_instance<curandState>(state_max_length);
+}
 
+GumbelTopKMeta::~GumbelTopKMeta(void) {
+  if (reserveInst != Realm::RegionInstance::NO_INST) {
+    reserveInst.destroy();
+  }
+}
 }; // namespace FlexFlow

From 4d83080302fa7446dd5b4fde24150e8e12e4ba6f Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 04:34:10 -0400
Subject: [PATCH 236/667] fix: minor typos

---
 src/ops/gumbel_topk.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index a9929f578..c284bf7d1 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -227,7 +227,7 @@ __global__ void
 // NOTE that it applies Gumbel trick on `input`, which is,
 // input -> log(input) - log(-log(U)), where U is a uniform random number in (0, 1).
 template <typename T, template <typename> class Data = LinearData>
-__device__ void heapGumbelTopK(const curandState state,
+__device__ void heapGumbelTopK(curandState state,
                             T const *__restrict__ input,
                             int length,
                             int k,
@@ -249,7 +249,7 @@ __device__ void heapGumbelTopK(const curandState state,
   for (int index = start_index, slot = 0; index < heap_end_index;
        index += step_size, slot++) {
     T value = log(input[index]);
-    T perturbed_value = value - log(-log(curand_uniform(state)));
+    T perturbed_value = value - log(-log(curand_uniform(&state)));
     heap.assign(slot, {index, value, perturbed_value});
   }
 
@@ -262,7 +262,7 @@ __device__ void heapGumbelTopK(const curandState state,
     // We prefer elements with lower indices. This is given here.
     // Later elements automatically have higher indices, so can be discarded.
     T value = log(input[index]);
-    T perturbed_value = value - log(-log(curand_uniform(state)));
+    T perturbed_value = value - log(-log(curand_uniform(&state)));
     if (perturbed_value > heap.root().perturbed_value) {
       // This element should replace the min.
       heap.replace_root({index, value, perturbed_value}, k);
@@ -440,9 +440,9 @@ void GumbelTopK::forward_kernel(
     assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
     num_shards = k;
 
-    size_t state_length = batch_size * num_shards;
+    int state_length = batch_size * num_shards;
     init_random_kernel<<<GET_BLOCKS(state_length),
-                        min(CUDA_NUM_THREADS, state_length),
+                        min((int)CUDA_NUM_THREADS, state_length),
                         0,
                       stream>>>(m->state, state_length, rand());
 
@@ -454,16 +454,16 @@ void GumbelTopK::forward_kernel(
         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
         sorted,
         log_probs_ptr,
-        perturbed_log_probs_ptr
+        perturbed_log_probs_ptr,
         indices_ptr,
         m->speculative_decoding);
   } else {
     assert(num_shards >= (size_t)k);
     num_shards = k;
     
-    size_t state_length = batch_size * num_shards;
+    int state_length = batch_size * num_shards;
     init_random_kernel<<<GET_BLOCKS(state_length),
-                        min(CUDA_NUM_THREADS, state_length),
+                        min((int)CUDA_NUM_THREADS, state_length),
                         0,
                       stream>>>(m->state, state_length, rand());
 

From 74e18f4ee62155e916568ed80c613ec33c68318d Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 04:38:30 -0400
Subject: [PATCH 237/667] fix: minor typo

---
 src/ops/gumbel_topk.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index c284bf7d1..7b00ac8fe 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -395,7 +395,7 @@ __global__ void gumbel_topk_forward_kernel(curandState *state,
                 k,
                 shared_entries,
                 top_k_heap,
-                batch_log_probs_ptr
+                batch_log_probs_ptr,
                 batch_perturbed_log_probs_ptr,
                 batch_indices,
                 speculative_decoding);

From 5239ee32af3eb31bca5130bdf2c42fe4046c7836 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 04:42:51 -0400
Subject: [PATCH 238/667] fix: unify the log function

---
 src/ops/gumbel_topk.cu | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 7b00ac8fe..0bdf9f244 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -220,6 +220,16 @@ __global__ void
   }
 }
 
+// Unified log function for float
+__device__ inline float unified_log(float x) {
+    return logf(x);
+}
+
+// Unified log function for half
+__device__ inline __half unified_log(__half x) {
+    return hlog(x);
+}
+
 // heapGumbelTopK walks over [input, input+length) with `step_size` stride starting
 // at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
 // using `Accessor` to access elements in `heap_entries`. If sorted=true, the
@@ -248,8 +258,8 @@ __device__ void heapGumbelTopK(curandState state,
   // Initialize the min-heap.
   for (int index = start_index, slot = 0; index < heap_end_index;
        index += step_size, slot++) {
-    T value = log(input[index]);
-    T perturbed_value = value - log(-log(curand_uniform(&state)));
+    T value = unified_log(input[index]);
+    T perturbed_value = value - unified_log(-unified_log((T)curand_uniform(&state)));
     heap.assign(slot, {index, value, perturbed_value});
   }
 
@@ -261,8 +271,8 @@ __device__ void heapGumbelTopK(curandState state,
   for (int index = heap_end_index; index < length; index += step_size) {
     // We prefer elements with lower indices. This is given here.
     // Later elements automatically have higher indices, so can be discarded.
-    T value = log(input[index]);
-    T perturbed_value = value - log(-log(curand_uniform(&state)));
+    T value = unified_log(input[index]);
+    T perturbed_value = value - unified_log(-unified_log((T)curand_uniform(&state)));
     if (perturbed_value > heap.root().perturbed_value) {
       // This element should replace the min.
       heap.replace_root({index, value, perturbed_value}, k);

From 54e274dbb8ea567946280b31c131928c9785f319 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 05:09:58 -0400
Subject: [PATCH 239/667] fix: some system utils' definitions

---
 include/flexflow/model.h | 3 +++
 src/runtime/graph.cc     | 1 +
 2 files changed, 4 insertions(+)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 7db864def..0346cf5cf 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -339,6 +339,7 @@ class Reshape;
 class Softmax;
 class Split;
 class TopK;
+class GumbelTopK;
 class ArgTopK;
 class Transpose;
 class RMSNorm;
@@ -1233,6 +1234,8 @@ class FFModel {
       std::unordered_map<std::pair<ParallelTensorShape, SoftmaxParams>,
                          Softmax *>,
       std::unordered_map<std::pair<ParallelTensorShape, TopKParams>, TopK *>,
+      std::unordered_map<std::pair<ParallelTensorShape, GumbelTopKParams>,
+                         GumbelTopK *>,
       std::unordered_map<std::pair<ParallelTensorShape, ArgTopKParams>,
                          ArgTopK *>,
       std::unordered_map<std::pair<ParallelTensorShape, TransposeParams>,
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 64a577132..0c41091e5 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -33,6 +33,7 @@
 #include "flexflow/ops/flat.h"
 #include "flexflow/ops/gather.h"
 #include "flexflow/ops/groupby.h"
+#include "flexflow/ops/gumbel_topk.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/layer_norm.h"
 #include "flexflow/ops/linear.h"

From c4c0430f182bcb8e0eb9e3531a0058f32eb779d8 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <aetiurf@gmail.com>
Date: Fri, 10 May 2024 05:17:27 -0400
Subject: [PATCH 240/667] fix: avoid conflict name

---
 src/ops/gumbel_topk.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 0bdf9f244..7ccd4ade0 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -214,7 +214,7 @@ __device__ IndexedHeap<heapType, preferIndices, Data, T>
 }
 
 __global__ void
-    init_random_kernel(curandState *state, int batch_size, long rand) {
+    init_random_state_kernel(curandState *state, int batch_size, long rand) {
   CUDA_KERNEL_LOOP(i, batch_size) {
     curand_init(rand, i, 0, &state[i]);
   }
@@ -451,7 +451,7 @@ void GumbelTopK::forward_kernel(
     num_shards = k;
 
     int state_length = batch_size * num_shards;
-    init_random_kernel<<<GET_BLOCKS(state_length),
+    init_random_state_kernel<<<GET_BLOCKS(state_length),
                         min((int)CUDA_NUM_THREADS, state_length),
                         0,
                       stream>>>(m->state, state_length, rand());
@@ -472,7 +472,7 @@ void GumbelTopK::forward_kernel(
     num_shards = k;
     
     int state_length = batch_size * num_shards;
-    init_random_kernel<<<GET_BLOCKS(state_length),
+    init_random_state_kernel<<<GET_BLOCKS(state_length),
                         min((int)CUDA_NUM_THREADS, state_length),
                         0,
                       stream>>>(m->state, state_length, rand());

From c2032261a37854eb313b1f6104251819273e0cc1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 10 May 2024 22:49:18 -0400
Subject: [PATCH 241/667] Added some debug outputs.

---
 include/flexflow/request_manager.h |  2 ++
 src/runtime/request_manager.cc     | 28 ++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 80ce1ca9f..70120cbfb 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -173,6 +173,8 @@ class TokenTree {
   TokenTree() : tree_size(0) {}
 };
 
+std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree);
+
 class RequestManager {
 public:
   enum State {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4c0684994..bf1d58e50 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1096,10 +1096,16 @@ bool RequestManager::update_llm_verify_results(
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
+    if (verbose) {
+      std::cout << "Request " << guid << " token tree: " << std::endl;
+      std::cout << request.speculative_token_trees[0];
+    }
+
     // Initialize the token tree for the request
     init_token_tree(guid);
     assert(!request.committed_tokens.empty() &&
            "The committed tokens should not be empty.");
+
     // Add the last committed token as the root of the speculative token tree
     add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id);
 
@@ -1299,7 +1305,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   bitmask.current_layer_size = new_layer_size;
   bitmask.tree_or_prompt_size += new_layer_size;
 
-  assert(bitmask.tree_or_prompt_size <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+  assert(bitmask.tree_or_prompt_size <= get_max_spec_tree_token_num());
 
   int parent_offset = previous_tree_size - last_layer_size;
   int child_offset = previous_tree_size;
@@ -1748,7 +1754,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     int result_offset = request.first_token_offset_in_batch *
                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     int current_tree_size = request.causal_mask.tree_or_prompt_size;
-    int empty_slots_on_tree = BatchConfig::MAX_SPEC_TREE_TOKEN_NUM -
+    int empty_slots_on_tree = get_max_spec_tree_token_num() -
                               current_tree_size; // The number of empty slots
 
     if (empty_slots_on_tree == 0) {
@@ -1879,5 +1885,23 @@ bool RequestManager::add_tokens_to_spec_token_tree(
   return all_request_last_layer_empty;
 }
 
+std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
+  os << "Token tree: " << std::endl;
+  int layer_idx = 0;
+  for (auto const &layer : token_tree.tree_layers) {
+    os << "Layer: " << layer_idx << std::endl;
+    int token_pos = 0;
+    for (auto const &node : layer) {
+      if (!node->pruned) {
+        os << "token pos: " << token_pos << "token id: " << node->id << "\t"
+           << "parent pos: " << node->parent_pos << "\t" << std::endl;
+      }
+      token_pos++;
+    }
+    layer_idx++;
+  }
+  return os;
+}
+
 /* --------- Request Token Tree Related Functions --------- */
 }; // namespace FlexFlow

From 5fa6b43966654f73f427ff7a8f5998f6ae9b6b96 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 10 May 2024 22:49:52 -0400
Subject: [PATCH 242/667] Removed unused code.

---
 include/flexflow/batch_config.h | 56 ---------------------------------
 1 file changed, 56 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 30903271a..801f9671d 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -20,11 +20,6 @@
 #include <cstddef>
 #include <cstdlib>
 
-// #define MAX_SEQ_LEN 1024
-// #define BATCH_SIZE 2
-// #define BATCH_SIZE 16
-// #define MAX_REQUESTS 256
-
 namespace FlexFlow {
 
 class InferenceResult;
@@ -153,25 +148,6 @@ class BatchConfig {
   bool request_available[MAX_NUM_REQUESTS];
 };
 
-// class TreeVerifyBatchConfig : public BatchConfig {
-// public:
-//   TreeVerifyBatchConfig();
-//   ~TreeVerifyBatchConfig();
-//   InferenceMode get_mode() const;
-//   friend std::ostream &operator<<(std::ostream &os,
-//                                   TreeVerifyBatchConfig const &bc);
-//   void print() const;
-//   void save_to_file(std::string const &filename) const;
-//   struct CommittedTokensInfo {
-//     int token_index;   // the index of the token in the previous batch
-//     int request_index; // request index in the batch
-//     int token_depth;   // position of the token in the request's sequence
-//   };
-
-//   int num_tokens_to_commit = 0;
-//   CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
-// };
-
 struct InferenceResult {
   BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS *
                                  BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
@@ -180,36 +156,4 @@ struct InferenceResult {
   float topk_logits[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS];
 };
 
-// class TreeSearchBatchConfig : public BatchConfig {
-// public:
-//   TreeSearchBatchConfig();
-//   TreeSearchBatchConfig(int model_id);
-//   TreeSearchBatchConfig(TreeSearchBatchConfig const &other, int model_id);
-//   InferenceMode get_mode() const;
-
-//   ~TreeSearchBatchConfig();
-
-//   friend std::ostream &operator<<(std::ostream &os,
-//                                   TreeSearchBatchConfig const &bc);
-//   void print() const;
-//   void save_to_file(std::string const &filename) const;
-
-//   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
-//   inline static int const MAX_TREE_DEPTH = 16;
-
-//   // how many requests is in speculative phase
-//   int model_id;
-// };
-
-// class SsmInferenceResult : public InferenceResult {
-// public:
-//   BatchConfig::TokenId
-//       token_ids[MAX_NUM_TOKENS *
-//                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-//   float probs[MAX_NUM_TOKENS *
-//               TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-//   int parent_id[MAX_NUM_TOKENS *
-//                 TreeSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-// };
-
 }; // namespace FlexFlow

From cfae2eefd77f54d8b4f704450209a9040f1c694d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 10 May 2024 22:50:25 -0400
Subject: [PATCH 243/667] Small fix.

---
 .../ops/kernels/inc_multihead_self_attention_utils.cuh        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 99e033c20..546d5e9a9 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -523,5 +523,5 @@ struct threads_per_value_t {
 #define test_bit(bit_mask, idx, pos)                                           \
   (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
-#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
+#endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file

From 054528b34304dafaa345280617d259abde95341c Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 10 May 2024 22:54:29 -0400
Subject: [PATCH 244/667] Fix.

---
 src/runtime/request_manager.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index bf1d58e50..23e20c680 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -100,8 +100,7 @@ int RequestManager::get_max_spec_tree_token_num() {
 
 int RequestManager::get_max_verify_tokens_per_batch() {
   assert(max_tokens_per_batch > 0);
-  return max_tokens_per_batch +
-         max_spec_tree_token_num * max_requests_per_batch;
+  return max_tokens_per_batch;
 }
 
 void RequestManager::set_max_sequence_length(int max_seq_length) {
@@ -1105,7 +1104,6 @@ bool RequestManager::update_llm_verify_results(
     init_token_tree(guid);
     assert(!request.committed_tokens.empty() &&
            "The committed tokens should not be empty.");
-
     // Add the last committed token as the root of the speculative token tree
     add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id);
 

From 5447bcf22fba7c4624c5f91e9913879f09810acf Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 10 May 2024 22:55:09 -0400
Subject: [PATCH 245/667] Modified debug output.

---
 src/runtime/batch_config.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 9938a8356..c9e1b2062 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -79,7 +79,7 @@ int BatchConfig::max_spec_tree_token_num() {
 // Overloading the << operator for the Bitset class
 std::ostream &operator<<(std::ostream &os,
                          BatchConfig::BitMask::Bitset const &bitset) {
-  for (size_t i = 0; i < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; i++) {
+  for (size_t i = 0; i < BatchConfig::max_spec_tree_token_num(); i++) {
     os << (bitset.test_bit(i) ? '1' : '0');
   }
   return os;
@@ -168,7 +168,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
         os << "    Current layer size: " << bc.causalMask[i].current_layer_size
            << std::endl;
         os << "    Bit mask: " << std::endl;
-        for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM; j++) {
+        for (int j = 0; j < BatchConfig::max_spec_tree_token_num(); j++) {
           os << "      " << bc.causalMask[i].bit_mask[j] << std::endl;
         }
       }

From dc5fb7534b29cc920ec0355c52a3b544364b5b33 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 11 May 2024 01:08:41 -0400
Subject: [PATCH 246/667] Updated the kernels. Removed some unused logic. Added
 some debug outputs.

---
 inference/spec_infer/spec_infer.cc           |  4 +-
 src/ops/inc_multihead_self_attention.cu      | 67 ++++++++++---
 src/ops/tree_inc_multihead_self_attention.cu | 99 ++++++++++++--------
 3 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 6fe8e7619..1b86a9c30 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -276,8 +276,8 @@ void FlexFlow::top_level_task(Task const *task,
   bool verbose = false;
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 256;
-  int max_sequence_length = 256;
-  int max_spec_tree_token_num = 64;
+  int max_sequence_length = 512;
+  int max_spec_tree_token_num = 8;
   int expansion_degree = 3;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::SPECULATIVE_DECODING;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 54b1704a3..8b6814eb3 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -826,6 +826,35 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
 
+  int size = m->num_q_heads * m->kProjSize * 50;
+  int *temp_key = new int[size];
+  int *temp_value = new int[size];
+  cudaMemcpy(temp_key, m->keyCache, size * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(
+      temp_value, m->valueCache, size * sizeof(int), cudaMemcpyDeviceToHost);
+
+  printf("key: ");
+  for (int i = 0; i < 50; ++i) {
+    int temp = 0;
+    for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
+      temp += temp_key[i * m->num_q_heads * m->kProjSize + j];
+    }
+    printf("%d ", temp);
+  }
+  printf("\n");
+
+  printf("value: ");
+  for (int i = 0; i < 50; ++i) {
+    int temp = 0;
+    for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
+      temp += temp_value[i * m->num_q_heads * m->kProjSize + j];
+    }
+    printf("%d ", temp);
+  }
+  printf("\n");
+  delete[] temp_key;
+  delete[] temp_value;
+
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv_kernel(m,
                      bc,
@@ -847,6 +876,21 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
+  size = m->hidden_size * 50;
+  int *temp_output = new int[size];
+  cudaMemcpy(
+      temp_output, m->attn_heads, size * sizeof(int), cudaMemcpyDeviceToHost);
+  printf("Output: ");
+  for (int i = 0; i < 50; ++i) {
+    int temp = 0;
+    for (int j = 0; j < m->hidden_size; ++j) {
+      temp += temp_output[i * m->hidden_size + j];
+    }
+    printf("%d ", temp);
+  }
+  printf("\n");
+  delete[] temp_output;
+
   // compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
   compute_o_prod_bias(
@@ -1345,13 +1389,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE
-                                   ? BatchConfig::max_verify_tokens_per_batch()
-                                   : BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
     size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
-    size_t key_cache_size = 0, value_cache_size = 0;
+    size_t key_cache_size = 0, value_cache_size = 0, qk_prod_size = 0;
     switch (infer_mode) {
       case INC_DECODING_MODE: {
         key_cache_size = num_q_heads * kProjSize *
@@ -1360,6 +1402,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_q_heads * vProjSize *
                            BatchConfig::max_requests_per_batch() *
                            BatchConfig::max_sequence_length();
+        qk_prod_size = BatchConfig::max_spec_tree_token_num() *
+                       BatchConfig::max_sequence_length() * num_q_heads;
         break;
       }
       case TREE_SEARCH_MODE:
@@ -1373,15 +1417,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                            BatchConfig::max_requests_per_batch() *
                            (BatchConfig::max_sequence_length() +
                             BatchConfig::max_spec_tree_token_num());
+        qk_prod_size = BatchConfig::max_spec_tree_token_num() *
+                       (BatchConfig::max_sequence_length() +
+                        BatchConfig::max_spec_tree_token_num()) *
+                       num_q_heads;
         break;
       }
       default:
         assert(false && "Unkown inference mode");
     }
-    size_t requestinfo_size = BatchConfig::max_requests_per_batch();
-    // size_t tokeninfo_size = max_tokens_per_batch;
-    size_t qk_prod_size =
-        max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads;
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize;
     size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads +
                                                    kProjSize * num_q_heads)) /
@@ -1419,13 +1463,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     }
 
     // in tree_verify, enable devQKVProjArray;
-    if (!offload || infer_mode == TREE_VERIFY_MODE) {
-      devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped(
+    if (offload) {
+      devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped(
           qkv_max_proj_size * size_of_dt);
     } else {
-      devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped(
+      devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped(
           qkv_max_proj_size * size_of_dt);
-      // offset += qkv_max_proj_size * size_of_dt;
     }
 
     // use key value cache in all mode.
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index fe3b47af2..18da245aa 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -278,22 +278,10 @@ __global__ void compute_attention_kernel_fused_kernel(
       for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
         // Load the values from the cache.
         int const ti_circ = ti % max_seq_length;
-        // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti];
         V_vec v = *reinterpret_cast<V_vec const *>(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-
-        if (ti < tlength) {
-          bool const mask =
-              prompt_phase ? (q_start + qi < ti)
-                           : (ti >= bitmask->non_tree_cache_size &&
-                              (!test_bit(bitmask->bit_mask,
-                                         qi,
-                                         ti - bitmask->non_tree_cache_size)));
-          // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-          //   (1 << qi))));
-          float logit = mask ? 0.0f : qk_smem[ti - first_step];
-          out = FlexFlow::fma(logit, cast_to_float(v), out);
-        }
+        float logit = qk_smem[ti - first_step];
+        out = FlexFlow::fma(logit, cast_to_float(v), out);
       }
     }
 
@@ -407,6 +395,35 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
         BatchConfig::max_sequence_length() +
             BatchConfig::max_spec_tree_token_num(),
         m->hidden_size);
+
+    int size = m->num_q_heads * m->kProjSize * 50;
+    int *temp_key = new int[size];
+    int *temp_value = new int[size];
+    cudaMemcpy(
+        temp_key, m->keyCache, size * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(
+        temp_value, m->valueCache, size * sizeof(int), cudaMemcpyDeviceToHost);
+    printf("key: ");
+    for (int i = 0; i < 50; ++i) {
+      int temp = 0;
+      for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
+        temp += temp_key[i * m->num_q_heads * m->kProjSize + j];
+      }
+      printf("%d ", temp);
+    }
+    printf("\n");
+
+    printf("value: ");
+    for (int i = 0; i < 50; ++i) {
+      int temp = 0;
+      for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
+        temp += temp_value[i * m->num_q_heads * m->kProjSize + j];
+      }
+      printf("%d ", temp);
+    }
+    printf("\n");
+    delete[] temp_key;
+    delete[] temp_value;
   }
 }
 
@@ -468,27 +485,14 @@ __global__ void update_tree_branch_kv_cache_fused(
     DT kVal = devQKVProjArray[val_idx];
     DT vVal = devQKVProjArray[val_idx + hidden_size];
 
-    int const req_id = tokenInfos[token_idx].request_index;
+    int const req_idx = tokenInfos[token_idx].request_index;
+    int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
     // int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
 
-    int const request_token_offset =
-        request_infos[req_id].first_token_offset_in_batch;
-    int const first_token_depth =
-        request_infos[req_id].first_token_index_in_request;
-
-    // if(i % hidden_size == 0){
-    //   printf("update token request id: %d, %d, %d  real id %d, value%.10f\n",
-    //   req_id, token_idx, request_token_offset,(token_idx + first_token_depth
-    //   - request_token_offset), kVal);
-    // }
-    kCache_ptr[req_id * (hidden_size * max_seq_len) +
-               (token_idx + first_token_depth - request_token_offset) *
-                   hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) +
-               (token_idx + first_token_depth - request_token_offset) *
-                   hidden_size +
-               offset] = vVal;
+    kCache_ptr[req_idx * (hidden_size * max_seq_len) +
+               token_abs_idx * hidden_size + offset] = kVal;
+    vCache_ptr[req_idx * (hidden_size * max_seq_len) +
+               token_abs_idx * hidden_size + offset] = vVal;
   }
 }
 
@@ -539,13 +543,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
   //     (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens();
   int q_block_size = m->qProjSize;
   int kt_block_size = m->kProjSize;
-  int kt_req_block_size =
-      kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() +
-      BatchConfig::max_spec_tree_token_num();
+  int kt_req_block_size = kt_block_size * m->num_q_heads *
+                          (BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
   int vt_block_size = m->vProjSize;
-  int vt_req_block_size =
-      vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() +
-      BatchConfig::max_spec_tree_token_num();
+  int vt_req_block_size = vt_block_size * m->num_q_heads *
+                          (BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
   assert(m->qProjSize == m->kProjSize);
 
   for (int i = 0; i < bc->max_requests_per_batch(); i++) {
@@ -585,7 +589,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
             num_new_tokens,            // num_tokens_in_branch
             processed_tokens_in_batch, // num_processed_tokens_in_batch
             m->num_active_tokens,      // total_tokens_in_batch
-            BatchConfig::max_sequence_length(),
+            BatchConfig::max_sequence_length() +
+                BatchConfig::max_spec_tree_token_num(),
             m->hidden_size);
       }
 
@@ -951,6 +956,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   compute_attention_kernel_fused<DT>(
       m, bc, static_cast<DT *>(m->attn_heads), stream);
 
+  int s = m->hidden_size * 50;
+  int *temp_output = new int[s];
+  cudaMemcpy(
+      temp_output, m->attn_heads, s * sizeof(int), cudaMemcpyDeviceToHost);
+  printf("Output: ");
+  for (int i = 0; i < 50; ++i) {
+    int temp = 0;
+    for (int j = 0; j < m->hidden_size; ++j) {
+      temp += temp_output[i * m->hidden_size + j];
+    }
+    printf("%d ", temp);
+  }
+  printf("\n");
+
   int processed_tokens_in_batch = bc->num_active_tokens();
 
   compute_o_prod_bias(m,

From 06d07b664fe64edaf8b0cbc9b088a1bd64b82a63 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 11 May 2024 20:46:49 -0400
Subject: [PATCH 247/667] Fixed the rotary embedding bug by adding a new field
 abs_depth_in_request in BatchConfig::PerTokenInfo.

---
 include/flexflow/batch_config.h              |  5 +++
 src/ops/inc_multihead_self_attention.cpp     |  4 +--
 src/ops/inc_multihead_self_attention.cu      | 37 ++++++++++----------
 src/ops/tree_inc_multihead_self_attention.cu | 26 +++++++-------
 src/runtime/batch_config.cc                  |  4 ++-
 src/runtime/request_manager.cc               | 11 ++++++
 6 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 801f9671d..04369f6f5 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -71,7 +71,12 @@ class BatchConfig {
 
   struct PerTokenInfo {
     TokenId token_id = -1;
+    // Difference between the two:
+    // abs_index_in_request: non-tree cache size + index in the flattened
+    // speculative tree
+    // abs_depth_in_request: non_tree cache size + depth in the speculative tree
     int abs_index_in_request = -1;
+    int abs_depth_in_request = -1;
     int request_index = -1;
   };
 
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index c5480e187..d020cc104 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -158,7 +158,7 @@ __global__ void
 
     int token_idx =
         (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_index_in_request;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
     int pos_i = real_i % (proj_size / 2);
     float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
     hipFloatComplex complex_pos = {cos(freq), sin(freq)};
@@ -204,7 +204,7 @@ __global__ void
     // get position of token
 
     // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_index_in_request;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 8b6814eb3..338ca2765 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -429,7 +429,7 @@ __global__ void
 
     int token_idx =
         (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_index_in_request;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     // complex_input[i].y;
@@ -479,7 +479,7 @@ __global__ void
     // get position of token
 
     // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_index_in_request;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
@@ -876,20 +876,21 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
-  size = m->hidden_size * 50;
-  int *temp_output = new int[size];
-  cudaMemcpy(
-      temp_output, m->attn_heads, size * sizeof(int), cudaMemcpyDeviceToHost);
-  printf("Output: ");
-  for (int i = 0; i < 50; ++i) {
-    int temp = 0;
-    for (int j = 0; j < m->hidden_size; ++j) {
-      temp += temp_output[i * m->hidden_size + j];
-    }
-    printf("%d ", temp);
-  }
-  printf("\n");
-  delete[] temp_output;
+  //   size = m->hidden_size * 50;
+  //   int *temp_output = new int[size];
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, size * sizeof(int),
+  //       cudaMemcpyDeviceToHost);
+  //   printf("Output: ");
+  //   for (int i = 0; i < 50; ++i) {
+  //     int temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%d ", temp);
+  //   }
+  //   printf("\n");
+  //   delete[] temp_output;
 
   // compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
@@ -1402,7 +1403,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_q_heads * vProjSize *
                            BatchConfig::max_requests_per_batch() *
                            BatchConfig::max_sequence_length();
-        qk_prod_size = BatchConfig::max_spec_tree_token_num() *
+        qk_prod_size = BatchConfig::max_sequence_length() *
                        BatchConfig::max_sequence_length() * num_q_heads;
         break;
       }
@@ -1417,7 +1418,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                            BatchConfig::max_requests_per_batch() *
                            (BatchConfig::max_sequence_length() +
                             BatchConfig::max_spec_tree_token_num());
-        qk_prod_size = BatchConfig::max_spec_tree_token_num() *
+        qk_prod_size = BatchConfig::max_sequence_length() *
                        (BatchConfig::max_sequence_length() +
                         BatchConfig::max_spec_tree_token_num()) *
                        num_q_heads;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 18da245aa..daa61401f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -956,19 +956,19 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   compute_attention_kernel_fused<DT>(
       m, bc, static_cast<DT *>(m->attn_heads), stream);
 
-  int s = m->hidden_size * 50;
-  int *temp_output = new int[s];
-  cudaMemcpy(
-      temp_output, m->attn_heads, s * sizeof(int), cudaMemcpyDeviceToHost);
-  printf("Output: ");
-  for (int i = 0; i < 50; ++i) {
-    int temp = 0;
-    for (int j = 0; j < m->hidden_size; ++j) {
-      temp += temp_output[i * m->hidden_size + j];
-    }
-    printf("%d ", temp);
-  }
-  printf("\n");
+  //   int s = m->hidden_size * 50;
+  //   int *temp_output = new int[s];
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, s * sizeof(int), cudaMemcpyDeviceToHost);
+  //   printf("Output: ");
+  //   for (int i = 0; i < 50; ++i) {
+  //     int temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%d ", temp);
+  //   }
+  //   printf("\n");
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index c9e1b2062..efbf82ab0 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -134,8 +134,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   os << "Per-token info:\n";
   for (int i = 0; i < bc.num_tokens; i++) {
     os << "  Token " << i << ":\n";
-    os << "    Absolute depth in request: "
+    os << "    Absolute index in request: "
        << bc.tokensInfo[i].abs_index_in_request << std::endl;
+    os << "    Absolute depth in request: "
+       << bc.tokensInfo[i].abs_depth_in_request << std::endl;
     os << "    Request index: " << bc.tokensInfo[i].request_index << std::endl;
     os << "    Token id: " << bc.tokensInfo[i].token_id << std::endl;
   }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 23e20c680..0be938547 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -663,6 +663,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
 
     bc.tokensInfo[token_idx].request_index = request_index;
     bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
     bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
 
     bc.num_tokens++;
@@ -727,6 +728,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
 
     bc.tokensInfo[token_idx].request_index = request_index;
     bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
     bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
 
     bc.num_tokens++;
@@ -776,6 +778,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     // Per Token Info
     bc.tokensInfo[bc.num_tokens].request_index = request_index;
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
     bc.num_tokens++;
@@ -852,6 +855,8 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
       new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
           committed_tokens[committed_token_index].to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+          committed_tokens[committed_token_index].to_index;
       new_bc.tokensInfo[new_bc.num_tokens].token_id =
           committed_tokens[committed_token_index].token_id;
       new_bc.num_tokens++;
@@ -929,6 +934,8 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
         new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
             new_bc.requestsInfo[request_index].first_token_index_in_request +
             child_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            request.tokens.size() - 1 + current_speculation_step;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
 
         new_bc.num_tokens++;
@@ -1019,17 +1026,21 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // BatchConfig.tokensInfo.
     TokenTree &token_tree = request.speculative_token_trees[0];
     int token_tree_index = 0;
+    int layer_index = 0;
     for (auto const &tree_layer : token_tree.tree_layers) {
       for (auto const &tree_node : tree_layer) {
         if (tree_node->pruned == false) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.tokens.size() - 1 + token_tree_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+              request.tokens.size() - 1 + layer_index;
           new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
           new_bc.num_tokens++;
           token_tree_index++;
         }
       }
+      layer_index++;
     }
     assert(token_tree_index == token_tree.tree_size);
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;

From cabe9c69694a9101298a7131c812d502430f2217 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 12 May 2024 16:45:14 -0400
Subject: [PATCH 248/667] Removed and commented debug outputs.

---
 src/ops/inc_multihead_self_attention.cu      | 43 ++++---------------
 src/ops/tree_inc_multihead_self_attention.cu | 45 +++++---------------
 2 files changed, 18 insertions(+), 70 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 338ca2765..988432dac 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -826,35 +826,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
 
-  int size = m->num_q_heads * m->kProjSize * 50;
-  int *temp_key = new int[size];
-  int *temp_value = new int[size];
-  cudaMemcpy(temp_key, m->keyCache, size * sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(
-      temp_value, m->valueCache, size * sizeof(int), cudaMemcpyDeviceToHost);
-
-  printf("key: ");
-  for (int i = 0; i < 50; ++i) {
-    int temp = 0;
-    for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
-      temp += temp_key[i * m->num_q_heads * m->kProjSize + j];
-    }
-    printf("%d ", temp);
-  }
-  printf("\n");
-
-  printf("value: ");
-  for (int i = 0; i < 50; ++i) {
-    int temp = 0;
-    for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
-      temp += temp_value[i * m->num_q_heads * m->kProjSize + j];
-    }
-    printf("%d ", temp);
-  }
-  printf("\n");
-  delete[] temp_key;
-  delete[] temp_value;
-
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv_kernel(m,
                      bc,
@@ -876,20 +847,22 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
-  //   size = m->hidden_size * 50;
-  //   int *temp_output = new int[size];
+  // Debug output:
+  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   float *temp_output = new float[size];
   //   cudaMemcpy(
-  //       temp_output, m->attn_heads, size * sizeof(int),
+  //       temp_output, m->attn_heads, size * sizeof(float),
   //       cudaMemcpyDeviceToHost);
   //   printf("Output: ");
-  //   for (int i = 0; i < 50; ++i) {
-  //     int temp = 0;
+  //   float temp = 0;
+  //   for (int i = 0; i < 1; ++i) {
   //     for (int j = 0; j < m->hidden_size; ++j) {
   //       temp += temp_output[i * m->hidden_size + j];
   //     }
-  //     printf("%d ", temp);
+  //     printf("%.6f ", temp);
   //   }
   //   printf("\n");
+
   //   delete[] temp_output;
 
   // compute output production and bias together for all tokens
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index daa61401f..4e25ed301 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -395,35 +395,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
         BatchConfig::max_sequence_length() +
             BatchConfig::max_spec_tree_token_num(),
         m->hidden_size);
-
-    int size = m->num_q_heads * m->kProjSize * 50;
-    int *temp_key = new int[size];
-    int *temp_value = new int[size];
-    cudaMemcpy(
-        temp_key, m->keyCache, size * sizeof(int), cudaMemcpyDeviceToHost);
-    cudaMemcpy(
-        temp_value, m->valueCache, size * sizeof(int), cudaMemcpyDeviceToHost);
-    printf("key: ");
-    for (int i = 0; i < 50; ++i) {
-      int temp = 0;
-      for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
-        temp += temp_key[i * m->num_q_heads * m->kProjSize + j];
-      }
-      printf("%d ", temp);
-    }
-    printf("\n");
-
-    printf("value: ");
-    for (int i = 0; i < 50; ++i) {
-      int temp = 0;
-      for (int j = 0; j < m->num_q_heads * m->kProjSize; ++j) {
-        temp += temp_value[i * m->num_q_heads * m->kProjSize + j];
-      }
-      printf("%d ", temp);
-    }
-    printf("\n");
-    delete[] temp_key;
-    delete[] temp_value;
   }
 }
 
@@ -956,20 +927,24 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   compute_attention_kernel_fused<DT>(
       m, bc, static_cast<DT *>(m->attn_heads), stream);
 
-  //   int s = m->hidden_size * 50;
-  //   int *temp_output = new int[s];
+  // Debug output:
+  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   float *temp_output = new float[size];
   //   cudaMemcpy(
-  //       temp_output, m->attn_heads, s * sizeof(int), cudaMemcpyDeviceToHost);
+  //       temp_output, m->attn_heads, size * sizeof(float),
+  //       cudaMemcpyDeviceToHost);
   //   printf("Output: ");
-  //   for (int i = 0; i < 50; ++i) {
-  //     int temp = 0;
+  //   float temp = 0;
+  //   for (int i = 0; i < 1; ++i) {
   //     for (int j = 0; j < m->hidden_size; ++j) {
   //       temp += temp_output[i * m->hidden_size + j];
   //     }
-  //     printf("%d ", temp);
+  //     printf("%.6f ", temp);
   //   }
   //   printf("\n");
 
+  //   delete[] temp_output;
+
   int processed_tokens_in_batch = bc->num_active_tokens();
 
   compute_o_prod_bias(m,

From d3aef0822ea6337f78408f5665b33686de417a09 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 13 May 2024 20:21:32 -0400
Subject: [PATCH 249/667] Added support for coninuous prefilling requests.
 Modified the logic for LLM KV cache commit.

---
 include/flexflow/request_manager.h |   2 +
 src/runtime/request_manager.cc     | 387 +++++++++++++++++------------
 2 files changed, 225 insertions(+), 164 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 70120cbfb..a810500d7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -120,6 +120,8 @@ struct Request {
         : from_index(from_index), to_index(to_index), token_id(token_id) {}
   };
   std::vector<CommittedToken> committed_tokens;
+  bool llm_committed = true;
+  bool ssm_committed = true;
 };
 
 class TokenTreeNode {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0be938547..519e7187f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -440,10 +440,20 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
         if (update_llm_prefill_results(result)) {
-          // This indicates that the prefilling phase finishes
-          request_manager_status = DECODING;
+          // This indicates that the prefilling of the current request finishes
           // Reset the prefill_request
           prefill_request = nullptr;
+
+          // Check if there are more empty slots
+          if (num_available_requests < get_max_requests_per_batch() &&
+              !pending_request_queue.empty()) {
+            // Load the pending request to the batch
+            load_pending_reqeust_to_batch();
+            request_manager_status = PREFILLING;
+          } else {
+            // No more empty slots, start the decoding
+            request_manager_status = DECODING;
+          }
         }
         // Not completed, continue prefilling
       } else if (decoding_mode == SPECULATIVE_DECODING) {
@@ -457,10 +467,20 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         } else if (prefill_model == LLM) {
           if (update_llm_prefill_results(result)) {
             // This indicates that the prefilling phase finishes
-            request_manager_status = SSM_SPEC;
-            // Reset the prefill_request
             prefill_request = nullptr;
-            current_speculation_step = 0;
+            // Check if there are more empty slots
+            if (num_available_requests < get_max_requests_per_batch() &&
+                !pending_request_queue.empty()) {
+              // Load the pending request to the batch
+              load_pending_reqeust_to_batch();
+              request_manager_status = PREFILLING;
+              prefill_model = SSM;
+            } else {
+              // No more empty slots, start the speculation
+              request_manager_status = SSM_SPEC;
+              // Reset the prefill_request
+              current_speculation_step = 0;
+            }
           }
           // Not completed, continue LLM prefilling
         } else {
@@ -515,16 +535,21 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
   int committed_token_offset = prefill_request->llm_cache_size;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-
   prefill_request->committed_tokens.clear();
+
   if (decoding_mode == SPECULATIVE_DECODING) {
-    // Add the committed tokens to the token tree
+    // Modified the state because the last commitment completes
+    prefill_request->llm_committed = true;
+    assert(prefill_request->ssm_committed and prefill_request->llm_committed);
+
     for (int i = 0; i < prefill_request->num_tokens_in_batch; i++) {
       prefill_request->committed_tokens.push_back(Request::CommittedToken{
           i,
           committed_token_offset + i,
           prefill_request->tokens[i + committed_token_offset]});
     }
+    // Modified the state because the new commitment is unfinished
+    prefill_request->llm_committed = false;
   }
 
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
@@ -539,6 +564,8 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
           Request::CommittedToken{-1,
                                   (int)prefill_request->tokens.size() - 1,
                                   prefill_request->tokens.back()});
+      // Modified the state because the ssm also need to commit the last token
+      prefill_request->ssm_committed = false;
 
       init_token_tree(prefill_request->guid);
       add_root_to_spec_token_tree(prefill_request->guid,
@@ -546,6 +573,30 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
       update_bitmask_prompt(prefill_request->guid, 1);
     }
   }
+
+  // Manages the committed states for other requests in the batch
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+
+    if (request_index == prefill_request->batch_index) {
+
+      continue;
+    }
+
+    if (!request.llm_committed) {
+      request.llm_committed = true;
+      if (request.ssm_committed and request.llm_committed) {
+        request.llm_cache_size = request.tokens.size() - 1;
+        request.committed_tokens.clear();
+      }
+    }
+  }
   return prefill_completed;
 }
 
@@ -567,6 +618,12 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
       request_completed = true;
       request_complete_clean_up(request_index);
     }
+
+    if (verbose) {
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Request " << guid << " tokens: " << std::endl
+                << output << std::endl;
+    }
   }
   return request_completed;
 }
@@ -641,42 +698,63 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
             std::begin(bc.request_available));
   bc.num_available_requests = num_available_requests;
 
-  int request_index = prefill_request->batch_index;
-  // Request Info
-  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].first_token_index_in_request =
-      prefill_request->llm_cache_size;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      get_max_tokens_per_batch(),
-      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
-
-  prefill_request->first_token_offset_in_batch = 0;
-  prefill_request->num_tokens_in_batch =
-      bc.requestsInfo[request_index].num_tokens_in_batch;
-
-  // Token Info
-  for (int token_idx = 0;
-       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
-       token_idx++) {
-    int abs_idx = prefill_request->llm_cache_size + token_idx;
-    assert(abs_idx < prefill_request->tokens.size());
-
-    bc.tokensInfo[token_idx].request_index = request_index;
-    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-    bc.num_tokens++;
-  }
+    if (request_index == prefill_request->batch_index) {
+      // Request Info
+      bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+      bc.requestsInfo[request_index].first_token_index_in_request =
+          prefill_request->llm_cache_size;
+      bc.requestsInfo[request_index].num_tokens_in_batch =
+          std::min(get_max_tokens_per_batch(),
+                   (int)prefill_request->tokens.size() -
+                       prefill_request->llm_cache_size);
+
+      prefill_request->first_token_offset_in_batch = 0;
+      prefill_request->num_tokens_in_batch =
+          bc.requestsInfo[request_index].num_tokens_in_batch;
+
+      // Token Info
+      for (int token_idx = 0;
+           token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
+           token_idx++) {
+        int abs_idx = prefill_request->llm_cache_size + token_idx;
+        assert(abs_idx < prefill_request->tokens.size());
+
+        bc.tokensInfo[token_idx].request_index = request_index;
+        bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+        bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+        bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+
+        bc.num_tokens++;
+      }
 
-  // Committed tokens
-  for (auto const &committed_token : prefill_request->committed_tokens) {
-    bc.committed_tokens[bc.num_tokens_to_commit].token_index =
-        committed_token.from_index;
-    bc.committed_tokens[bc.num_tokens_to_commit].request_index = request_index;
-    bc.committed_tokens[bc.num_tokens_to_commit].token_depth =
-        committed_token.to_index;
-    bc.num_tokens_to_commit++;
+    } else {
+      bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+      bc.requestsInfo[request_index].first_token_index_in_request =
+          request.llm_cache_size;
+      bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+
+      if (!request.llm_committed) {
+        // Committed tokens
+        for (int i = 0; i < request.committed_tokens.size() - 1; i++) {
+          bc.committed_tokens[bc.num_tokens_to_commit].token_index =
+              request.committed_tokens[i].from_index;
+          bc.committed_tokens[bc.num_tokens_to_commit].request_index =
+              request_index;
+          bc.committed_tokens[bc.num_tokens_to_commit].token_depth =
+              request.committed_tokens[i].to_index;
+          bc.num_tokens_to_commit++;
+        }
+      }
+    }
   }
 
   if (verbose) {
@@ -701,10 +779,9 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   BatchConfig bc;
   bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   bc.prompt_phase = true;
-  std::copy(std::begin(request_available),
-            std::end(request_available),
-            std::begin(bc.request_available));
-  bc.num_available_requests = num_available_requests;
+  // Only set the prefilling request to be available
+  bc.request_available[prefill_request->batch_index] = true;
+  bc.num_available_requests = 1;
 
   int request_index = prefill_request->batch_index;
   // Request Info
@@ -915,6 +992,8 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
           request.causal_mask.non_tree_cache_size +
           request.causal_mask.tree_or_prompt_size -
           request.causal_mask.current_layer_size;
+      request.num_tokens_in_batch = 0;
+      request.first_token_offset_in_batch = new_bc.num_tokens;
       continue;
     } else {
       std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
@@ -928,6 +1007,10 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
           request.causal_mask.current_layer_size;
 
+      request.num_tokens_in_batch =
+          new_bc.requestsInfo[request_index].num_tokens_in_batch;
+      request.first_token_offset_in_batch = new_bc.num_tokens;
+
       int child_index = 0;
       for (auto const &node_ptr : current_layer) {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
@@ -1006,21 +1089,23 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // BatchConfig.committed_tokens.
     // Note here, we shouldn't put the last token in request.committed_tokens
     // into new_bc. Because the LLM don't have that token's KV cache.
-    std::vector<Request::CommittedToken> &committed_tokens =
-        request.committed_tokens;
-    for (int committed_token_index = 0;
-         committed_token_index < committed_tokens.size() - 1;
-         committed_token_index++) {
-      Request::CommittedToken &committed_token =
-          committed_tokens.at(committed_token_index);
-      new_bc.committed_tokens[committed_token_index].request_index =
-          request_index;
-      new_bc.committed_tokens[committed_token_index].token_index =
-          committed_token.from_index;
-      new_bc.committed_tokens[committed_token_index].token_depth =
-          committed_token.to_index;
+    if (!request.llm_committed) {
+      std::vector<Request::CommittedToken> &committed_tokens =
+          request.committed_tokens;
+      for (int committed_token_index = 0;
+           committed_token_index < committed_tokens.size() - 1;
+           committed_token_index++) {
+        Request::CommittedToken &committed_token =
+            committed_tokens.at(committed_token_index);
+        new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
+            request_index;
+        new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
+            committed_token.from_index;
+        new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+            committed_token.to_index;
+        new_bc.num_tokens_to_commit++;
+      }
     }
-    new_bc.num_tokens_to_commit = committed_tokens.size() - 1;
 
     // Load the tokens on the token tree that are not yet pruned to
     // BatchConfig.tokensInfo.
@@ -1086,9 +1171,20 @@ bool RequestManager::update_llm_verify_results(
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    request.llm_cache_size +=
-        request.committed_tokens.size() - 1; // Exclude the last token
-    request.committed_tokens.clear();
+    if (!request.llm_committed) {
+      request.llm_committed = true;
+      request.llm_cache_size +=
+          request.committed_tokens.size() - 1; // Exclude the last token
+      // Check if both the KV cache of SSM and LLM are committed, because
+      // sometimes the LLM KV cache is committed by a verifying batch config,
+      // sometimes it is committed by a LLM prefilling batch config. We don't
+      // know when the tokens are committed, so we have to add these checks
+      // whenever the SSM or the LLM commits tokens. If the both caches are
+      // committed, we can clear the committed tokens.
+      if (request.ssm_committed and request.llm_committed) {
+        request.committed_tokens.clear();
+      }
+    }
   }
 
   // Process the LLM results greedily
@@ -1168,61 +1264,19 @@ bool RequestManager::update_ssm_inference_results(
     assert(request.status == Request::RUNNING);
 
     if (current_speculation_step == 1) {
+      request.ssm_committed = true;
+      // Check if both the KV cache of SSM and LLM are committed, because
+      // sometimes the LLM KV cache is committed by a verifying batch config,
+      // sometimes it is committed by a LLM prefilling batch config. We don't
+      // know when the tokens are committed, so we have to add these checks
+      // whenever the SSM or the LLM commits tokens. If the both caches are
+      // committed, we can clear the committed tokens.
+      if (request.ssm_committed and request.llm_committed) {
+        request.committed_tokens.clear();
+      }
       request.ssm_cache_size = request.tokens.size();
     }
 
-    // TokenTree &token_tree = request.speculative_token_trees[0];
-    // if (token_tree.tree_layers.size() < current_speculation_step) {
-    //   // This means that the parent layer is empty
-    //   continue;
-    // } else {
-    //   std::list<std::shared_ptr<TokenTreeNode>> &parent_tree_layer =
-    //       token_tree.tree_layers.back();
-    //   int parent_pos = 0;
-    //   //   for (auto &parent_it = parent_tree_layer.begin();
-    //   //        parent_it != parent_tree_layer.end();
-    //   //        parent_it++) {
-    //   for (auto parent_ptr : parent_tree_layer) {
-    //     if (parent_ptr->pruned) {
-    //       // Parent token is pruned, we have to skip all its children
-    //       // Because no token is pruned in the last layer during the small
-    //       // model inference, the reason why some parents are pruned is that
-    //       // adding tokens to the new layer of the tree may result in some
-    //       // node being pruned in internal layers.
-    //       result_index += num_branches;
-    //     } else {
-    //       // Parent token is not pruned
-    //       for (int child_idx = 0; child_idx < num_branches; child_idx++) {
-    //         float parent_log_prob = parent_ptr->log_accumulated_prob;
-    //         std::cout << "Probability: "
-    //                   << ssm_inference_result.probs[result_index] <<
-    //                   std::endl;
-    //         std::cout << "Log Probability: "
-    //                   << log(ssm_inference_result.probs[result_index])
-    //                   << std::endl;
-    //         assert(parent_log_prob != -std::numeric_limits<float>::infinity()
-    //         &&
-    //                "Parent log probability should not be -inf.");
-    //         assert(log(ssm_inference_result.probs[result_index]) !=
-    //                    -std::numeric_limits<float>::infinity() &&
-    //                "Child log probability should not be -inf.");
-    //         add_token_to_spec_token_tree(
-    //             guid,
-    //             ssm_inference_result.token_ids[result_index],
-    //             parent_pos,
-    //             log(ssm_inference_result.probs[result_index]) +
-    //                 parent_log_prob);
-    //         result_index++;
-    //       }
-    //     }
-    //     parent_pos++;
-    //   }
-    // }
-
-    // bool last_layer_empty = prune_last_layer_of_spec_token_tree(guid);
-    // all_request_last_layer_empty =
-    //     all_request_last_layer_empty && last_layer_empty;
-
     if (current_speculation_step == 1) {
       init_bitmask_spec(guid);
     }
@@ -1474,7 +1528,10 @@ void RequestManager::get_verify_results_greedy(
         llm_verify_result
             .token_ids[llm_result_offset + last_accepted_token_index]);
 
-    llm_result_offset += request.num_tokens_in_batch;
+    llm_result_offset = request.first_token_offset_in_batch;
+
+    request.llm_committed = false;
+    request.ssm_committed = false;
 
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
@@ -1781,57 +1838,59 @@ bool RequestManager::add_tokens_to_spec_token_tree(
         tokens;
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
-      if (parent_ptr->pruned) {
-        continue;
-      }
-      for (int child_pos = 0;
-           child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-           child_pos++) {
-        int result_idx =
-            result_offset +
-            parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES + child_pos;
-        float log_prob = log(ssm_inference_result.probs[result_idx]);
-        float log_accumulated_prob =
-            log_prob + parent_ptr->log_accumulated_prob;
-
-        std::cout << "Probability at result index" << result_idx << ": "
-                  << ssm_inference_result.probs[result_idx] << std::endl;
-        std::cout << "Token id: " << ssm_inference_result.token_ids[result_idx]
-                  << std::endl;
-        std::cout << "Log Probability: " << log_prob << std::endl;
-        assert(log_prob != -std::numeric_limits<float>::infinity() &&
-               "Child log probability should not be -inf.");
-
-        if (tokens.size() == empty_slots_on_tree and
-            log_accumulated_prob <= (*tokens.begin())->log_accumulated_prob) {
-          // The token tree is full, and the new token has a lower joint
-          // probability than the minimum node in the pool, we don't need to add
-          // the new token and the following tokens belong to the same parent
-          // to the tree, because the tokens are sorted by their probability
-          break;
-        } else if (token_pool_full and
-                   log_accumulated_prob <=
-                       token_tree_node_pool.top().first->log_accumulated_prob) {
-          // The token tree is not full, but the token pool is full, and the new
-          // token has a lower joint probability than the minimum node in the
-          // pool, we don't need to add the new token and the following tokens
-          // belong to the same parent to the tree, because the tokens are
-          // sorted by their probability
-          break;
-        } else {
-          std::shared_ptr<TokenTreeNode> node_ptr =
-              std::make_shared<TokenTreeNode>(
-                  ssm_inference_result.token_ids[result_idx],
-                  log_accumulated_prob,
-                  parent_pos);
+      if (!parent_ptr->pruned) {
+        for (int child_pos = 0;
+             child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+             child_pos++) {
+          int result_idx =
+              result_offset +
+              parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES +
+              child_pos;
+          float log_prob = log(ssm_inference_result.probs[result_idx]);
+          float log_accumulated_prob =
+              log_prob + parent_ptr->log_accumulated_prob;
+
+          std::cout << "Probability at result index " << result_idx << ": "
+                    << ssm_inference_result.probs[result_idx] << "\t";
+          std::cout << "Token id: "
+                    << ssm_inference_result.token_ids[result_idx] << std::endl;
+          assert(log_prob != -std::numeric_limits<float>::infinity() &&
+                 "Child log probability should not be -inf.");
+
           if (tokens.size() == empty_slots_on_tree and
-              log_accumulated_prob > (*tokens.begin())->log_accumulated_prob) {
-            // The token tree is full, and the new token has a higher joint
-            // probability than the minimum node in the pool, we need to remove
-            // the minimum node from the pool and add the new token to the tree
-            tokens.erase(tokens.begin());
+              log_accumulated_prob <= (*tokens.begin())->log_accumulated_prob) {
+            // The token tree is full, and the new token has a lower joint
+            // probability than the minimum node in the pool, we don't need to
+            // add the new token and the following tokens belong to the same
+            // parent to the tree, because the tokens are sorted by their
+            // probability
+            break;
+          } else if (token_pool_full and
+                     log_accumulated_prob <= token_tree_node_pool.top()
+                                                 .first->log_accumulated_prob) {
+            // The token tree is not full, but the token pool is full, and the
+            // new token has a lower joint probability than the minimum node in
+            // the pool, we don't need to add the new token and the following
+            // tokens belong to the same parent to the tree, because the tokens
+            // are sorted by their probability
+            break;
+          } else {
+            std::shared_ptr<TokenTreeNode> node_ptr =
+                std::make_shared<TokenTreeNode>(
+                    ssm_inference_result.token_ids[result_idx],
+                    log_accumulated_prob,
+                    parent_pos);
+            if (tokens.size() == empty_slots_on_tree and
+                log_accumulated_prob >
+                    (*tokens.begin())->log_accumulated_prob) {
+              // The token tree is full, and the new token has a higher joint
+              // probability than the minimum node in the pool, we need to
+              // remove the minimum node from the pool and add the new token to
+              // the tree
+              tokens.erase(tokens.begin());
+            }
+            tokens.insert(node_ptr);
           }
-          tokens.insert(node_ptr);
         }
       }
       parent_pos++;

From 0e9e87596159b2943dcf7ee0970559ba2aa91e4b Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 13 May 2024 20:23:13 -0400
Subject: [PATCH 250/667] Fixed request index issue.

---
 src/ops/inc_multihead_self_attention.cu      | 14 ++++--
 src/ops/spec_inc_multihead_self_attention.cu | 49 +++++++++++---------
 src/ops/tree_inc_multihead_self_attention.cu | 13 ++++--
 3 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 988432dac..168fc47cf 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -82,10 +82,12 @@ __global__ void compute_attention_kernel_generation_kernel(
   // request idx
   int const request_idx = blockIdx.y;
 
-  int requext_idx_in_batch = 0;
-  for (int i = 0; i < request_idx; i++) {
-    while (!request_available[requext_idx_in_batch]) {
-      requext_idx_in_batch++;
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
     }
   }
 
@@ -850,6 +852,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
   //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
   //   cudaMemcpy(
   //       temp_output, m->attn_heads, size * sizeof(float),
   //       cudaMemcpyDeviceToHost);
@@ -961,6 +964,9 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
       continue;
     }
     int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
+    if (num_new_tokens == 0) {
+      continue;
+    }
     int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
                        bc->requestsInfo[i].num_tokens_in_batch;
     // Step 1: compute query-key product QK.T/sqrt(d_k)
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 41e8bc38a..cb3537c4b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -74,10 +74,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   int const request_idx = blockIdx.y;
 
   // request id in batch config
-  int requext_idx_in_batch = 0;
-  for (int i = 0; i < request_idx; i++) {
-    while (!request_available[requext_idx_in_batch]) {
-      requext_idx_in_batch++;
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
     }
   }
 
@@ -353,25 +355,7 @@ __global__ void
     DT vVal = devQKVProjArray[val_idx + hidden_size];
 
     int const req_id = tokenInfos[token_idx].request_index;
-    // int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
-
-    int const request_token_offset =
-        requestInfo[req_id].first_token_offset_in_batch;
-
-    // BatchConfig::BitMask bitmask = causalMask[req_id];
-    BatchConfig::BitMask *bitmask = &causalMask[req_id];
-
-    // if prompt token -> token id
-    // if tree token:
-
-    // int const cache_idx = bitmask->prompt_size + bitmask->non_tree_cache_size
-    // +
-    //                       bitmask->tree_or_prompt_size - 1 -
-    //                       bitmask->current_layer_size + token_idx -
-    //                       request_token_offset;
-    int const cache_idx =
-        bitmask->non_tree_cache_size + bitmask->tree_or_prompt_size -
-        bitmask->current_layer_size + token_idx - request_token_offset;
+    int const cache_idx = tokenInfos[token_idx].abs_index_in_request;
 
     kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
                offset] = kVal;
@@ -750,6 +734,25 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
+  // Debug output:
+  int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  float *temp_output = new float[size];
+  cudaDeviceSynchronize();
+  cudaMemcpy(
+      temp_output, m->attn_heads, size * sizeof(float), cudaMemcpyDeviceToHost);
+
+  printf("Output: ");
+  for (int i = 0; i < bc->num_tokens; ++i) {
+    float temp = 0;
+    for (int j = 0; j < m->hidden_size; ++j) {
+      temp += temp_output[i * m->hidden_size + j];
+    }
+    printf("%.6f ", temp);
+  }
+  printf("\n");
+
+  delete[] temp_output;
+
   // compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 4e25ed301..66f2f4b05 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -80,10 +80,12 @@ __global__ void compute_attention_kernel_fused_kernel(
   int const request_idx = blockIdx.y;
 
   // request id in batch config
-  int requext_idx_in_batch = 0;
-  for (int i = 0; i < request_idx; i++) {
-    while (!request_available[requext_idx_in_batch]) {
-      requext_idx_in_batch++;
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
     }
   }
 
@@ -930,12 +932,13 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
   //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
   //   cudaMemcpy(
   //       temp_output, m->attn_heads, size * sizeof(float),
   //       cudaMemcpyDeviceToHost);
   //   printf("Output: ");
-  //   float temp = 0;
   //   for (int i = 0; i < 1; ++i) {
+  //     float temp = 0;
   //     for (int j = 0; j < m->hidden_size; ++j) {
   //       temp += temp_output[i * m->hidden_size + j];
   //     }

From c765011f994c7d1ce2c92f65cc41b345ac1f27d4 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 13 May 2024 22:52:40 -0400
Subject: [PATCH 251/667] Added width constrain to speculative trees.
 Parameterized max_tree_depth and max_tree_width.

---
 include/flexflow/batch_config.h    |  5 +--
 include/flexflow/request_manager.h |  9 +++++
 src/runtime/request_manager.cc     | 54 ++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 04369f6f5..5ac175e00 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -51,9 +51,10 @@ class BatchConfig {
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
   inline static int const MAX_NUM_TOKENS = 1024;
-  inline static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
-  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+  inline static int const MAX_SPEC_TREE_TOKEN_NUM = 128;
+  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 4;
   inline static int const MAX_TREE_DEPTH = 16;
+  inline static int const MAX_TREE_WIDTH = 64;
   inline static int const MAX_K_LOGITS = 16;
 
   int num_tokens = 0;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a810500d7..145986af6 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -219,6 +219,12 @@ class RequestManager {
   int get_max_sequence_length();
   void set_decoding_mode(DecodingMode mode);
   void set_verbose(bool verbose_);
+  int get_k();
+  void set_k(int k);
+  int get_max_tree_depth();
+  void set_max_tree_depth(int max_tree_depth);
+  int get_max_tree_width();
+  void set_max_tree_width(int max_tree_width);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -284,6 +290,9 @@ class RequestManager {
   int max_tokens_per_batch;
   int max_spec_tree_token_num;
   int max_sequence_length;
+  int max_tree_depth;
+  int max_tree_width;
+  int k;
   State request_manager_status;
   BackgroundServerStatus background_server_status;
   DecodingMode decoding_mode;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 519e7187f..094204cf3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -58,6 +58,9 @@ RequestManager::RequestManager()
   max_tokens_per_batch = -1;
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
+  max_tree_depth = -1;
+  max_tree_width = -1;
+  k = -1;
   std::fill(std::begin(request_available), std::end(request_available), false);
   std::fill(
       std::begin(guid_of_requests), std::end(guid_of_requests), INVALID_GUID);
@@ -122,6 +125,44 @@ void RequestManager::set_verbose(bool verbose_) {
   verbose = verbose_;
 }
 
+int RequestManager::get_k() {
+  assert(k > 0 and k <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM and "Invalid k");
+  return k;
+}
+
+void RequestManager::set_k(int _k) {
+  assert(_k > 0 and _k <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM and "Invalid k");
+  k = _k;
+}
+
+int RequestManager::get_max_tree_depth() {
+  assert(max_tree_depth > 0 and
+         max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and
+         "Invalid max_tree_depth");
+  return max_tree_depth;
+}
+
+void RequestManager::set_max_tree_depth(int max_tree_depth) {
+  assert(max_tree_depth > 0 and
+         max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and
+         "Invalid max_tree_depth");
+  this->max_tree_depth = max_tree_depth;
+}
+
+int RequestManager::get_max_tree_width() {
+  assert(max_tree_width > 0 and
+         max_tree_width <= BatchConfig::MAX_TREE_WIDTH and
+         "Invalid max_tree_width");
+  return max_tree_width;
+}
+
+void RequestManager::set_max_tree_width(int max_tree_width) {
+  assert(max_tree_width > 0 and
+         max_tree_width <= BatchConfig::MAX_TREE_WIDTH and
+         "Invalid max_tree_width");
+  this->max_tree_width = max_tree_width;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -1285,7 +1326,7 @@ bool RequestManager::update_ssm_inference_results(
 
   // Stop conditions
   return all_request_last_layer_empty ||
-         current_speculation_step > BatchConfig::MAX_TREE_DEPTH;
+         current_speculation_step > get_max_tree_depth();
 }
 
 /* --------- Bitmask Related Functions --------- */
@@ -1820,10 +1861,11 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     int result_offset = request.first_token_offset_in_batch *
                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     int current_tree_size = request.causal_mask.tree_or_prompt_size;
-    int empty_slots_on_tree = get_max_spec_tree_token_num() -
-                              current_tree_size; // The number of empty slots
+    int empty_slots_in_layer =
+        min(get_max_spec_tree_token_num() - current_tree_size,
+            get_max_tree_width()); // The number of empty slots
 
-    if (empty_slots_on_tree == 0) {
+    if (empty_slots_in_layer == 0) {
       // The token tree is full, we don't need to add tokens to it
       continue;
     }
@@ -1857,7 +1899,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
           assert(log_prob != -std::numeric_limits<float>::infinity() &&
                  "Child log probability should not be -inf.");
 
-          if (tokens.size() == empty_slots_on_tree and
+          if (tokens.size() == empty_slots_in_layer and
               log_accumulated_prob <= (*tokens.begin())->log_accumulated_prob) {
             // The token tree is full, and the new token has a lower joint
             // probability than the minimum node in the pool, we don't need to
@@ -1880,7 +1922,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
                     ssm_inference_result.token_ids[result_idx],
                     log_accumulated_prob,
                     parent_pos);
-            if (tokens.size() == empty_slots_on_tree and
+            if (tokens.size() == empty_slots_in_layer and
                 log_accumulated_prob >
                     (*tokens.begin())->log_accumulated_prob) {
               // The token tree is full, and the new token has a higher joint

From d4c03c3109eaecd3809121b10f5b296cb51665a6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 14 May 2024 13:07:44 -0400
Subject: [PATCH 252/667] Fixed bug that leads to invalid last output index.

---
 src/ops/argmax.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index 05c84719c..d9fb7198d 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -23,7 +23,7 @@ __global__ void init_offset(int batch_size,
                             int vocab_size,
                             int total_eles,
                             int *d_offsets) {
-  CUDA_KERNEL_LOOP(i, total_eles) {
+  CUDA_KERNEL_LOOP(i, total_eles + 1) {
     if (i % vocab_size == 0) {
       d_offsets[i / vocab_size] = i;
     }
@@ -83,7 +83,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m,
                           prob_ptr,
                           batch_size,
                           m->beam_search);
-  // print_tensor<int>(indices_ptr, 32, "argmax op");
+  //   print_tensor<int>(indices_ptr, 4, "argmax op");
 }
 
 /*static*/
@@ -151,7 +151,7 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler,
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
-  size_t d_offsets_size = batch_size;
+  size_t d_offsets_size = batch_size + 1;
   size_t prob_size = batch_size;
   assert(data_type == DT_FLOAT || data_type == DT_HALF);
   size_t total_size =

From a7084723ddd57c1b2824449c99de6e95985dd8ae Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 14 May 2024 13:08:06 -0400
Subject: [PATCH 253/667] Commented debug output.

---
 src/ops/spec_inc_multihead_self_attention.cu | 35 ++++++++++----------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index cb3537c4b..92fd54ade 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -735,23 +735,24 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
   }
 
   // Debug output:
-  int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
-  float *temp_output = new float[size];
-  cudaDeviceSynchronize();
-  cudaMemcpy(
-      temp_output, m->attn_heads, size * sizeof(float), cudaMemcpyDeviceToHost);
-
-  printf("Output: ");
-  for (int i = 0; i < bc->num_tokens; ++i) {
-    float temp = 0;
-    for (int j = 0; j < m->hidden_size; ++j) {
-      temp += temp_output[i * m->hidden_size + j];
-    }
-    printf("%.6f ", temp);
-  }
-  printf("\n");
-
-  delete[] temp_output;
+  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, size * sizeof(float),
+  //       cudaMemcpyDeviceToHost);
+
+  //   printf("Output: ");
+  //   for (int i = 0; i < bc->num_tokens; ++i) {
+  //     float temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%.6f ", temp);
+  //   }
+  //   printf("\n");
+
+  //   delete[] temp_output;
 
   // compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();

From 97f7bb69e61a64fda94687d2f6cea23b24a1eff5 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Tue, 14 May 2024 13:08:56 -0400
Subject: [PATCH 254/667] Bug fix.

---
 src/runtime/request_manager.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 094204cf3..249abd0c0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1473,7 +1473,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 
 void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
-  int llm_result_offset = 0;
   // This function maintain the generated token list of the request and the
   // committed tokens.
   for (int request_index = 0; request_index < get_max_requests_per_batch();
@@ -1485,6 +1484,7 @@ void RequestManager::get_verify_results_greedy(
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
+    int llm_result_offset = request.first_token_offset_in_batch;
     int committed_token_index = request.tokens.size() - 1;
 
     TokenTree &token_tree = request.speculative_token_trees[0];
@@ -1569,8 +1569,6 @@ void RequestManager::get_verify_results_greedy(
         llm_verify_result
             .token_ids[llm_result_offset + last_accepted_token_index]);
 
-    llm_result_offset = request.first_token_offset_in_batch;
-
     request.llm_committed = false;
     request.ssm_committed = false;
 
@@ -1871,7 +1869,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     }
 
     bool token_pool_full =
-        token_tree_node_pool.size() == get_max_tokens_per_batch();
+        token_tree_node_pool.size() >= get_max_tokens_per_batch();
 
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
@@ -1943,12 +1941,17 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     spec_token_tree.add_layer();
     for (auto token_it = tokens.crbegin(); token_it != tokens.crend();
          token_it++) {
+      token_pool_full =
+          token_tree_node_pool.size() == get_max_tokens_per_batch();
       if (token_pool_full and
           token_tree_node_pool.top().first->log_accumulated_prob >=
               (*token_it)->log_accumulated_prob) {
         break;
       } else if (token_pool_full) {
         token_tree_node_pool.top().first->pruned = true;
+        all_requests[token_tree_node_pool.top().second]
+            .speculative_token_trees[0]
+            .tree_size--;
         token_tree_node_pool.pop();
       }
 
@@ -1981,7 +1984,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     for (auto it = last_layer.begin(); it != last_layer.end();) {
       if ((*it)->pruned) {
         it = last_layer.erase(it);
-        spec_token_tree.tree_size--;
+        // spec_token_tree.tree_size--;
       } else {
         ++it;
       }
@@ -1992,6 +1995,8 @@ bool RequestManager::add_tokens_to_spec_token_tree(
       spec_token_tree.tree_layers.pop_back();
     }
   }
+  assert(token_tree_node_pool.size() <= get_max_tokens_per_batch() &&
+         "The token tree node pool should not exceed the maximum size.");
   return all_request_last_layer_empty;
 }
 

From 2bb2e75d1a3f7eec515aead397a8e59904feb99d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 15 May 2024 11:11:31 -0400
Subject: [PATCH 255/667] Added some parameters for the experiment.

---
 inference/spec_infer/spec_infer.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 1b86a9c30..141538c75 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -277,8 +277,10 @@ void FlexFlow::top_level_task(Task const *task,
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 256;
   int max_sequence_length = 512;
-  int max_spec_tree_token_num = 8;
+  int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
+  int max_tree_depth = 16;
+  int max_tree_width = 16;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::SPECULATIVE_DECODING;
 
@@ -310,6 +312,8 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_spec_tree_token_num(max_spec_tree_token_num);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_tree_depth(max_tree_depth);
+  rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
@@ -357,6 +361,9 @@ void FlexFlow::top_level_task(Task const *task,
   FFConfig bm_config = ffconfig;
   bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree =
       bm_config.pipeline_parallelism_degree = 1;
+  //   bm_config.data_parallelism_degree = 1;
+  //   bm_config.tensor_parallelism_degree = 4;
+  //   bm_config.pipeline_parallelism_degree = 1;
   for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
     FFModel beam_model(bm_config);
     ssm_models.push_back(beam_model);

From 54954fc5a4559da8c3e3629d21860fc38fdd1b35 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 15 May 2024 11:11:46 -0400
Subject: [PATCH 256/667] Fixed a bug.

---
 src/runtime/model.cc | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index b68e12a39..a27c2c0f9 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -3327,10 +3327,14 @@ void FFModel::create_operators_from_layers() {
     }
     Op *op = nullptr;
     // add a combine before arg_topk
+    // if (config.computationMode == COMP_MODE_INFERENCE &&
+    //     config.tensor_parallelism_degree > 1 &&
+    //     (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX ||
+    //      l->op_type == OP_ARGMAX || l->op_type == OP_GUMBEL_TOPK)) {
     if (config.computationMode == COMP_MODE_INFERENCE &&
         config.tensor_parallelism_degree > 1 &&
-        (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX ||
-         l->op_type == OP_ARGMAX || l->op_type == OP_GUMBEL_TOPK)) {
+        (l->op_type == OP_SOFTMAX || l->op_type == OP_ARGMAX ||
+         l->op_type == OP_GUMBEL_TOPK)) {
       std::vector<ParallelTensor> partitioned_inputs;
       assert(inputs.size() == 1);
       Combine *comb = new Combine(*this,
@@ -5962,11 +5966,13 @@ void register_flexflow_internal_tasks(Runtime *runtime,
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<OpMeta *, GumbelTopK::init_task>(registrar);
+      runtime->register_task_variant<OpMeta *, GumbelTopK::init_task>(
+          registrar);
     }
   }
   {
-    TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_TASK_ID, "GumbelTopK Inference");
+    TaskVariantRegistrar registrar(GUMBEL_TOPK_INF_TASK_ID,
+                                   "GumbelTopK Inference");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     if (pre_register) {
@@ -5977,8 +5983,9 @@ void register_flexflow_internal_tasks(Runtime *runtime,
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<InferenceResult, GumbelTopK::inference_task>(
-          registrar);
+      runtime
+          ->register_task_variant<InferenceResult, GumbelTopK::inference_task>(
+              registrar);
     }
   }
   {

From 36b2143b9ffff8c55eb52088730dfaab1a966aab Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 15 May 2024 11:12:26 -0400
Subject: [PATCH 257/667] Modifed some parameters for a inc decoding test
 setting.

---
 tests/inference/cpp_inference_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh
index c9e4b3621..cceca7845 100755
--- a/tests/inference/cpp_inference_tests.sh
+++ b/tests/inference/cpp_inference_tests.sh
@@ -45,7 +45,7 @@ fi
 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4
 
 # LLAMA (big model)
-../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4
+../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 20000 -ll:zsize 30000 --verbose --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4
 # LLAMA (big model, half precision)
 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4
 

From b07c48b07c8e2dda4ab44ee687dcafb704c479e1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 16 May 2024 00:30:01 -0400
Subject: [PATCH 258/667] Removed some synchronization.

---
 include/flexflow/request_manager.h           |  8 +++++---
 src/ops/inc_multihead_self_attention.cu      |  2 +-
 src/ops/spec_inc_multihead_self_attention.cu | 15 ++++++---------
 src/ops/tree_inc_multihead_self_attention.cu |  2 +-
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 145986af6..076768efb 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -345,9 +345,11 @@ class RequestManager {
   size_t num_processed_requests;
 
   struct ProfileInfo {
-    int llm_decoding_steps;
-    int ssm_decoding_steps;
-    double start_time, finish_time;
+    int llm_prefilling_steps = 0;
+    int ssm_prefilling_steps = 0;
+    int llm_decoding_steps = 0;
+    int ssm_decoding_steps = 0;
+    long long start_time = 0, start_decoding_time = 0, finish_time = 0;
   };
   std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
   double total_request_run_time;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 168fc47cf..cf42245f5 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -92,7 +92,7 @@ __global__ void compute_attention_kernel_generation_kernel(
   }
 
   // threads converge
-  __syncthreads();
+  //   __syncthreads();
 
   int const first_step = 0;
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 92fd54ade..f9a4a168a 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -84,7 +84,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   }
 
   // threads converge
-  __syncthreads();
+  //   __syncthreads();
 
   // request_idx = re
 
@@ -279,14 +279,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
         int const ti_circ = ti % max_seq_length;
         V_vec v = *reinterpret_cast<V_vec const *>(
             v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-
-        bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                           (!test_bit(bitmask->bit_mask,
-                                      query_token,
-                                      ti - bitmask->non_tree_cache_size)));
-        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-        //   (1 << query_token))));
-        float logit = mask ? 0.0f : qk_smem[ti - first_step];
+        float logit = qk_smem[ti - first_step];
         out = FlexFlow::fma(logit, cast_to_float(v), out);
       }
     }
@@ -713,6 +706,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                       cudaStream_t stream) {
   // phase 1: Implement kernel to compute KQV for input tokens
 
+  long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
   compute_qkv_kernel(m,
                      bc,
                      shard_id,
@@ -759,6 +753,9 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 
   compute_o_prod_bias(
       m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream);
+  time_2 = Realm::Clock::current_time_in_microseconds();
+  std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
+            << (time_2 - time_1) << "us" << std::endl;
 }
 
 } // namespace SpecIncMultiHeadSelfAttention
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 66f2f4b05..273de4d5f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -90,7 +90,7 @@ __global__ void compute_attention_kernel_fused_kernel(
   }
 
   // threads converge
-  __syncthreads();
+  //   __syncthreads();
 
   int const first_step = 0;
 

From 6fac6e19a46c7dbfd625b8193bb20fe55dffda3a Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 16 May 2024 00:30:23 -0400
Subject: [PATCH 259/667] Removed unused code.

---
 include/flexflow/request_manager.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 076768efb..cc4d77fec 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -395,11 +395,6 @@ class RequestManager {
                                    BatchConfig::TokenId token_id);
   bool add_tokens_to_spec_token_tree(
       InferenceResult const &ssm_inference_result);
-  //   bool add_token_to_spec_token_tree(RequestGuid guid,
-  //                                     BatchConfig::TokenId token_id,
-  //                                     int parent_pos,
-  //                                     float log_accumulated_prob);
-  //   bool prune_last_layer_of_spec_token_trees();
   /* ---------- Spec Decoding Helper Functions ---------- */
 };
 

From 74d79b1ab382c9903cf73ae345f12c56e70d2521 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 16 May 2024 00:31:08 -0400
Subject: [PATCH 260/667] Added profiling.

---
 src/runtime/request_manager.cc | 93 ++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 26 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 249abd0c0..dd54e0710 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -440,12 +440,17 @@ void RequestManager::load_pending_reqeust_to_batch() {
   num_available_requests++;
   // Initialize the bitmask for the new request with its prompt length
   init_bitmask_prompt(guid, prefill_request->tokens.size());
+
+  profiling_requests[guid] = ProfileInfo();
+  profiling_requests[guid].start_time =
+      Realm::Clock::current_time_in_microseconds();
 }
 
 void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
+  profiling_requests[guid].finish_time =
+      Realm::Clock::current_time_in_microseconds();
   Request &request = all_requests[guid];
-
   guid_of_requests[batch_index] = INVALID_GUID;
   request_available[batch_index] = false;
   num_available_requests--;
@@ -454,7 +459,23 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   std::string output = this->tokenizer_->Decode(request.tokens);
   std::cout << "Request " << guid << " completed: " << std::endl
             << output << std::endl;
-  // TODO: remove the request from all_requests?
+  std::cout << "Request " << guid << " profiling: " << std::endl
+            << "Decoding time: "
+            << (profiling_requests[guid].finish_time -
+                profiling_requests[guid].start_decoding_time) *
+                   1e-3
+            << "ms" << std::endl
+            << "Total time: "
+            << (profiling_requests[guid].finish_time -
+                profiling_requests[guid].start_time) *
+                   1e-3
+            << "ms" << std::endl
+            << "LLM decoding steps: "
+            << profiling_requests[guid].llm_decoding_steps << std::endl;
+  if (decoding_mode == SPECULATIVE_DECODING) {
+    std::cout << "SSM decoding steps: "
+              << profiling_requests[guid].ssm_decoding_steps << std::endl;
+  }
 
   trigger_request_completion_future(guid);
 }
@@ -615,6 +636,8 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
     }
   }
 
+  profiling_requests[prefill_request->guid].llm_prefilling_steps++;
+
   // Manages the committed states for other requests in the batch
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -655,7 +678,9 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     request.llm_cache_size++;
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
-    if (request.tokens.size() == get_max_sequence_length()) {
+
+    profiling_requests[guid].llm_decoding_steps++;
+    if (request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
       request_complete_clean_up(request_index);
     }
@@ -675,6 +700,9 @@ bool RequestManager::update_ssm_prefill_results(
   // request_manager_status is PREFILLING and the prefill_model is SSM.
   // There's no results to update, but we should update ssm_cache_size.
   prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
+
+  profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
+
   if (prefill_request->ssm_cache_size == prefill_request->tokens.size()) {
     return true;
   }
@@ -900,6 +928,11 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
     bc.num_tokens++;
+
+    if (profiling_requests[request.guid].llm_decoding_steps == 0) {
+      profiling_requests[request.guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
+    }
   }
 
   if (verbose) {
@@ -942,9 +975,6 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    // TODO: check this profiling, what is profiling
-    profiling_requests[request.guid].ssm_decoding_steps += 1;
-
     std::vector<Request::CommittedToken> &committed_tokens =
         request.committed_tokens;
 
@@ -983,6 +1013,11 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     // Copy the causal mask, it should already been updated in
     // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
+
+    if (profiling_requests[guid].ssm_decoding_steps == 0) {
+      profiling_requests[guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
+    }
   }
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
@@ -1020,8 +1055,6 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
     assert(request.status == Request::RUNNING);
     new_bc.requestsInfo[request_index].first_token_offset_in_batch =
         new_bc.num_tokens;
-    // TODO: check this profiling
-    profiling_requests[request.guid].ssm_decoding_steps += 1;
 
     // Fill in the tokens
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
@@ -1116,9 +1149,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    // TODO: check this profiling
-    profiling_requests[request.guid].llm_decoding_steps += 1;
-
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size() - 1; // Exclude the last token
@@ -1226,11 +1256,19 @@ bool RequestManager::update_llm_verify_results(
         request.committed_tokens.clear();
       }
     }
+
+    profiling_requests[guid].llm_decoding_steps++;
   }
 
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
+  // Clear the token tree node pool
+  token_tree_node_pool = std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
+      CompareSharedTokenTreeNodePtrRequestGuidPair>();
+
   bool request_completed = false;
 
   // Iterate over the requests
@@ -1266,12 +1304,6 @@ bool RequestManager::update_llm_verify_results(
     }
   }
 
-  // Clear the token tree node pool
-  token_tree_node_pool = std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      CompareSharedTokenTreeNodePtrRequestGuidPair>();
-
   // Some requests may be completed after appending the verified tokens.
   // If there is a request completed, return true.
   return request_completed;
@@ -1322,6 +1354,8 @@ bool RequestManager::update_ssm_inference_results(
       init_bitmask_spec(guid);
     }
     append_bitmask(guid);
+
+    profiling_requests[guid].ssm_decoding_steps++;
   }
 
   // Stop conditions
@@ -1745,26 +1779,32 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   request_manager_status = PREFILLING;
   prefill_model = SSM;
 
+  long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
+
   while (!is_background_server_terminated()) {
-    last_irf.get_void_result();
+    // last_irf.get_void_result();
     BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
     bcf.get_void_result();
+    time_2 = Realm::Clock::current_time_in_microseconds();
+    std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
+              << std::endl;
 
+    time_1 = Realm::Clock::current_time_in_microseconds();
     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
         request_manager_status == LLM_VERIFY) {
-      std::cout << "Branch 1" << std::endl;
+      //   std::cout << "Branch 1" << std::endl;
       runtime->begin_trace(ctx, 12345 /*trace_id*/);
       FutureMap fm = im->inference(llm, 0, bcf);
-      assert(fm.get_future_map_domain().get_volume() == 1);
+      //   assert(fm.get_future_map_domain().get_volume() == 1);
       last_irf = fm.get_future(0);
       runtime->end_trace(ctx, 12345 /*trace_id*/);
     } else if ((request_manager_status == PREFILLING and
                 prefill_model == SSM) or
                request_manager_status == SSM_SPEC) {
-      std::cout << "Branch 2" << std::endl;
+      //   std::cout << "Branch 2" << std::endl;
       runtime->begin_trace(ctx, 23456 /*trace_id*/);
       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-      assert(fm.get_future_map_domain().get_volume() == 1);
+      //   assert(fm.get_future_map_domain().get_volume() == 1);
       last_irf = fm.get_future(0);
       runtime->end_trace(ctx, 23456 /*trace_id*/);
     } else {
@@ -1890,10 +1930,11 @@ bool RequestManager::add_tokens_to_spec_token_tree(
           float log_accumulated_prob =
               log_prob + parent_ptr->log_accumulated_prob;
 
-          std::cout << "Probability at result index " << result_idx << ": "
-                    << ssm_inference_result.probs[result_idx] << "\t";
-          std::cout << "Token id: "
-                    << ssm_inference_result.token_ids[result_idx] << std::endl;
+          //   std::cout << "Probability at result index " << result_idx << ": "
+          //             << ssm_inference_result.probs[result_idx] << "\t";
+          //   std::cout << "Token id: "
+          //             << ssm_inference_result.token_ids[result_idx] <<
+          //             std::endl;
           assert(log_prob != -std::numeric_limits<float>::infinity() &&
                  "Child log probability should not be -inf.");
 

From 6df6030158b30681a9cbc4f311ee078980202a4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= <remi.delacourt@gmail.com>
Date: Fri, 17 May 2024 01:50:23 +0000
Subject: [PATCH 261/667] Add profiling and write statistics to output file

---
 include/flexflow/request_manager.h |  19 ++++-
 src/runtime/request_manager.cc     | 107 +++++++++++++++++++++++++----
 2 files changed, 111 insertions(+), 15 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index cc4d77fec..f125f8955 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -344,16 +344,29 @@ class RequestManager {
   // TODO: maintain this field
   size_t num_processed_requests;
 
-  struct ProfileInfo {
+  struct RequestProfileInfo {
     int llm_prefilling_steps = 0;
     int ssm_prefilling_steps = 0;
     int llm_decoding_steps = 0;
     int ssm_decoding_steps = 0;
     long long start_time = 0, start_decoding_time = 0, finish_time = 0;
   };
-  std::unordered_map<RequestGuid, ProfileInfo> profiling_requests;
+  struct ProfileInfo {
+    // For SpecInfer: One step is comprised of one ssm speculation phase + a single llm verification phase (forward pass + verification)
+    // For Incr Decoding: One step is one LLM decoding phase
+    long long llm_step_start = 0, ssm_step_start = 0;
+    // Times for each LLM verification phase (in ms)
+    std::vector<double> llm_step_times;
+    // Times for each SSM speculation phase (in ms)
+    std::vector<double> ssm_step_times;
+    // Number of requests getting decoded at each step
+    std::vector<int> requests_per_step;
+  };
+
+  ProfileInfo profiling;
+  std::unordered_map<RequestGuid, RequestProfileInfo> profiling_requests;
   double total_request_run_time;
-  void load_pending_reqeust_to_batch();
+  void load_pending_request_to_batch();
   void request_complete_clean_up(int batch_index);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index dd54e0710..6f9e2db4c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -32,6 +32,20 @@ using tokenizers::Tokenizer;
 
 LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
+void write_to_output_file(std::string const &output_filepath, std::string const &str) {
+  if (!output_filepath.empty()) {
+    std::ofstream outputFile(output_filepath, std::ios::app);
+    if (outputFile.is_open()) {
+      outputFile << str << std::endl;
+      outputFile.close();
+    } else {
+      std::cout << "Unable to open the output file: " << output_filepath
+                << std::endl;
+      assert(false);
+    }
+  }
+}
+
 std::string LoadBytesFromFile(std::string const &path) {
   std::ifstream fs(path, std::ios::in | std::ios::binary);
   assert(fs.is_open() && "Failed to open file for reading.");
@@ -328,11 +342,12 @@ RequestManager::RequestGuid
 
   {
     std::string output = "New request tokens:";
-    output = "[" + std::to_string(request.guid) + "]" + output;
+    output = "[" + std::to_string(request.guid) + "] " + output;
     for (int i = 0; i < request.tokens.size(); i++) {
       output = output + " " + std::to_string(request.tokens[i]);
     }
     log_req_mgr.print("%s", output.c_str());
+    write_to_output_file(output_filepath, output);
   }
 
   GenerationResult gr;
@@ -422,7 +437,7 @@ BatchConfig
   return prepare_next_batch();
 }
 
-void RequestManager::load_pending_reqeust_to_batch() {
+void RequestManager::load_pending_request_to_batch() {
   assert(!pending_request_queue.empty() && "No pending request to process.");
   RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
@@ -441,7 +456,7 @@ void RequestManager::load_pending_reqeust_to_batch() {
   // Initialize the bitmask for the new request with its prompt length
   init_bitmask_prompt(guid, prefill_request->tokens.size());
 
-  profiling_requests[guid] = ProfileInfo();
+  profiling_requests[guid] = RequestProfileInfo();
   profiling_requests[guid].start_time =
       Realm::Clock::current_time_in_microseconds();
 }
@@ -476,6 +491,22 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     std::cout << "SSM decoding steps: "
               << profiling_requests[guid].ssm_decoding_steps << std::endl;
   }
+  std::string str = "[" + std::to_string(guid) + "] Request completed: " + 
+                      "decoding_time_ms(" + std::to_string(
+                        (profiling_requests[guid].finish_time-
+                          profiling_requests[guid].start_decoding_time)
+                          *1e-3) + ") " + 
+                      "total_time_ms(" + std::to_string(
+                        (profiling_requests[guid].finish_time-
+                          profiling_requests[guid].start_time)
+                          *1e-3) + ") " + 
+                      "LLM_decoding_steps(" + std::to_string(
+                        profiling_requests[guid].llm_decoding_steps) 
+                        + ") " + 
+                      "SSM_decoding_steps(" + std::to_string(
+                        profiling_requests[guid].ssm_decoding_steps) 
+                        + ")";
+  write_to_output_file(output_filepath, str);
 
   trigger_request_completion_future(guid);
 }
@@ -489,7 +520,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     // Update nothing
     if (!pending_request_queue.empty()) {
       // Load the pending request to the batch
-      load_pending_reqeust_to_batch();
+      load_pending_request_to_batch();
       request_manager_status = PREFILLING;
       if (decoding_mode == SPECULATIVE_DECODING) {
         prefill_model = SSM;
@@ -510,7 +541,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           if (num_available_requests < get_max_requests_per_batch() &&
               !pending_request_queue.empty()) {
             // Load the pending request to the batch
-            load_pending_reqeust_to_batch();
+            load_pending_request_to_batch();
             request_manager_status = PREFILLING;
           } else {
             // No more empty slots, start the decoding
@@ -534,7 +565,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             if (num_available_requests < get_max_requests_per_batch() &&
                 !pending_request_queue.empty()) {
               // Load the pending request to the batch
-              load_pending_reqeust_to_batch();
+              load_pending_request_to_batch();
               request_manager_status = PREFILLING;
               prefill_model = SSM;
             } else {
@@ -560,7 +591,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           request_manager_status = DECODING;
         } else {
           request_manager_status = PREFILLING;
-          load_pending_reqeust_to_batch();
+          load_pending_request_to_batch();
         }
       }
       break;
@@ -573,7 +604,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           current_speculation_step = 0;
         } else {
           request_manager_status = PREFILLING;
-          load_pending_reqeust_to_batch();
+          load_pending_request_to_batch();
           prefill_model = SSM;
         }
       } else {
@@ -666,6 +697,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   bool request_completed = false;
+  int nb_requests_decoded = 0;
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -680,6 +712,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
         result.token_ids[request.first_token_offset_in_batch]);
 
     profiling_requests[guid].llm_decoding_steps++;
+    nb_requests_decoded++;
     if (request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
       request_complete_clean_up(request_index);
@@ -691,6 +724,10 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
                 << output << std::endl;
     }
   }
+  profiling.llm_step_times.push_back((
+        Realm::Clock::current_time_in_microseconds() - 
+        profiling.llm_step_start) * 1e-3);
+  profiling.requests_per_step.push_back(nb_requests_decoded);
   return request_completed;
 }
 
@@ -939,6 +976,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl;
     bc.print();
   }
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
   return bc;
 }
 /* ----- Speculative Inference Specific functions ----- */
@@ -1018,6 +1056,8 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       profiling_requests[guid].start_decoding_time =
           Realm::Clock::current_time_in_microseconds();
     }
+    profiling.ssm_step_start = 
+      Realm::Clock::current_time_in_microseconds();
   }
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
@@ -1213,6 +1253,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl;
     new_bc.print();
   }
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
   return new_bc;
 }
 
@@ -1233,6 +1274,7 @@ bool RequestManager::update_llm_verify_results(
 
   // Update llm_cache_size with the last committed_tokens, and clear
   // committed_tokens
+  int nb_requests_decoded = 0;
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1258,11 +1300,17 @@ bool RequestManager::update_llm_verify_results(
     }
 
     profiling_requests[guid].llm_decoding_steps++;
+    nb_requests_decoded++;
   }
 
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
+  profiling.llm_step_times.push_back((
+        Realm::Clock::current_time_in_microseconds() - 
+        profiling.llm_step_start) * 1e-3);
+  profiling.requests_per_step.push_back(nb_requests_decoded);
+
   // Clear the token tree node pool
   token_tree_node_pool = std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
@@ -1311,7 +1359,7 @@ bool RequestManager::update_llm_verify_results(
 
 bool RequestManager::update_ssm_inference_results(
     InferenceResult const &ssm_inference_result) {
-  // This function returns false if no tokens are added to the token tree,
+  // This function returns true if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
   assert(current_speculation_step >= 0 &&
          "The current speculation step should be no less than 0");
@@ -1359,14 +1407,22 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  return all_request_last_layer_empty ||
-         current_speculation_step > get_max_tree_depth();
+  if (all_request_last_layer_empty || 
+        current_speculation_step > get_max_tree_depth()) {
+    // Update profiling statistics before returning
+    profiling.ssm_step_times.push_back((
+        Realm::Clock::current_time_in_microseconds() -
+        profiling.ssm_step_start) * 1e-3
+      );
+    return true;
+  }
+  return false;
 }
 
 /* --------- Bitmask Related Functions --------- */
 
 void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
-  // This method is called by load_pending_reqeust_to_batch when there is a
+  // This method is called by load_pending_request_to_batch when there is a
   // new request to load into the batch
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
@@ -1829,6 +1885,33 @@ void RequestManager::terminate_background_server_at_exit() {
 
 void RequestManager::terminate_background_server() {
   if (background_server_status == SERVING) {
+    assert(profiling.llm_step_times.size() == profiling.requests_per_step.size());
+    // Write the last profiling statistics to output file
+    std::string str = "[Profiling Statistics]\n llm_step_times_ms(";
+    std::string llm_step_times_ms = " ";
+    for (double time : profiling.llm_step_times) {
+      llm_step_times_ms += std::to_string(time) + " ";
+    }
+    llm_step_times_ms += ")";
+    str += llm_step_times_ms;
+    str += "\n requests_per_step(";
+    std::string req_per_step = " ";
+    for (int nb : profiling.requests_per_step) {
+      req_per_step += std::to_string(nb) + " ";
+    }
+    req_per_step += ")";
+    str += req_per_step;
+    if (profiling.ssm_step_times.size() > 0) {
+      assert(profiling.ssm_step_times.size() == profiling.llm_step_times.size());
+      str += "\n ssm_step_times_ms(";
+      std::string ssm_step_times_ms = " ";
+      for (double time : profiling.ssm_step_times) {
+        ssm_step_times_ms += std::to_string(time) + " ";
+      }
+      ssm_step_times_ms += ")";
+      str += ssm_step_times_ms;
+    }
+    write_to_output_file(output_filepath, str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate
     Runtime *runtime = Runtime::get_runtime();

From 55ccb346cd1d17fd2ecabdfd3e772a9946031392 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 17 May 2024 01:10:15 -0700
Subject: [PATCH 262/667] chore: output profiling to file

---
 src/runtime/request_manager.cc | 42 +++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index dd54e0710..d6117ad69 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -459,22 +459,32 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   std::string output = this->tokenizer_->Decode(request.tokens);
   std::cout << "Request " << guid << " completed: " << std::endl
             << output << std::endl;
-  std::cout << "Request " << guid << " profiling: " << std::endl
-            << "Decoding time: "
-            << (profiling_requests[guid].finish_time -
-                profiling_requests[guid].start_decoding_time) *
-                   1e-3
-            << "ms" << std::endl
-            << "Total time: "
-            << (profiling_requests[guid].finish_time -
-                profiling_requests[guid].start_time) *
-                   1e-3
-            << "ms" << std::endl
-            << "LLM decoding steps: "
-            << profiling_requests[guid].llm_decoding_steps << std::endl;
-  if (decoding_mode == SPECULATIVE_DECODING) {
-    std::cout << "SSM decoding steps: "
-              << profiling_requests[guid].ssm_decoding_steps << std::endl;
+  ProfileInfo profile_info = profiling_requests[guid];
+  if (!output_filepath.empty()) {
+    std::ofstream outputFile(output_filepath, std::ios::app);
+    if (outputFile.is_open()) {
+      outputFile << "Request " << guid << " profiling: " << std::endl;
+      outputFile << "Decoding time: "
+                  << (profile_info.finish_time -
+                  profile_info.start_decoding_time) * 1e-3
+                  << "ms" << std::endl;
+      outputFile << "Total time: "
+                  << (profile_info.finish_time -
+                      profile_info.start_time) * 1e-3
+                  << "ms" << std::endl;
+      outputFile << "LLM decoding steps: "
+                   << profile_info.llm_decoding_steps << std::endl;
+      if (decoding_mode == SPECULATIVE_DECODING) {
+        outputFile << "SSM decoding steps: "
+                    << profile_info.ssm_decoding_steps << std::endl;
+      }
+      outputFile << output << std::endl << std::endl;
+      outputFile.close();
+    } else {
+      std::cout << "Unable to open the output file: " << output_filepath
+                << std::endl;
+      assert(false);
+    }
   }
 
   trigger_request_completion_future(guid);

From ccc457d789d1b506f7ac267f8b0e32624f3ddc0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= <remi.delacourt@gmail.com>
Date: Fri, 17 May 2024 23:07:26 +0000
Subject: [PATCH 263/667] Count tokens generated per step

---
 include/flexflow/request_manager.h           |  4 +++-
 src/ops/spec_inc_multihead_self_attention.cu |  4 ++--
 src/runtime/request_manager.cc               | 15 +++++++++++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f125f8955..17a161fe7 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -79,7 +79,7 @@ struct Request {
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
-  // Here we maintain a struct CommitTokens which has a field `from_index` and
+  // Here we maintain a struct CommittedToken which has a field `from_index` and
   // `to_index`. The `from_index` is used by the LLM KV cache commitment and the
   // `to_index` is used both by the the SSM KV cache recomputation and the LLM
   // KV cache commitment. Details are as follows:
@@ -361,6 +361,8 @@ class RequestManager {
     std::vector<double> ssm_step_times;
     // Number of requests getting decoded at each step
     std::vector<int> requests_per_step;
+    // Number of generated tokens at each step
+    std::vector<int> generated_tokens_per_step;
   };
 
   ProfileInfo profiling;
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index f9a4a168a..94129ad7a 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -754,8 +754,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
   compute_o_prod_bias(
       m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream);
   time_2 = Realm::Clock::current_time_in_microseconds();
-  std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
-            << (time_2 - time_1) << "us" << std::endl;
+  // std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
+  //           << (time_2 - time_1) << "us" << std::endl;
 }
 
 } // namespace SpecIncMultiHeadSelfAttention
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6f9e2db4c..0db0d56fc 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -728,6 +728,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
         Realm::Clock::current_time_in_microseconds() - 
         profiling.llm_step_start) * 1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
+  profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
   return request_completed;
 }
 
@@ -1565,6 +1566,7 @@ void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
   // This function maintain the generated token list of the request and the
   // committed tokens.
+  int total_nb_generated_tokens = 0;
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1662,6 +1664,7 @@ void RequestManager::get_verify_results_greedy(
     request.llm_committed = false;
     request.ssm_committed = false;
 
+    total_nb_generated_tokens += request.committed_tokens.size();
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
       for (auto const &committed_token : request.committed_tokens) {
@@ -1673,6 +1676,7 @@ void RequestManager::get_verify_results_greedy(
       std::cout << "Output sequence: " << output << std::endl;
     }
   }
+  profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens);
 }
 
 // TODO: the max_seq_length is not used in the current implementation
@@ -1842,8 +1846,8 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
     bcf.get_void_result();
     time_2 = Realm::Clock::current_time_in_microseconds();
-    std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
-              << std::endl;
+    // std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
+    //           << std::endl;
 
     time_1 = Realm::Clock::current_time_in_microseconds();
     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
@@ -1911,6 +1915,13 @@ void RequestManager::terminate_background_server() {
       ssm_step_times_ms += ")";
       str += ssm_step_times_ms;
     }
+    str += "\n generated_tokens_per_step(";
+    std::string generated_tokens_per_step = " ";
+    for (int nb : profiling.generated_tokens_per_step) {
+      generated_tokens_per_step += std::to_string(nb) + " ";
+    }
+    generated_tokens_per_step += ")";
+    str += generated_tokens_per_step;
     write_to_output_file(output_filepath, str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate

From e0c0c17d2daf81209bc2a12448d1a0210688eb80 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 21 May 2024 02:20:34 -0700
Subject: [PATCH 264/667] chore: eliminate immediate outputs

---
 src/ops/spec_inc_multihead_self_attention.cu |  8 ++++----
 src/runtime/request_manager.cc               | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index f9a4a168a..c89f5b2bd 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -706,7 +706,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                       cudaStream_t stream) {
   // phase 1: Implement kernel to compute KQV for input tokens
 
-  long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
+  // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
   compute_qkv_kernel(m,
                      bc,
                      shard_id,
@@ -753,9 +753,9 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 
   compute_o_prod_bias(
       m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream);
-  time_2 = Realm::Clock::current_time_in_microseconds();
-  std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
-            << (time_2 - time_1) << "us" << std::endl;
+  // time_2 = Realm::Clock::current_time_in_microseconds();
+  // std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
+  //           << (time_2 - time_1) << "us" << std::endl;
 }
 
 } // namespace SpecIncMultiHeadSelfAttention
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d6117ad69..7d9a6982e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1789,17 +1789,17 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   request_manager_status = PREFILLING;
   prefill_model = SSM;
 
-  long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
+  // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
 
   while (!is_background_server_terminated()) {
     // last_irf.get_void_result();
     BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
     bcf.get_void_result();
-    time_2 = Realm::Clock::current_time_in_microseconds();
-    std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
-              << std::endl;
+    // time_2 = Realm::Clock::current_time_in_microseconds();
+    // std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
+    //           << std::endl;
 
-    time_1 = Realm::Clock::current_time_in_microseconds();
+    // time_1 = Realm::Clock::current_time_in_microseconds();
     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
         request_manager_status == LLM_VERIFY) {
       //   std::cout << "Branch 1" << std::endl;

From d01c9437b4a950c6d53aaf45d94e21d786be294f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 21 May 2024 06:41:09 -0700
Subject: [PATCH 265/667] feat: add CUDA Graph optimization

---
 .gitignore                          |    2 +
 include/flexflow/ops/fused.h        |   28 +-
 include/flexflow/ops/graph_params.h |   45 ++
 src/ops/fused.cu                    | 1079 ++++++++++++++-------------
 4 files changed, 635 insertions(+), 519 deletions(-)
 create mode 100644 include/flexflow/ops/graph_params.h

diff --git a/.gitignore b/.gitignore
index 7f6a3c413..34ecb8e0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ python/flexflow/core/flexflow_cffi_header.py
 *.pb.h
 *.o
 *.a
+*.nsys-rep
+*.nfs*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h
index a8326e9ab..e0481302e 100644
--- a/include/flexflow/ops/fused.h
+++ b/include/flexflow/ops/fused.h
@@ -1,17 +1,43 @@
 #ifndef _FLEXFLOW_FUSED_H_
 #define _FLEXFLOW_FUSED_H_
 
+#include "flexflow/batch_config.h"
 #include "flexflow/model.h"
+#include "graph_params.h"
 
 namespace FlexFlow {
 
+// declare Legion names
+using Legion::Context;
+using Legion::coord_t;
+using Legion::Domain;
+using Legion::Future;
+using Legion::LogicalPartition;
+using Legion::LogicalRegion;
+using Legion::Memory;
+using Legion::PhysicalRegion;
+using Legion::Runtime;
+using Legion::Task;
+
 class FusedOp;
 class FusedOpMeta {
 public:
-  FusedOpMeta(void) {}
+  FusedOpMeta(void) {
+    graphCaptured = false;
+    graph_collections.reserve(BatchConfig::MAX_NUM_REQUESTS *
+                              BatchConfig::MAX_NUM_TOKENS * 2);
+  }
   OpMeta *meta[MAX_NUM_FUSED_OPERATORS];
   FusedOp *fused_op;
   int numOperators;
+  bool graphCaptured=false;
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+  std::unordered_map<GraphParams, cudaGraphExec_t>
+      graph_collections;
+#else
+  std::unordered_map<GraphParams, hipGraphExec_t>
+      graph_collections;
+#endif
 };
 
 class FusedOp : public Op {
diff --git a/include/flexflow/ops/graph_params.h b/include/flexflow/ops/graph_params.h
new file mode 100644
index 000000000..55964dc30
--- /dev/null
+++ b/include/flexflow/ops/graph_params.h
@@ -0,0 +1,45 @@
+#ifndef _FLEXFLOW_GRAPH_PARAMS_H_
+#define _FLEXFLOW_GRAPH_PARAMS_H_
+
+#include <stdio.h>
+#include <string>
+
+namespace FlexFlow {
+  struct GraphParams {
+    int num_active_requests;
+    int num_active_tokens;
+    bool prompt_phase;
+
+    GraphParams(int num_active_requests, int num_active_tokens, bool prompt_phase)
+      : num_active_requests(num_active_requests), num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {}
+
+    void Print() const {
+      printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, prompt_phase: %d\n \n", num_active_requests, num_active_tokens, prompt_phase);
+    }
+  };
+
+}
+
+namespace std {
+  template <>
+  struct hash<FlexFlow::GraphParams> {
+    size_t operator()(const FlexFlow::GraphParams& gp) const {
+      return std::hash<int>()(gp.num_active_requests) ^
+             std::hash<int>()(gp.num_active_tokens) ^
+             std::hash<bool>()(gp.prompt_phase);
+    }
+  };
+}
+
+namespace std {
+  template <>
+  struct equal_to<FlexFlow::GraphParams> {
+    bool operator()(const FlexFlow::GraphParams& lhs, const FlexFlow::GraphParams& rhs) const {
+      return lhs.num_active_requests == rhs.num_active_requests &&
+             lhs.num_active_tokens == rhs.num_active_tokens && 
+             lhs.prompt_phase == rhs.prompt_phase;
+    }
+  };
+}
+
+#endif
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index ab41e5af1..ac0666c87 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "cuda.h"
 #include "flexflow/accessor.h"
 #include "flexflow/model.h"
 #include "flexflow/ops/add_bias_residual_layer_norm.h"
@@ -43,19 +44,9 @@
 #include "flexflow/ops/tree_inc_multihead_self_attention.h"
 #include "flexflow/parallel_ops/kernels/allreduce_kernels.h"
 #include "flexflow/utils/cuda_helper.h"
+#include "flexflow/ffconst_utils.h"
 
 namespace FlexFlow {
-// declare Legion names
-using Legion::Context;
-using Legion::coord_t;
-using Legion::Domain;
-using Legion::Future;
-using Legion::LogicalPartition;
-using Legion::LogicalRegion;
-using Legion::Memory;
-using Legion::PhysicalRegion;
-using Legion::Runtime;
-using Legion::Task;
 
 OpMeta *FusedOp::init_task(Task const *task,
                            std::vector<PhysicalRegion> const &regions,
@@ -608,530 +599,582 @@ __host__ void
     }
   }
 
-  int ioff = 0, woff = 0, ooff = 0;
-  for (int op = 0; op < fused->numOperators; op++) {
-    // Domain my_id[MAX_NUM_INPUTS];
-    // Domain my_wd[MAX_NUM_WEIGHTS];
-    // Domain my_od[MAX_NUM_OUTPUTS];
-    GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
-    GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
-    GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
-    for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-      int my_off = fused->op_input_idx[i + ioff];
-      if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-        // my_id[i] = input_domain[my_off];
-        assert(my_off < fused->numInputs);
-        my_input_accessor[i] = input_accessor[my_off];
-      } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-        // my_id[i] = output_domain[my_off];
-        assert(my_off < fused->numOutputs);
-        my_input_accessor[i] = output_accessor[my_off];
-      } else {
-        assert(false);
-      }
-    }
-    for (int i = 0; i < fused->op_num_weights[op]; i++) {
-      assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-      // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
-      // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
-      assert(fused->op_weight_idx[i + woff] < fused->numWeights);
-      my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
-    }
-    for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-      int my_off = fused->op_output_idx[i + ooff];
-      assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
-      assert(my_off < fused->numOutputs);
-      // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
-      // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
-      my_output_accessor[i] = output_accessor[my_off];
-    }
-    switch (fused->op_op_type[op]) {
-      case OP_CONCAT: {
-        assert(fused->op_num_weights[op] == 0);
-        assert(fused->op_num_outputs[op] == 1);
-        ConcatMeta *m = (ConcatMeta *)metas->meta[op];
-        int num_inputs = fused->op_num_inputs[op];
-        Kernels::Concat::forward_kernel_wrapper(m,
-                                                my_output_accessor[0],
-                                                my_input_accessor,
-                                                num_inputs,
-                                                m->legion_axis);
-        break;
-      }
-      case OP_BATCHNORM: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        assert(my_input_accessor[0].domain.get_dim() == 5);
-        assert(my_output_accessor[0].domain.get_dim() == 5);
-        assert(my_weight_accessor[0].domain.get_dim() == 2);
-        assert(my_weight_accessor[1].domain.get_dim() == 2);
-        BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
-        BatchNorm::forward_kernel(m,
-                                  my_input_accessor[0].get_float_ptr(),
-                                  my_output_accessor[0].get_float_ptr(),
-                                  my_weight_accessor[0].get_float_ptr(),
-                                  my_weight_accessor[1].get_float_ptr());
-        break;
-      }
-      case OP_LINEAR: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        Domain kernel_domain = my_weight_accessor[0].domain;
-        int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
-        int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
-        int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
-        assert(my_output_accessor[0].domain.get_volume() ==
-               out_dim * batch_size);
-        assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
-        void const *bias_ptr = nullptr;
-        LinearMeta *m = (LinearMeta *)metas->meta[op];
-        if (fused->op_num_weights[op] == 2) {
-          assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-          if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
-            bias_ptr = my_weight_accessor[1].ptr;
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+
+  // create new cuda graph
+  cudaGraph_t graph;
+  cudaGraphExec_t instance;
+  cudaGraphExecUpdateResult updateResult;
+
+  GraphParams graph_params = {bc->num_active_requests(),
+                      bc->num_active_tokens(),
+                      bc->prompt_phase};
+  //graph_params.Print();
+  int shard_id = task->index_point.point_data[0];
+
+  {            
+      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+      int ioff = 0, woff = 0, ooff = 0;
+      for (int op = 0; op < fused->numOperators; op++) {
+        // Domain my_id[MAX_NUM_INPUTS];
+        // Domain my_wd[MAX_NUM_WEIGHTS];
+        // Domain my_od[MAX_NUM_OUTPUTS];
+        GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
+        GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
+        GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
+        for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+          int my_off = fused->op_input_idx[i + ioff];
+          if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+            // my_id[i] = input_domain[my_off];
+            assert(my_off < fused->numInputs);
+            my_input_accessor[i] = input_accessor[my_off];
+          } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+            // my_id[i] = output_domain[my_off];
+            assert(my_off < fused->numOutputs);
+            my_input_accessor[i] = output_accessor[my_off];
+          } else {
+            assert(false);
           }
-        } else {
-          assert(fused->op_num_weights[op] == 1);
         }
-        assert(m->input_type[0] == my_input_accessor[0].data_type);
-        assert(m->input_type[0] == my_output_accessor[0].data_type);
-        batch_size = bc->num_active_tokens();
-        Kernels::Linear::forward_kernel_wrapper(m,
-                                                my_input_accessor[0].ptr,
-                                                my_output_accessor[0].ptr,
-                                                my_weight_accessor[0].ptr,
-                                                bias_ptr,
-                                                in_dim,
-                                                out_dim,
-                                                batch_size);
-        break;
-      }
-      case OP_BATCHMATMUL: {
-        assert(fused->op_num_inputs[op] == 2);
-        assert(fused->op_num_weights[op] == 0);
-        assert(fused->op_num_outputs[op] == 1);
-        Domain out_domain = my_output_accessor[0].domain;
-        Domain a_domain = my_input_accessor[0].domain;
-        Domain b_domain = my_input_accessor[1].domain;
-        int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
-        assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
-        int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
-        assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
-        int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
-        assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
-        assert(a_domain.get_dim() == b_domain.get_dim());
-        assert(a_domain.get_dim() == out_domain.get_dim());
-        int batch = 1;
-        for (int i = 2; i < a_domain.get_dim(); i++) {
-          int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
-          assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
-          assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
-          batch *= dim_size;
+        for (int i = 0; i < fused->op_num_weights[op]; i++) {
+          assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+          // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
+          // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
+          assert(fused->op_weight_idx[i + woff] < fused->numWeights);
+          my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
         }
-        BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
-        Kernels::BatchMatmul::forward_kernel_wrapper(
-            meta,
-            my_output_accessor[0].get_float_ptr(),
-            my_input_accessor[0].get_float_ptr(),
-            my_input_accessor[1].get_float_ptr(),
-            (float const *)nullptr,
-            m,
-            n,
-            k,
-            batch,
-            meta->a_seq_length_dim,
-            meta->b_seq_length_dim,
-            fused->iter_config.seq_length);
-        break;
-      }
-      case OP_EW_ADD:
-      case OP_EW_SUB:
-      case OP_EW_MUL:
-      case OP_EW_DIV:
-      case OP_EW_MAX:
-      case OP_EW_MIN: {
-        assert(fused->op_num_inputs[op] == 2);
-        assert(fused->op_num_weights[op] == 0);
-        assert(fused->op_num_outputs[op] == 1);
-        assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
-        assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-        ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
-        Kernels::ElementBinary::forward_kernel_wrapper(m,
-                                                       my_input_accessor[0],
-                                                       my_input_accessor[1],
-                                                       my_output_accessor[0]);
-        break;
-      }
-      case OP_EMBEDDING: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_weights[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
-        if (m->aggr == AGGR_MODE_NONE) {
-          // assert(kernel_domain.get_dim() == 2);
-          assert(my_input_accessor[0].domain.get_dim() + 1 ==
-                 my_output_accessor[0].domain.get_dim());
-          for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
-            assert(my_input_accessor[0].domain.hi()[i] ==
-                   my_output_accessor[0].domain.hi()[i + 1]);
-            assert(my_input_accessor[0].domain.lo()[i] ==
-                   my_output_accessor[0].domain.lo()[i + 1]);
+        for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+          int my_off = fused->op_output_idx[i + ooff];
+          assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
+          assert(my_off < fused->numOutputs);
+          // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
+          // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
+          my_output_accessor[i] = output_accessor[my_off];
+        }
+        switch (fused->op_op_type[op]) {
+          case OP_CONCAT: {
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            ConcatMeta *m = (ConcatMeta *)metas->meta[op];
+            int num_inputs = fused->op_num_inputs[op];
+            Kernels::Concat::forward_kernel_wrapper(m,
+                                                    my_output_accessor[0],
+                                                    my_input_accessor,
+                                                    num_inputs,
+                                                    m->legion_axis);
+            break;
           }
-          assert(my_weight_accessor[0].domain.hi()[0] -
-                     my_weight_accessor[0].domain.lo()[0] ==
-                 my_output_accessor[0].domain.hi()[0] -
-                     my_output_accessor[0].domain.lo()[0]);
-        } else {
-          assert(my_input_accessor[0].domain.get_dim() ==
-                 my_output_accessor[0].domain.get_dim());
-          for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
-            assert(my_input_accessor[0].domain.hi()[i] ==
-                   my_output_accessor[0].domain.hi()[i]);
-            assert(my_input_accessor[0].domain.lo()[i] ==
-                   my_output_accessor[0].domain.lo()[i]);
+          case OP_BATCHNORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain.get_dim() == 5);
+            assert(my_output_accessor[0].domain.get_dim() == 5);
+            assert(my_weight_accessor[0].domain.get_dim() == 2);
+            assert(my_weight_accessor[1].domain.get_dim() == 2);
+            BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
+            BatchNorm::forward_kernel(m,
+                                      my_input_accessor[0].get_float_ptr(),
+                                      my_output_accessor[0].get_float_ptr(),
+                                      my_weight_accessor[0].get_float_ptr(),
+                                      my_weight_accessor[1].get_float_ptr());
+            break;
           }
-          assert(my_weight_accessor[0].domain.hi()[0] -
-                     my_weight_accessor[0].domain.lo()[0] ==
-                 my_output_accessor[0].domain.hi()[0] -
-                     my_output_accessor[0].domain.lo()[0]);
-        }
-        int in_dim, out_dim, effective_batch_size;
-        if (m->aggr == AGGR_MODE_NONE) {
-          in_dim = 1;
-          out_dim = my_output_accessor[0].domain.hi()[0] -
-                    my_output_accessor[0].domain.lo()[0] + 1;
-          effective_batch_size =
-              my_output_accessor[0].domain.get_volume() / out_dim;
-          assert(effective_batch_size * in_dim ==
-                 my_input_accessor[0].domain.get_volume());
-        } else {
-          assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
-          in_dim = my_input_accessor[0].domain.hi()[0] -
-                   my_input_accessor[0].domain.lo()[0] + 1;
-          out_dim = my_output_accessor[0].domain.hi()[0] -
-                    my_output_accessor[0].domain.lo()[0] + 1;
-          effective_batch_size =
-              my_output_accessor[0].domain.get_volume() / out_dim;
-          assert(effective_batch_size * in_dim ==
-                 my_input_accessor[0].domain.get_volume());
-        }
-
-        assert(my_input_accessor[0].data_type == DT_INT32 ||
-               my_input_accessor[0].data_type == DT_INT64);
-        Kernels::Embedding::forward_kernel_wrapper(m,
-                                                   my_input_accessor[0],
-                                                   my_output_accessor[0],
-                                                   my_weight_accessor[0],
-                                                   in_dim,
-                                                   out_dim,
-                                                   effective_batch_size);
-        break;
-      }
-      case OP_GELU:
-      case OP_RELU:
-      case OP_SIGMOID:
-      case OP_TANH:
-      case OP_ELU:
-      case OP_SCALAR_TRUE_DIV: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_weights[op] == 0);
-        assert(fused->op_num_outputs[op] == 1);
-        assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-        ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
-        if (m->data_type == DT_HALF) {
-          ElementUnary::forward_kernel_wrapper(
-              m,
-              my_input_accessor[0].get_half_ptr(),
-              my_output_accessor[0].get_half_ptr(),
-              my_input_accessor[0].domain.get_volume());
-        } else if (m->data_type == DT_FLOAT) {
-          ElementUnary::forward_kernel_wrapper(
-              m,
-              my_input_accessor[0].get_float_ptr(),
-              my_output_accessor[0].get_float_ptr(),
-              my_input_accessor[0].domain.get_volume());
-        } else {
-          assert(false && "Unsupported data type in ElementUnary forward");
-        }
-        break;
-      }
-      case OP_RMS_NORM: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_weights[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
-        Kernels::RMSNorm::forward_kernel_wrapper(m,
-                                                 my_input_accessor[0],
-                                                 my_weight_accessor[0],
-                                                 my_output_accessor[0]);
-        break;
-      }
-      case OP_RESIDUAL_RMS_NORM: {
-        assert(fused->op_num_inputs[op] == 2);
-        assert(fused->op_num_weights[op] == 1);
-        assert(fused->op_num_outputs[op] == 2);
-        ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
-        Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
-                                                         my_input_accessor[0],
-                                                         my_input_accessor[1],
-                                                         my_weight_accessor[0],
-                                                         my_output_accessor[0],
-                                                         my_output_accessor[1]);
-        break;
-      }
-      case OP_INC_MULTIHEAD_SELF_ATTENTION: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        IncMultiHeadSelfAttentionMeta const *m =
-            (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        assert(fused->op_num_weights[op] ==
-               (1 + (int)(*m->qkv_bias || *m->final_bias)));
-        GenericTensorAccessorR biases;
-        if (*m->qkv_bias || *m->final_bias) {
-          assert(fused->op_num_weights[op] == 2);
-          biases = my_weight_accessor[1];
-        }
-        IncMultiHeadSelfAttention::inference_kernel_wrapper(
-            m,
-            bc,
-            task->index_point.point_data[0],
-            my_input_accessor[0],
-            my_weight_accessor[0],
-            my_output_accessor[0],
-            biases);
-        break;
-      }
-      case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        TreeIncMultiHeadSelfAttentionMeta *m =
-            (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        // TreeVerifyBatchConfig const *verify_bc =
-        //     (TreeVerifyBatchConfig *)task->args;
-        BatchConfig const &verify_bc =
-            Future(task->futures[0]).get_result<BatchConfig>();
-        assert(fused->op_num_weights[op] ==
-               (1 + (int)(*m->qkv_bias || *m->final_bias)));
-        GenericTensorAccessorR biases;
-        if (*m->qkv_bias || *m->final_bias) {
-          assert(fused->op_num_weights[op] == 2);
-          biases = my_weight_accessor[1];
-        }
-        TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-            m,
-            &verify_bc,
-            task->index_point.point_data[0],
-            my_input_accessor[0],
-            my_weight_accessor[0],
-            my_output_accessor[0],
-            biases);
-        break;
-      }
-      case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        SpecIncMultiHeadSelfAttentionMeta const *m =
-            (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-        // TreeSearchBatchConfig const *search_bc =
-        //     (TreeSearchBatchConfig *)task->args;
-        BatchConfig const &search_bc =
-            Future(task->futures[0]).get_result<BatchConfig>();
-        assert(fused->op_num_weights[op] ==
-               (1 + (int)(*m->qkv_bias || *m->final_bias)));
-        GenericTensorAccessorR biases;
-        if (*m->qkv_bias || *m->final_bias) {
-          assert(fused->op_num_weights[op] == 2);
-          biases = my_weight_accessor[1];
-        }
-        SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-            m,
-            &search_bc,
-            task->index_point.point_data[0],
-            my_input_accessor[0],
-            my_weight_accessor[0],
-            my_output_accessor[0],
-            biases);
-        break;
-      }
-      case OP_LAYERNORM: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
-        if (m->elementwise_affine) {
-          assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
-        }
-        GenericTensorAccessorR gamma, beta;
-        if (m->elementwise_affine) {
-          gamma = my_weight_accessor[0];
-          if (m->use_bias) {
-            beta = my_weight_accessor[1];
+          case OP_LINEAR: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            Domain kernel_domain = my_weight_accessor[0].domain;
+            int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
+            int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
+            int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
+            assert(my_output_accessor[0].domain.get_volume() ==
+                  out_dim * batch_size);
+            assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
+            void const *bias_ptr = nullptr;
+            LinearMeta *m = (LinearMeta *)metas->meta[op];
+            if (fused->op_num_weights[op] == 2) {
+              assert(my_weight_accessor[1].domain.get_volume() == out_dim);
+              if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+                bias_ptr = my_weight_accessor[1].ptr;
+              }
+            } else {
+              assert(fused->op_num_weights[op] == 1);
+            }
+            assert(m->input_type[0] == my_input_accessor[0].data_type);
+            assert(m->input_type[0] == my_output_accessor[0].data_type);
+            batch_size = bc->num_active_tokens();
+            Kernels::Linear::forward_kernel_wrapper(m,
+                                                    my_input_accessor[0].ptr,
+                                                    my_output_accessor[0].ptr,
+                                                    my_weight_accessor[0].ptr,
+                                                    bias_ptr,
+                                                    in_dim,
+                                                    out_dim,
+                                                    batch_size);
+            break;
           }
-        }
-        LayerNorm::forward_kernel_wrapper(
-            m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
-        break;
-      }
-      case OP_RESIDUAL_LAYERNORM: {
-        assert(fused->op_num_outputs[op] == 2);
-        ResidualLayerNormMeta const *m =
-            (ResidualLayerNormMeta *)metas->meta[op];
-        if (m->use_two_residuals) {
-          assert(fused->op_num_inputs[op] == 3);
-        } else {
-          assert(fused->op_num_inputs[op] == 2);
-        }
-        if (!m->elementwise_affine) {
-          assert(fused->op_num_weights[op] == 0);
-        } else {
-          if (!m->use_bias) {
-            assert(fused->op_num_weights[op] == 1); // weight
-          } else {
-            assert(fused->op_num_weights[op] == 2); // weight + bias
+          case OP_BATCHMATMUL: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            Domain out_domain = my_output_accessor[0].domain;
+            Domain a_domain = my_input_accessor[0].domain;
+            Domain b_domain = my_input_accessor[1].domain;
+            int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
+            assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
+            int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
+            assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
+            int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
+            assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
+            assert(a_domain.get_dim() == b_domain.get_dim());
+            assert(a_domain.get_dim() == out_domain.get_dim());
+            int batch = 1;
+            for (int i = 2; i < a_domain.get_dim(); i++) {
+              int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
+              assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
+              assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
+              batch *= dim_size;
+            }
+            BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
+            Kernels::BatchMatmul::forward_kernel_wrapper(
+                meta,
+                my_output_accessor[0].get_float_ptr(),
+                my_input_accessor[0].get_float_ptr(),
+                my_input_accessor[1].get_float_ptr(),
+                (float const *)nullptr,
+                m,
+                n,
+                k,
+                batch,
+                meta->a_seq_length_dim,
+                meta->b_seq_length_dim,
+                fused->iter_config.seq_length);
+            break;
           }
-        }
-        GenericTensorAccessorR residual2;
-        if (m->use_two_residuals) {
-          residual2 = my_input_accessor[2];
-        }
-        GenericTensorAccessorR gamma, beta;
-        if (m->elementwise_affine) {
-          gamma = my_weight_accessor[0];
-          if (m->use_bias) {
-            beta = my_weight_accessor[1];
+          case OP_EW_ADD:
+          case OP_EW_SUB:
+          case OP_EW_MUL:
+          case OP_EW_DIV:
+          case OP_EW_MAX:
+          case OP_EW_MIN: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
+            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+            ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
+            Kernels::ElementBinary::forward_kernel_wrapper(m,
+                                                          my_input_accessor[0],
+                                                          my_input_accessor[1],
+                                                          my_output_accessor[0]);
+            break;
           }
-        }
-        ResidualLayerNorm::inference_kernel_wrapper(m,
+          case OP_EMBEDDING: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
+            if (m->aggr == AGGR_MODE_NONE) {
+              // assert(kernel_domain.get_dim() == 2);
+              assert(my_input_accessor[0].domain.get_dim() + 1 ==
+                    my_output_accessor[0].domain.get_dim());
+              for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
+                assert(my_input_accessor[0].domain.hi()[i] ==
+                      my_output_accessor[0].domain.hi()[i + 1]);
+                assert(my_input_accessor[0].domain.lo()[i] ==
+                      my_output_accessor[0].domain.lo()[i + 1]);
+              }
+              assert(my_weight_accessor[0].domain.hi()[0] -
+                        my_weight_accessor[0].domain.lo()[0] ==
+                    my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0]);
+            } else {
+              assert(my_input_accessor[0].domain.get_dim() ==
+                    my_output_accessor[0].domain.get_dim());
+              for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
+                assert(my_input_accessor[0].domain.hi()[i] ==
+                      my_output_accessor[0].domain.hi()[i]);
+                assert(my_input_accessor[0].domain.lo()[i] ==
+                      my_output_accessor[0].domain.lo()[i]);
+              }
+              assert(my_weight_accessor[0].domain.hi()[0] -
+                        my_weight_accessor[0].domain.lo()[0] ==
+                    my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0]);
+            }
+            int in_dim, out_dim, effective_batch_size;
+            if (m->aggr == AGGR_MODE_NONE) {
+              in_dim = 1;
+              out_dim = my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0] + 1;
+              effective_batch_size =
+                  my_output_accessor[0].domain.get_volume() / out_dim;
+              assert(effective_batch_size * in_dim ==
+                    my_input_accessor[0].domain.get_volume());
+            } else {
+              assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
+              in_dim = my_input_accessor[0].domain.hi()[0] -
+                      my_input_accessor[0].domain.lo()[0] + 1;
+              out_dim = my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0] + 1;
+              effective_batch_size =
+                  my_output_accessor[0].domain.get_volume() / out_dim;
+              assert(effective_batch_size * in_dim ==
+                    my_input_accessor[0].domain.get_volume());
+            }
+
+            assert(my_input_accessor[0].data_type == DT_INT32 ||
+                  my_input_accessor[0].data_type == DT_INT64);
+            Kernels::Embedding::forward_kernel_wrapper(m,
+                                                      my_input_accessor[0],
+                                                      my_output_accessor[0],
+                                                      my_weight_accessor[0],
+                                                      in_dim,
+                                                      out_dim,
+                                                      effective_batch_size);
+            break;
+          }
+          case OP_GELU:
+          case OP_RELU:
+          case OP_SIGMOID:
+          case OP_TANH:
+          case OP_ELU:
+          case OP_SCALAR_TRUE_DIV: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+            ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
+            if (m->data_type == DT_HALF) {
+              ElementUnary::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_half_ptr(),
+                  my_output_accessor[0].get_half_ptr(),
+                  my_input_accessor[0].domain.get_volume());
+            } else if (m->data_type == DT_FLOAT) {
+              ElementUnary::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_float_ptr(),
+                  my_output_accessor[0].get_float_ptr(),
+                  my_input_accessor[0].domain.get_volume());
+            } else {
+              assert(false && "Unsupported data type in ElementUnary forward");
+            }
+            break;
+          }
+          case OP_RMS_NORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
+            Kernels::RMSNorm::forward_kernel_wrapper(m,
                                                     my_input_accessor[0],
-                                                    my_input_accessor[1],
-                                                    residual2,
-                                                    my_output_accessor[0],
-                                                    my_output_accessor[1],
-                                                    gamma,
-                                                    beta);
-        break;
-      }
-      case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
-        assert(fused->op_num_inputs[op] == 2);
-        assert(fused->op_num_outputs[op] == 2);
-        AddBiasResidualLayerNormMeta const *m =
-            (AddBiasResidualLayerNormMeta *)metas->meta[op];
-        if (!m->elementwise_affine) {
-          assert(fused->op_num_weights[op] == 1); // attn bias
-        } else {
-          if (!m->use_bias) {
-            assert(fused->op_num_weights[op] == 2); // attn bias + weight
-          } else {
-            assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
+                                                    my_weight_accessor[0],
+                                                    my_output_accessor[0]);
+            break;
           }
-        }
-        GenericTensorAccessorR gamma, beta;
-        if (m->elementwise_affine) {
-          gamma = my_weight_accessor[1];
-          if (m->use_bias) {
-            beta = my_weight_accessor[2];
+          case OP_RESIDUAL_RMS_NORM: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 2);
+            ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
+            Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
+                                                            my_input_accessor[0],
+                                                            my_input_accessor[1],
+                                                            my_weight_accessor[0],
+                                                            my_output_accessor[0],
+                                                            my_output_accessor[1]);
+            break;
+          }
+          case OP_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            IncMultiHeadSelfAttentionMeta const *m =
+                (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            IncMultiHeadSelfAttention::inference_kernel_wrapper(
+                m,
+                bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
+          }
+          case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            TreeIncMultiHeadSelfAttentionMeta *m =
+                (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            // TreeVerifyBatchConfig const *verify_bc =
+            //     (TreeVerifyBatchConfig *)task->args;
+            BatchConfig const &verify_bc =
+                Future(task->futures[0]).get_result<BatchConfig>();
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
+                m,
+                &verify_bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
+          }
+          case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            SpecIncMultiHeadSelfAttentionMeta const *m =
+                (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            // TreeSearchBatchConfig const *search_bc =
+            //     (TreeSearchBatchConfig *)task->args;
+            BatchConfig const &search_bc =
+                Future(task->futures[0]).get_result<BatchConfig>();
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
+                m,
+                &search_bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
+          }
+          case OP_LAYERNORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
+            if (m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[0];
+              if (m->use_bias) {
+                beta = my_weight_accessor[1];
+              }
+            }
+            LayerNorm::forward_kernel_wrapper(
+                m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
+            break;
+          }
+          case OP_RESIDUAL_LAYERNORM: {
+            assert(fused->op_num_outputs[op] == 2);
+            ResidualLayerNormMeta const *m =
+                (ResidualLayerNormMeta *)metas->meta[op];
+            if (m->use_two_residuals) {
+              assert(fused->op_num_inputs[op] == 3);
+            } else {
+              assert(fused->op_num_inputs[op] == 2);
+            }
+            if (!m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 0);
+            } else {
+              if (!m->use_bias) {
+                assert(fused->op_num_weights[op] == 1); // weight
+              } else {
+                assert(fused->op_num_weights[op] == 2); // weight + bias
+              }
+            }
+            GenericTensorAccessorR residual2;
+            if (m->use_two_residuals) {
+              residual2 = my_input_accessor[2];
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[0];
+              if (m->use_bias) {
+                beta = my_weight_accessor[1];
+              }
+            }
+            ResidualLayerNorm::inference_kernel_wrapper(m,
+                                                        my_input_accessor[0],
+                                                        my_input_accessor[1],
+                                                        residual2,
+                                                        my_output_accessor[0],
+                                                        my_output_accessor[1],
+                                                        gamma,
+                                                        beta);
+            break;
+          }
+          case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_outputs[op] == 2);
+            AddBiasResidualLayerNormMeta const *m =
+                (AddBiasResidualLayerNormMeta *)metas->meta[op];
+            if (!m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 1); // attn bias
+            } else {
+              if (!m->use_bias) {
+                assert(fused->op_num_weights[op] == 2); // attn bias + weight
+              } else {
+                assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
+              }
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[1];
+              if (m->use_bias) {
+                beta = my_weight_accessor[2];
+              }
+            }
+            Domain attn_bias_domain = my_weight_accessor[0].domain;
+            Domain residual_domain = my_input_accessor[1].domain;
+            int attn_bias_dim =
+                attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
+            int residual_volume = residual_domain.get_volume();
+            AddBiasResidualLayerNorm::inference_kernel_wrapper(
+                m,
+                attn_bias_dim,
+                residual_volume,
+                my_input_accessor[0],
+                my_output_accessor[0],
+                my_output_accessor[1],
+                my_input_accessor[1],
+                my_weight_accessor[0],
+                gamma,
+                beta);
+            break;
+          }
+          case OP_SIGMOID_SILU_MULTI: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_outputs[op] == 1);
+            SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
+            SigmoidSiluMulti::inference_kernel_wrapper(m,
+                                                      my_input_accessor[0],
+                                                      my_input_accessor[1],
+                                                      my_output_accessor[0]);
+            break;
+          }
+          case OP_SOFTMAX: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain.get_volume() ==
+                  my_output_accessor[0].domain.get_volume());
+            SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
+            if (m->input_type == DT_HALF) {
+              Kernels::Softmax::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_half_ptr(),
+                  my_output_accessor[0].get_half_ptr());
+            } else if (m->input_type == DT_FLOAT) {
+              Kernels::Softmax::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_float_ptr(),
+                  my_output_accessor[0].get_float_ptr());
+            }
+            break;
+          }
+          case OP_ALLREDUCE: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
+            Kernels::AllReduce::inference_kernel_wrapper(
+                m, bc, my_input_accessor[0], my_output_accessor[0]);
+            break;
+          }
+          default: {
+            fprintf(stderr,
+                    "Fusion currently does not support type = %d\n",
+                    fused->op_op_type[op]);
+            assert(false && "Fusion currently does not support type");
           }
         }
-        Domain attn_bias_domain = my_weight_accessor[0].domain;
-        Domain residual_domain = my_input_accessor[1].domain;
-        int attn_bias_dim =
-            attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
-        int residual_volume = residual_domain.get_volume();
-        AddBiasResidualLayerNorm::inference_kernel_wrapper(
-            m,
-            attn_bias_dim,
-            residual_volume,
-            my_input_accessor[0],
-            my_output_accessor[0],
-            my_output_accessor[1],
-            my_input_accessor[1],
-            my_weight_accessor[0],
-            gamma,
-            beta);
-        break;
-      }
-      case OP_SIGMOID_SILU_MULTI: {
-        assert(fused->op_num_inputs[op] == 2);
-        assert(fused->op_num_outputs[op] == 1);
-        SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
-        SigmoidSiluMulti::inference_kernel_wrapper(m,
-                                                   my_input_accessor[0],
-                                                   my_input_accessor[1],
-                                                   my_output_accessor[0]);
-        break;
-      }
-      case OP_SOFTMAX: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_weights[op] == 0);
-        assert(fused->op_num_outputs[op] == 1);
-        assert(my_input_accessor[0].domain.get_volume() ==
-               my_output_accessor[0].domain.get_volume());
-        SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
-        if (m->input_type == DT_HALF) {
-          Kernels::Softmax::forward_kernel_wrapper(
-              m,
-              my_input_accessor[0].get_half_ptr(),
-              my_output_accessor[0].get_half_ptr());
-        } else if (m->input_type == DT_FLOAT) {
-          Kernels::Softmax::forward_kernel_wrapper(
-              m,
-              my_input_accessor[0].get_float_ptr(),
-              my_output_accessor[0].get_float_ptr());
+        if (metas->meta[op]->inference_debugging) {
+          std::vector<GenericTensorAccessorR> input_accessors_to_save;
+          std::vector<GenericTensorAccessorR> weight_accessors_to_save;
+          std::vector<GenericTensorAccessorR> output_accessors_to_save;
+          for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+            int my_off = fused->op_input_idx[i + ioff];
+            if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+              input_accessors_to_save.push_back(input_accessor[my_off]);
+            } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+              input_accessors_to_save.push_back(output_accessor[my_off]);
+            } else {
+              assert(false);
+            }
+          }
+          for (int i = 0; i < fused->op_num_weights[op]; i++) {
+            assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+            weight_accessors_to_save.push_back(
+                weight_accessor[fused->op_weight_idx[i + woff]]);
+          }
+          for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+            output_accessors_to_save.push_back(output_accessor[i + ooff]);
+          }
+          assert(task->index_point.get_dim() == 1);
+          int shard_id = task->index_point.point_data[0];
+          FusedOp::save_inference_tensors_to_file(metas->meta[op],
+                                                  shard_id,
+                                                  bc,
+                                                  input_accessors_to_save,
+                                                  weight_accessors_to_save,
+                                                  output_accessors_to_save);
         }
-        break;
-      }
-      case OP_ALLREDUCE: {
-        assert(fused->op_num_inputs[op] == 1);
-        assert(fused->op_num_outputs[op] == 1);
-        AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
-        Kernels::AllReduce::inference_kernel_wrapper(
-            m, bc, my_input_accessor[0], my_output_accessor[0]);
-        break;
-      }
-      default: {
-        fprintf(stderr,
-                "Fusion currently does not support type = %d\n",
-                fused->op_op_type[op]);
-        assert(false && "Fusion currently does not support type");
+        ioff += fused->op_num_inputs[op];
+        woff += fused->op_num_weights[op];
+        ooff += fused->op_num_outputs[op];
       }
+      // for (int i = 0; i < fused->numOutputs; i++)
+      //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
+      //   "[Fused:forward:output]");
+      cudaStreamEndCapture(stream, &graph);
     }
-    if (metas->meta[op]->inference_debugging) {
-      std::vector<GenericTensorAccessorR> input_accessors_to_save;
-      std::vector<GenericTensorAccessorR> weight_accessors_to_save;
-      std::vector<GenericTensorAccessorR> output_accessors_to_save;
-      for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-        int my_off = fused->op_input_idx[i + ioff];
-        if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-          input_accessors_to_save.push_back(input_accessor[my_off]);
-        } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-          input_accessors_to_save.push_back(output_accessor[my_off]);
-        } else {
-          assert(false);
-        }
-      }
-      for (int i = 0; i < fused->op_num_weights[op]; i++) {
-        assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-        weight_accessors_to_save.push_back(
-            weight_accessor[fused->op_weight_idx[i + woff]]);
-      }
-      for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-        output_accessors_to_save.push_back(output_accessor[i + ooff]);
-      }
-      assert(task->index_point.get_dim() == 1);
-      int shard_id = task->index_point.point_data[0];
-      FusedOp::save_inference_tensors_to_file(metas->meta[op],
-                                              shard_id,
-                                              bc,
-                                              input_accessors_to_save,
-                                              weight_accessors_to_save,
-                                              output_accessors_to_save);
+
+  bool captured = false;
+
+  if(metas->graph_collections.count(graph_params)  != 0) {
+    captured = true;
+    instance = metas->graph_collections[graph_params];
+    if (cudaGraphExecUpdate(instance, graph, NULL, &updateResult) != cudaSuccess) {
+      cudaGraphExecDestroy(instance);
+      captured = false;
+    } else {
+      // if(shard_id == 0) {
+      //   printf("---------start to reuse the graph-------\n");
+      //   graph_params.Print();
+      //   // bc->print();
+      //   printf("---------end to reuse the graph-------\n");
+      // }
     }
-    ioff += fused->op_num_inputs[op];
-    woff += fused->op_num_weights[op];
-    ooff += fused->op_num_outputs[op];
+  } 
+  
+  if (!captured) {
+    cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+    metas->graph_collections[graph_params] = instance;
+    // if(shard_id == 0) {
+    //   printf("*************start cudaGraphInstantiate**********\n");
+    //   graph_params.Print();
+    //   // bc->print();
+    //   printf("*************end cudaGraphInstantiate**********\n");
+    // }
   }
-  // for (int i = 0; i < fused->numOutputs; i++)
-  //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
-  //   "[Fused:forward:output]");
+
+  assert(metas->graph_collections.find(graph_params) !=
+        metas->graph_collections.end());
+  cudaGraphDestroy(graph);
+  cudaGraphLaunch(instance, stream);
 }
 
 /*

From cc8ea38cf261a76c90439e7764308386d850a2e2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 21 May 2024 07:54:33 -0700
Subject: [PATCH 266/667] style: minor

---
 src/ops/fused.cu | 980 +++++++++++++++++++++++------------------------
 1 file changed, 490 insertions(+), 490 deletions(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index ac0666c87..d83244baf 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -611,536 +611,536 @@ __host__ void
                       bc->num_active_tokens(),
                       bc->prompt_phase};
   //graph_params.Print();
-  int shard_id = task->index_point.point_data[0];
+  // int shard_id = task->index_point.point_data[0];
 
   {            
-      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
-      int ioff = 0, woff = 0, ooff = 0;
-      for (int op = 0; op < fused->numOperators; op++) {
-        // Domain my_id[MAX_NUM_INPUTS];
-        // Domain my_wd[MAX_NUM_WEIGHTS];
-        // Domain my_od[MAX_NUM_OUTPUTS];
-        GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
-        GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
-        GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
-        for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-          int my_off = fused->op_input_idx[i + ioff];
-          if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-            // my_id[i] = input_domain[my_off];
-            assert(my_off < fused->numInputs);
-            my_input_accessor[i] = input_accessor[my_off];
-          } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-            // my_id[i] = output_domain[my_off];
-            assert(my_off < fused->numOutputs);
-            my_input_accessor[i] = output_accessor[my_off];
-          } else {
-            assert(false);
-          }
+    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+    int ioff = 0, woff = 0, ooff = 0;
+    for (int op = 0; op < fused->numOperators; op++) {
+      // Domain my_id[MAX_NUM_INPUTS];
+      // Domain my_wd[MAX_NUM_WEIGHTS];
+      // Domain my_od[MAX_NUM_OUTPUTS];
+      GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
+      GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
+      GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
+      for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+        int my_off = fused->op_input_idx[i + ioff];
+        if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+          // my_id[i] = input_domain[my_off];
+          assert(my_off < fused->numInputs);
+          my_input_accessor[i] = input_accessor[my_off];
+        } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+          // my_id[i] = output_domain[my_off];
+          assert(my_off < fused->numOutputs);
+          my_input_accessor[i] = output_accessor[my_off];
+        } else {
+          assert(false);
         }
-        for (int i = 0; i < fused->op_num_weights[op]; i++) {
-          assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-          // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
-          // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
-          assert(fused->op_weight_idx[i + woff] < fused->numWeights);
-          my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
+      }
+      for (int i = 0; i < fused->op_num_weights[op]; i++) {
+        assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+        // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
+        // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
+        assert(fused->op_weight_idx[i + woff] < fused->numWeights);
+        my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
+      }
+      for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+        int my_off = fused->op_output_idx[i + ooff];
+        assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
+        assert(my_off < fused->numOutputs);
+        // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
+        // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
+        my_output_accessor[i] = output_accessor[my_off];
+      }
+      switch (fused->op_op_type[op]) {
+        case OP_CONCAT: {
+          assert(fused->op_num_weights[op] == 0);
+          assert(fused->op_num_outputs[op] == 1);
+          ConcatMeta *m = (ConcatMeta *)metas->meta[op];
+          int num_inputs = fused->op_num_inputs[op];
+          Kernels::Concat::forward_kernel_wrapper(m,
+                                                  my_output_accessor[0],
+                                                  my_input_accessor,
+                                                  num_inputs,
+                                                  m->legion_axis);
+          break;
         }
-        for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-          int my_off = fused->op_output_idx[i + ooff];
-          assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
-          assert(my_off < fused->numOutputs);
-          // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
-          // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
-          my_output_accessor[i] = output_accessor[my_off];
+        case OP_BATCHNORM: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          assert(my_input_accessor[0].domain.get_dim() == 5);
+          assert(my_output_accessor[0].domain.get_dim() == 5);
+          assert(my_weight_accessor[0].domain.get_dim() == 2);
+          assert(my_weight_accessor[1].domain.get_dim() == 2);
+          BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
+          BatchNorm::forward_kernel(m,
+                                    my_input_accessor[0].get_float_ptr(),
+                                    my_output_accessor[0].get_float_ptr(),
+                                    my_weight_accessor[0].get_float_ptr(),
+                                    my_weight_accessor[1].get_float_ptr());
+          break;
         }
-        switch (fused->op_op_type[op]) {
-          case OP_CONCAT: {
-            assert(fused->op_num_weights[op] == 0);
-            assert(fused->op_num_outputs[op] == 1);
-            ConcatMeta *m = (ConcatMeta *)metas->meta[op];
-            int num_inputs = fused->op_num_inputs[op];
-            Kernels::Concat::forward_kernel_wrapper(m,
-                                                    my_output_accessor[0],
-                                                    my_input_accessor,
-                                                    num_inputs,
-                                                    m->legion_axis);
-            break;
-          }
-          case OP_BATCHNORM: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            assert(my_input_accessor[0].domain.get_dim() == 5);
-            assert(my_output_accessor[0].domain.get_dim() == 5);
-            assert(my_weight_accessor[0].domain.get_dim() == 2);
-            assert(my_weight_accessor[1].domain.get_dim() == 2);
-            BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
-            BatchNorm::forward_kernel(m,
-                                      my_input_accessor[0].get_float_ptr(),
-                                      my_output_accessor[0].get_float_ptr(),
-                                      my_weight_accessor[0].get_float_ptr(),
-                                      my_weight_accessor[1].get_float_ptr());
-            break;
-          }
-          case OP_LINEAR: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            Domain kernel_domain = my_weight_accessor[0].domain;
-            int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
-            int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
-            int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
-            assert(my_output_accessor[0].domain.get_volume() ==
-                  out_dim * batch_size);
-            assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
-            void const *bias_ptr = nullptr;
-            LinearMeta *m = (LinearMeta *)metas->meta[op];
-            if (fused->op_num_weights[op] == 2) {
-              assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-              if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
-                bias_ptr = my_weight_accessor[1].ptr;
-              }
-            } else {
-              assert(fused->op_num_weights[op] == 1);
-            }
-            assert(m->input_type[0] == my_input_accessor[0].data_type);
-            assert(m->input_type[0] == my_output_accessor[0].data_type);
-            batch_size = bc->num_active_tokens();
-            Kernels::Linear::forward_kernel_wrapper(m,
-                                                    my_input_accessor[0].ptr,
-                                                    my_output_accessor[0].ptr,
-                                                    my_weight_accessor[0].ptr,
-                                                    bias_ptr,
-                                                    in_dim,
-                                                    out_dim,
-                                                    batch_size);
-            break;
-          }
-          case OP_BATCHMATMUL: {
-            assert(fused->op_num_inputs[op] == 2);
-            assert(fused->op_num_weights[op] == 0);
-            assert(fused->op_num_outputs[op] == 1);
-            Domain out_domain = my_output_accessor[0].domain;
-            Domain a_domain = my_input_accessor[0].domain;
-            Domain b_domain = my_input_accessor[1].domain;
-            int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
-            assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
-            int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
-            assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
-            int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
-            assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
-            assert(a_domain.get_dim() == b_domain.get_dim());
-            assert(a_domain.get_dim() == out_domain.get_dim());
-            int batch = 1;
-            for (int i = 2; i < a_domain.get_dim(); i++) {
-              int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
-              assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
-              assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
-              batch *= dim_size;
+        case OP_LINEAR: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          Domain kernel_domain = my_weight_accessor[0].domain;
+          int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
+          int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
+          int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
+          assert(my_output_accessor[0].domain.get_volume() ==
+                out_dim * batch_size);
+          assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
+          void const *bias_ptr = nullptr;
+          LinearMeta *m = (LinearMeta *)metas->meta[op];
+          if (fused->op_num_weights[op] == 2) {
+            assert(my_weight_accessor[1].domain.get_volume() == out_dim);
+            if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+              bias_ptr = my_weight_accessor[1].ptr;
             }
-            BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
-            Kernels::BatchMatmul::forward_kernel_wrapper(
-                meta,
-                my_output_accessor[0].get_float_ptr(),
-                my_input_accessor[0].get_float_ptr(),
-                my_input_accessor[1].get_float_ptr(),
-                (float const *)nullptr,
-                m,
-                n,
-                k,
-                batch,
-                meta->a_seq_length_dim,
-                meta->b_seq_length_dim,
-                fused->iter_config.seq_length);
-            break;
+          } else {
+            assert(fused->op_num_weights[op] == 1);
           }
-          case OP_EW_ADD:
-          case OP_EW_SUB:
-          case OP_EW_MUL:
-          case OP_EW_DIV:
-          case OP_EW_MAX:
-          case OP_EW_MIN: {
-            assert(fused->op_num_inputs[op] == 2);
-            assert(fused->op_num_weights[op] == 0);
-            assert(fused->op_num_outputs[op] == 1);
-            assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
-            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-            ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
-            Kernels::ElementBinary::forward_kernel_wrapper(m,
-                                                          my_input_accessor[0],
-                                                          my_input_accessor[1],
-                                                          my_output_accessor[0]);
-            break;
+          assert(m->input_type[0] == my_input_accessor[0].data_type);
+          assert(m->input_type[0] == my_output_accessor[0].data_type);
+          batch_size = bc->num_active_tokens();
+          Kernels::Linear::forward_kernel_wrapper(m,
+                                                  my_input_accessor[0].ptr,
+                                                  my_output_accessor[0].ptr,
+                                                  my_weight_accessor[0].ptr,
+                                                  bias_ptr,
+                                                  in_dim,
+                                                  out_dim,
+                                                  batch_size);
+          break;
+        }
+        case OP_BATCHMATMUL: {
+          assert(fused->op_num_inputs[op] == 2);
+          assert(fused->op_num_weights[op] == 0);
+          assert(fused->op_num_outputs[op] == 1);
+          Domain out_domain = my_output_accessor[0].domain;
+          Domain a_domain = my_input_accessor[0].domain;
+          Domain b_domain = my_input_accessor[1].domain;
+          int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
+          assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
+          int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
+          assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
+          int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
+          assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
+          assert(a_domain.get_dim() == b_domain.get_dim());
+          assert(a_domain.get_dim() == out_domain.get_dim());
+          int batch = 1;
+          for (int i = 2; i < a_domain.get_dim(); i++) {
+            int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
+            assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
+            assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
+            batch *= dim_size;
           }
-          case OP_EMBEDDING: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_weights[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
-            if (m->aggr == AGGR_MODE_NONE) {
-              // assert(kernel_domain.get_dim() == 2);
-              assert(my_input_accessor[0].domain.get_dim() + 1 ==
-                    my_output_accessor[0].domain.get_dim());
-              for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
-                assert(my_input_accessor[0].domain.hi()[i] ==
-                      my_output_accessor[0].domain.hi()[i + 1]);
-                assert(my_input_accessor[0].domain.lo()[i] ==
-                      my_output_accessor[0].domain.lo()[i + 1]);
-              }
-              assert(my_weight_accessor[0].domain.hi()[0] -
-                        my_weight_accessor[0].domain.lo()[0] ==
-                    my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0]);
-            } else {
-              assert(my_input_accessor[0].domain.get_dim() ==
-                    my_output_accessor[0].domain.get_dim());
-              for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
-                assert(my_input_accessor[0].domain.hi()[i] ==
-                      my_output_accessor[0].domain.hi()[i]);
-                assert(my_input_accessor[0].domain.lo()[i] ==
-                      my_output_accessor[0].domain.lo()[i]);
-              }
-              assert(my_weight_accessor[0].domain.hi()[0] -
-                        my_weight_accessor[0].domain.lo()[0] ==
-                    my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0]);
+          BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
+          Kernels::BatchMatmul::forward_kernel_wrapper(
+              meta,
+              my_output_accessor[0].get_float_ptr(),
+              my_input_accessor[0].get_float_ptr(),
+              my_input_accessor[1].get_float_ptr(),
+              (float const *)nullptr,
+              m,
+              n,
+              k,
+              batch,
+              meta->a_seq_length_dim,
+              meta->b_seq_length_dim,
+              fused->iter_config.seq_length);
+          break;
+        }
+        case OP_EW_ADD:
+        case OP_EW_SUB:
+        case OP_EW_MUL:
+        case OP_EW_DIV:
+        case OP_EW_MAX:
+        case OP_EW_MIN: {
+          assert(fused->op_num_inputs[op] == 2);
+          assert(fused->op_num_weights[op] == 0);
+          assert(fused->op_num_outputs[op] == 1);
+          assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
+          assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+          ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
+          Kernels::ElementBinary::forward_kernel_wrapper(m,
+                                                        my_input_accessor[0],
+                                                        my_input_accessor[1],
+                                                        my_output_accessor[0]);
+          break;
+        }
+        case OP_EMBEDDING: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_weights[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
+          if (m->aggr == AGGR_MODE_NONE) {
+            // assert(kernel_domain.get_dim() == 2);
+            assert(my_input_accessor[0].domain.get_dim() + 1 ==
+                  my_output_accessor[0].domain.get_dim());
+            for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
+              assert(my_input_accessor[0].domain.hi()[i] ==
+                    my_output_accessor[0].domain.hi()[i + 1]);
+              assert(my_input_accessor[0].domain.lo()[i] ==
+                    my_output_accessor[0].domain.lo()[i + 1]);
             }
-            int in_dim, out_dim, effective_batch_size;
-            if (m->aggr == AGGR_MODE_NONE) {
-              in_dim = 1;
-              out_dim = my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0] + 1;
-              effective_batch_size =
-                  my_output_accessor[0].domain.get_volume() / out_dim;
-              assert(effective_batch_size * in_dim ==
-                    my_input_accessor[0].domain.get_volume());
-            } else {
-              assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
-              in_dim = my_input_accessor[0].domain.hi()[0] -
-                      my_input_accessor[0].domain.lo()[0] + 1;
-              out_dim = my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0] + 1;
-              effective_batch_size =
-                  my_output_accessor[0].domain.get_volume() / out_dim;
-              assert(effective_batch_size * in_dim ==
-                    my_input_accessor[0].domain.get_volume());
+            assert(my_weight_accessor[0].domain.hi()[0] -
+                      my_weight_accessor[0].domain.lo()[0] ==
+                  my_output_accessor[0].domain.hi()[0] -
+                      my_output_accessor[0].domain.lo()[0]);
+          } else {
+            assert(my_input_accessor[0].domain.get_dim() ==
+                  my_output_accessor[0].domain.get_dim());
+            for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
+              assert(my_input_accessor[0].domain.hi()[i] ==
+                    my_output_accessor[0].domain.hi()[i]);
+              assert(my_input_accessor[0].domain.lo()[i] ==
+                    my_output_accessor[0].domain.lo()[i]);
             }
-
-            assert(my_input_accessor[0].data_type == DT_INT32 ||
-                  my_input_accessor[0].data_type == DT_INT64);
-            Kernels::Embedding::forward_kernel_wrapper(m,
-                                                      my_input_accessor[0],
-                                                      my_output_accessor[0],
-                                                      my_weight_accessor[0],
-                                                      in_dim,
-                                                      out_dim,
-                                                      effective_batch_size);
-            break;
+            assert(my_weight_accessor[0].domain.hi()[0] -
+                      my_weight_accessor[0].domain.lo()[0] ==
+                  my_output_accessor[0].domain.hi()[0] -
+                      my_output_accessor[0].domain.lo()[0]);
           }
-          case OP_GELU:
-          case OP_RELU:
-          case OP_SIGMOID:
-          case OP_TANH:
-          case OP_ELU:
-          case OP_SCALAR_TRUE_DIV: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_weights[op] == 0);
-            assert(fused->op_num_outputs[op] == 1);
-            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-            ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
-            if (m->data_type == DT_HALF) {
-              ElementUnary::forward_kernel_wrapper(
-                  m,
-                  my_input_accessor[0].get_half_ptr(),
-                  my_output_accessor[0].get_half_ptr(),
+          int in_dim, out_dim, effective_batch_size;
+          if (m->aggr == AGGR_MODE_NONE) {
+            in_dim = 1;
+            out_dim = my_output_accessor[0].domain.hi()[0] -
+                      my_output_accessor[0].domain.lo()[0] + 1;
+            effective_batch_size =
+                my_output_accessor[0].domain.get_volume() / out_dim;
+            assert(effective_batch_size * in_dim ==
                   my_input_accessor[0].domain.get_volume());
-            } else if (m->data_type == DT_FLOAT) {
-              ElementUnary::forward_kernel_wrapper(
-                  m,
-                  my_input_accessor[0].get_float_ptr(),
-                  my_output_accessor[0].get_float_ptr(),
+          } else {
+            assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
+            in_dim = my_input_accessor[0].domain.hi()[0] -
+                    my_input_accessor[0].domain.lo()[0] + 1;
+            out_dim = my_output_accessor[0].domain.hi()[0] -
+                      my_output_accessor[0].domain.lo()[0] + 1;
+            effective_batch_size =
+                my_output_accessor[0].domain.get_volume() / out_dim;
+            assert(effective_batch_size * in_dim ==
                   my_input_accessor[0].domain.get_volume());
-            } else {
-              assert(false && "Unsupported data type in ElementUnary forward");
-            }
-            break;
           }
-          case OP_RMS_NORM: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_weights[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
-            Kernels::RMSNorm::forward_kernel_wrapper(m,
+
+          assert(my_input_accessor[0].data_type == DT_INT32 ||
+                my_input_accessor[0].data_type == DT_INT64);
+          Kernels::Embedding::forward_kernel_wrapper(m,
                                                     my_input_accessor[0],
+                                                    my_output_accessor[0],
                                                     my_weight_accessor[0],
-                                                    my_output_accessor[0]);
-            break;
-          }
-          case OP_RESIDUAL_RMS_NORM: {
-            assert(fused->op_num_inputs[op] == 2);
-            assert(fused->op_num_weights[op] == 1);
-            assert(fused->op_num_outputs[op] == 2);
-            ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
-            Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
-                                                            my_input_accessor[0],
-                                                            my_input_accessor[1],
-                                                            my_weight_accessor[0],
-                                                            my_output_accessor[0],
-                                                            my_output_accessor[1]);
-            break;
-          }
-          case OP_INC_MULTIHEAD_SELF_ATTENTION: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            IncMultiHeadSelfAttentionMeta const *m =
-                (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
-            assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
-            GenericTensorAccessorR biases;
-            if (*m->qkv_bias || *m->final_bias) {
-              assert(fused->op_num_weights[op] == 2);
-              biases = my_weight_accessor[1];
-            }
-            IncMultiHeadSelfAttention::inference_kernel_wrapper(
+                                                    in_dim,
+                                                    out_dim,
+                                                    effective_batch_size);
+          break;
+        }
+        case OP_GELU:
+        case OP_RELU:
+        case OP_SIGMOID:
+        case OP_TANH:
+        case OP_ELU:
+        case OP_SCALAR_TRUE_DIV: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_weights[op] == 0);
+          assert(fused->op_num_outputs[op] == 1);
+          assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+          ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
+          if (m->data_type == DT_HALF) {
+            ElementUnary::forward_kernel_wrapper(
                 m,
-                bc,
-                task->index_point.point_data[0],
-                my_input_accessor[0],
-                my_weight_accessor[0],
-                my_output_accessor[0],
-                biases);
-            break;
-          }
-          case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            TreeIncMultiHeadSelfAttentionMeta *m =
-                (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-            // TreeVerifyBatchConfig const *verify_bc =
-            //     (TreeVerifyBatchConfig *)task->args;
-            BatchConfig const &verify_bc =
-                Future(task->futures[0]).get_result<BatchConfig>();
-            assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
-            GenericTensorAccessorR biases;
-            if (*m->qkv_bias || *m->final_bias) {
-              assert(fused->op_num_weights[op] == 2);
-              biases = my_weight_accessor[1];
-            }
-            TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
+                my_input_accessor[0].get_half_ptr(),
+                my_output_accessor[0].get_half_ptr(),
+                my_input_accessor[0].domain.get_volume());
+          } else if (m->data_type == DT_FLOAT) {
+            ElementUnary::forward_kernel_wrapper(
                 m,
-                &verify_bc,
-                task->index_point.point_data[0],
-                my_input_accessor[0],
-                my_weight_accessor[0],
-                my_output_accessor[0],
-                biases);
-            break;
+                my_input_accessor[0].get_float_ptr(),
+                my_output_accessor[0].get_float_ptr(),
+                my_input_accessor[0].domain.get_volume());
+          } else {
+            assert(false && "Unsupported data type in ElementUnary forward");
           }
-          case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            SpecIncMultiHeadSelfAttentionMeta const *m =
-                (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-            // TreeSearchBatchConfig const *search_bc =
-            //     (TreeSearchBatchConfig *)task->args;
-            BatchConfig const &search_bc =
-                Future(task->futures[0]).get_result<BatchConfig>();
-            assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
-            GenericTensorAccessorR biases;
-            if (*m->qkv_bias || *m->final_bias) {
-              assert(fused->op_num_weights[op] == 2);
-              biases = my_weight_accessor[1];
-            }
-            SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-                m,
-                &search_bc,
-                task->index_point.point_data[0],
-                my_input_accessor[0],
-                my_weight_accessor[0],
-                my_output_accessor[0],
-                biases);
-            break;
+          break;
+        }
+        case OP_RMS_NORM: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_weights[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
+          Kernels::RMSNorm::forward_kernel_wrapper(m,
+                                                  my_input_accessor[0],
+                                                  my_weight_accessor[0],
+                                                  my_output_accessor[0]);
+          break;
+        }
+        case OP_RESIDUAL_RMS_NORM: {
+          assert(fused->op_num_inputs[op] == 2);
+          assert(fused->op_num_weights[op] == 1);
+          assert(fused->op_num_outputs[op] == 2);
+          ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
+          Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
+                                                          my_input_accessor[0],
+                                                          my_input_accessor[1],
+                                                          my_weight_accessor[0],
+                                                          my_output_accessor[0],
+                                                          my_output_accessor[1]);
+          break;
+        }
+        case OP_INC_MULTIHEAD_SELF_ATTENTION: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          IncMultiHeadSelfAttentionMeta const *m =
+              (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+          assert(fused->op_num_weights[op] ==
+                (1 + (int)(*m->qkv_bias || *m->final_bias)));
+          GenericTensorAccessorR biases;
+          if (*m->qkv_bias || *m->final_bias) {
+            assert(fused->op_num_weights[op] == 2);
+            biases = my_weight_accessor[1];
           }
-          case OP_LAYERNORM: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
-            if (m->elementwise_affine) {
-              assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
-            }
-            GenericTensorAccessorR gamma, beta;
-            if (m->elementwise_affine) {
-              gamma = my_weight_accessor[0];
-              if (m->use_bias) {
-                beta = my_weight_accessor[1];
-              }
-            }
-            LayerNorm::forward_kernel_wrapper(
-                m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
-            break;
+          IncMultiHeadSelfAttention::inference_kernel_wrapper(
+              m,
+              bc,
+              task->index_point.point_data[0],
+              my_input_accessor[0],
+              my_weight_accessor[0],
+              my_output_accessor[0],
+              biases);
+          break;
+        }
+        case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          TreeIncMultiHeadSelfAttentionMeta *m =
+              (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+          // TreeVerifyBatchConfig const *verify_bc =
+          //     (TreeVerifyBatchConfig *)task->args;
+          BatchConfig const &verify_bc =
+              Future(task->futures[0]).get_result<BatchConfig>();
+          assert(fused->op_num_weights[op] ==
+                (1 + (int)(*m->qkv_bias || *m->final_bias)));
+          GenericTensorAccessorR biases;
+          if (*m->qkv_bias || *m->final_bias) {
+            assert(fused->op_num_weights[op] == 2);
+            biases = my_weight_accessor[1];
           }
-          case OP_RESIDUAL_LAYERNORM: {
-            assert(fused->op_num_outputs[op] == 2);
-            ResidualLayerNormMeta const *m =
-                (ResidualLayerNormMeta *)metas->meta[op];
-            if (m->use_two_residuals) {
-              assert(fused->op_num_inputs[op] == 3);
-            } else {
-              assert(fused->op_num_inputs[op] == 2);
-            }
-            if (!m->elementwise_affine) {
-              assert(fused->op_num_weights[op] == 0);
-            } else {
-              if (!m->use_bias) {
-                assert(fused->op_num_weights[op] == 1); // weight
-              } else {
-                assert(fused->op_num_weights[op] == 2); // weight + bias
-              }
-            }
-            GenericTensorAccessorR residual2;
-            if (m->use_two_residuals) {
-              residual2 = my_input_accessor[2];
-            }
-            GenericTensorAccessorR gamma, beta;
-            if (m->elementwise_affine) {
-              gamma = my_weight_accessor[0];
-              if (m->use_bias) {
-                beta = my_weight_accessor[1];
-              }
-            }
-            ResidualLayerNorm::inference_kernel_wrapper(m,
-                                                        my_input_accessor[0],
-                                                        my_input_accessor[1],
-                                                        residual2,
-                                                        my_output_accessor[0],
-                                                        my_output_accessor[1],
-                                                        gamma,
-                                                        beta);
-            break;
+          TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
+              m,
+              &verify_bc,
+              task->index_point.point_data[0],
+              my_input_accessor[0],
+              my_weight_accessor[0],
+              my_output_accessor[0],
+              biases);
+          break;
+        }
+        case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          SpecIncMultiHeadSelfAttentionMeta const *m =
+              (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+          // TreeSearchBatchConfig const *search_bc =
+          //     (TreeSearchBatchConfig *)task->args;
+          BatchConfig const &search_bc =
+              Future(task->futures[0]).get_result<BatchConfig>();
+          assert(fused->op_num_weights[op] ==
+                (1 + (int)(*m->qkv_bias || *m->final_bias)));
+          GenericTensorAccessorR biases;
+          if (*m->qkv_bias || *m->final_bias) {
+            assert(fused->op_num_weights[op] == 2);
+            biases = my_weight_accessor[1];
           }
-          case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
-            assert(fused->op_num_inputs[op] == 2);
-            assert(fused->op_num_outputs[op] == 2);
-            AddBiasResidualLayerNormMeta const *m =
-                (AddBiasResidualLayerNormMeta *)metas->meta[op];
-            if (!m->elementwise_affine) {
-              assert(fused->op_num_weights[op] == 1); // attn bias
-            } else {
-              if (!m->use_bias) {
-                assert(fused->op_num_weights[op] == 2); // attn bias + weight
-              } else {
-                assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
-              }
-            }
-            GenericTensorAccessorR gamma, beta;
-            if (m->elementwise_affine) {
-              gamma = my_weight_accessor[1];
-              if (m->use_bias) {
-                beta = my_weight_accessor[2];
-              }
+          SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
+              m,
+              &search_bc,
+              task->index_point.point_data[0],
+              my_input_accessor[0],
+              my_weight_accessor[0],
+              my_output_accessor[0],
+              biases);
+          break;
+        }
+        case OP_LAYERNORM: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
+          if (m->elementwise_affine) {
+            assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
+          }
+          GenericTensorAccessorR gamma, beta;
+          if (m->elementwise_affine) {
+            gamma = my_weight_accessor[0];
+            if (m->use_bias) {
+              beta = my_weight_accessor[1];
             }
-            Domain attn_bias_domain = my_weight_accessor[0].domain;
-            Domain residual_domain = my_input_accessor[1].domain;
-            int attn_bias_dim =
-                attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
-            int residual_volume = residual_domain.get_volume();
-            AddBiasResidualLayerNorm::inference_kernel_wrapper(
-                m,
-                attn_bias_dim,
-                residual_volume,
-                my_input_accessor[0],
-                my_output_accessor[0],
-                my_output_accessor[1],
-                my_input_accessor[1],
-                my_weight_accessor[0],
-                gamma,
-                beta);
-            break;
           }
-          case OP_SIGMOID_SILU_MULTI: {
+          LayerNorm::forward_kernel_wrapper(
+              m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
+          break;
+        }
+        case OP_RESIDUAL_LAYERNORM: {
+          assert(fused->op_num_outputs[op] == 2);
+          ResidualLayerNormMeta const *m =
+              (ResidualLayerNormMeta *)metas->meta[op];
+          if (m->use_two_residuals) {
+            assert(fused->op_num_inputs[op] == 3);
+          } else {
             assert(fused->op_num_inputs[op] == 2);
-            assert(fused->op_num_outputs[op] == 1);
-            SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
-            SigmoidSiluMulti::inference_kernel_wrapper(m,
-                                                      my_input_accessor[0],
-                                                      my_input_accessor[1],
-                                                      my_output_accessor[0]);
-            break;
           }
-          case OP_SOFTMAX: {
-            assert(fused->op_num_inputs[op] == 1);
+          if (!m->elementwise_affine) {
             assert(fused->op_num_weights[op] == 0);
-            assert(fused->op_num_outputs[op] == 1);
-            assert(my_input_accessor[0].domain.get_volume() ==
-                  my_output_accessor[0].domain.get_volume());
-            SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
-            if (m->input_type == DT_HALF) {
-              Kernels::Softmax::forward_kernel_wrapper(
-                  m,
-                  my_input_accessor[0].get_half_ptr(),
-                  my_output_accessor[0].get_half_ptr());
-            } else if (m->input_type == DT_FLOAT) {
-              Kernels::Softmax::forward_kernel_wrapper(
-                  m,
-                  my_input_accessor[0].get_float_ptr(),
-                  my_output_accessor[0].get_float_ptr());
+          } else {
+            if (!m->use_bias) {
+              assert(fused->op_num_weights[op] == 1); // weight
+            } else {
+              assert(fused->op_num_weights[op] == 2); // weight + bias
             }
-            break;
           }
-          case OP_ALLREDUCE: {
-            assert(fused->op_num_inputs[op] == 1);
-            assert(fused->op_num_outputs[op] == 1);
-            AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
-            Kernels::AllReduce::inference_kernel_wrapper(
-                m, bc, my_input_accessor[0], my_output_accessor[0]);
-            break;
+          GenericTensorAccessorR residual2;
+          if (m->use_two_residuals) {
+            residual2 = my_input_accessor[2];
           }
-          default: {
-            fprintf(stderr,
-                    "Fusion currently does not support type = %d\n",
-                    fused->op_op_type[op]);
-            assert(false && "Fusion currently does not support type");
+          GenericTensorAccessorR gamma, beta;
+          if (m->elementwise_affine) {
+            gamma = my_weight_accessor[0];
+            if (m->use_bias) {
+              beta = my_weight_accessor[1];
+            }
           }
+          ResidualLayerNorm::inference_kernel_wrapper(m,
+                                                      my_input_accessor[0],
+                                                      my_input_accessor[1],
+                                                      residual2,
+                                                      my_output_accessor[0],
+                                                      my_output_accessor[1],
+                                                      gamma,
+                                                      beta);
+          break;
         }
-        if (metas->meta[op]->inference_debugging) {
-          std::vector<GenericTensorAccessorR> input_accessors_to_save;
-          std::vector<GenericTensorAccessorR> weight_accessors_to_save;
-          std::vector<GenericTensorAccessorR> output_accessors_to_save;
-          for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-            int my_off = fused->op_input_idx[i + ioff];
-            if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-              input_accessors_to_save.push_back(input_accessor[my_off]);
-            } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-              input_accessors_to_save.push_back(output_accessor[my_off]);
+        case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
+          assert(fused->op_num_inputs[op] == 2);
+          assert(fused->op_num_outputs[op] == 2);
+          AddBiasResidualLayerNormMeta const *m =
+              (AddBiasResidualLayerNormMeta *)metas->meta[op];
+          if (!m->elementwise_affine) {
+            assert(fused->op_num_weights[op] == 1); // attn bias
+          } else {
+            if (!m->use_bias) {
+              assert(fused->op_num_weights[op] == 2); // attn bias + weight
             } else {
-              assert(false);
+              assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
             }
           }
-          for (int i = 0; i < fused->op_num_weights[op]; i++) {
-            assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-            weight_accessors_to_save.push_back(
-                weight_accessor[fused->op_weight_idx[i + woff]]);
+          GenericTensorAccessorR gamma, beta;
+          if (m->elementwise_affine) {
+            gamma = my_weight_accessor[1];
+            if (m->use_bias) {
+              beta = my_weight_accessor[2];
+            }
           }
-          for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-            output_accessors_to_save.push_back(output_accessor[i + ooff]);
+          Domain attn_bias_domain = my_weight_accessor[0].domain;
+          Domain residual_domain = my_input_accessor[1].domain;
+          int attn_bias_dim =
+              attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
+          int residual_volume = residual_domain.get_volume();
+          AddBiasResidualLayerNorm::inference_kernel_wrapper(
+              m,
+              attn_bias_dim,
+              residual_volume,
+              my_input_accessor[0],
+              my_output_accessor[0],
+              my_output_accessor[1],
+              my_input_accessor[1],
+              my_weight_accessor[0],
+              gamma,
+              beta);
+          break;
+        }
+        case OP_SIGMOID_SILU_MULTI: {
+          assert(fused->op_num_inputs[op] == 2);
+          assert(fused->op_num_outputs[op] == 1);
+          SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
+          SigmoidSiluMulti::inference_kernel_wrapper(m,
+                                                    my_input_accessor[0],
+                                                    my_input_accessor[1],
+                                                    my_output_accessor[0]);
+          break;
+        }
+        case OP_SOFTMAX: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_weights[op] == 0);
+          assert(fused->op_num_outputs[op] == 1);
+          assert(my_input_accessor[0].domain.get_volume() ==
+                my_output_accessor[0].domain.get_volume());
+          SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
+          if (m->input_type == DT_HALF) {
+            Kernels::Softmax::forward_kernel_wrapper(
+                m,
+                my_input_accessor[0].get_half_ptr(),
+                my_output_accessor[0].get_half_ptr());
+          } else if (m->input_type == DT_FLOAT) {
+            Kernels::Softmax::forward_kernel_wrapper(
+                m,
+                my_input_accessor[0].get_float_ptr(),
+                my_output_accessor[0].get_float_ptr());
           }
-          assert(task->index_point.get_dim() == 1);
-          int shard_id = task->index_point.point_data[0];
-          FusedOp::save_inference_tensors_to_file(metas->meta[op],
-                                                  shard_id,
-                                                  bc,
-                                                  input_accessors_to_save,
-                                                  weight_accessors_to_save,
-                                                  output_accessors_to_save);
+          break;
+        }
+        case OP_ALLREDUCE: {
+          assert(fused->op_num_inputs[op] == 1);
+          assert(fused->op_num_outputs[op] == 1);
+          AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
+          Kernels::AllReduce::inference_kernel_wrapper(
+              m, bc, my_input_accessor[0], my_output_accessor[0]);
+          break;
+        }
+        default: {
+          fprintf(stderr,
+                  "Fusion currently does not support type = %d\n",
+                  fused->op_op_type[op]);
+          assert(false && "Fusion currently does not support type");
         }
-        ioff += fused->op_num_inputs[op];
-        woff += fused->op_num_weights[op];
-        ooff += fused->op_num_outputs[op];
       }
-      // for (int i = 0; i < fused->numOutputs; i++)
-      //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
-      //   "[Fused:forward:output]");
-      cudaStreamEndCapture(stream, &graph);
+      if (metas->meta[op]->inference_debugging) {
+        std::vector<GenericTensorAccessorR> input_accessors_to_save;
+        std::vector<GenericTensorAccessorR> weight_accessors_to_save;
+        std::vector<GenericTensorAccessorR> output_accessors_to_save;
+        for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+          int my_off = fused->op_input_idx[i + ioff];
+          if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+            input_accessors_to_save.push_back(input_accessor[my_off]);
+          } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+            input_accessors_to_save.push_back(output_accessor[my_off]);
+          } else {
+            assert(false);
+          }
+        }
+        for (int i = 0; i < fused->op_num_weights[op]; i++) {
+          assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+          weight_accessors_to_save.push_back(
+              weight_accessor[fused->op_weight_idx[i + woff]]);
+        }
+        for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+          output_accessors_to_save.push_back(output_accessor[i + ooff]);
+        }
+        assert(task->index_point.get_dim() == 1);
+        int shard_id = task->index_point.point_data[0];
+        FusedOp::save_inference_tensors_to_file(metas->meta[op],
+                                                shard_id,
+                                                bc,
+                                                input_accessors_to_save,
+                                                weight_accessors_to_save,
+                                                output_accessors_to_save);
+      }
+      ioff += fused->op_num_inputs[op];
+      woff += fused->op_num_weights[op];
+      ooff += fused->op_num_outputs[op];
     }
+    // for (int i = 0; i < fused->numOutputs; i++)
+    //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
+    //   "[Fused:forward:output]");
+    cudaStreamEndCapture(stream, &graph);
+  }
 
   bool captured = false;
 

From 239a225db51420848d3674a147248b5661b9a3ad Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 21 May 2024 11:12:47 -0700
Subject: [PATCH 267/667] feat: remove cudaGraphExecUpdate

---
 src/ops/fused.cu | 1025 +++++++++++++++++++++++-----------------------
 1 file changed, 512 insertions(+), 513 deletions(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index d83244baf..7d1c2944d 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -603,9 +603,8 @@ __host__ void
   checkCUDA(get_legion_stream(&stream));
 
   // create new cuda graph
-  cudaGraph_t graph;
   cudaGraphExec_t instance;
-  cudaGraphExecUpdateResult updateResult;
+  // cudaGraphExecUpdateResult updateResult;
 
   GraphParams graph_params = {bc->num_active_requests(),
                       bc->num_active_tokens(),
@@ -613,554 +612,554 @@ __host__ void
   //graph_params.Print();
   // int shard_id = task->index_point.point_data[0];
 
-  {            
-    cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
-    int ioff = 0, woff = 0, ooff = 0;
-    for (int op = 0; op < fused->numOperators; op++) {
-      // Domain my_id[MAX_NUM_INPUTS];
-      // Domain my_wd[MAX_NUM_WEIGHTS];
-      // Domain my_od[MAX_NUM_OUTPUTS];
-      GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
-      GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
-      GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
-      for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-        int my_off = fused->op_input_idx[i + ioff];
-        if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-          // my_id[i] = input_domain[my_off];
-          assert(my_off < fused->numInputs);
-          my_input_accessor[i] = input_accessor[my_off];
-        } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-          // my_id[i] = output_domain[my_off];
-          assert(my_off < fused->numOutputs);
-          my_input_accessor[i] = output_accessor[my_off];
-        } else {
-          assert(false);
+  bool captured = false;
+
+  if(metas->graph_collections.count(graph_params)  != 0) {
+    captured = true;
+    instance = metas->graph_collections[graph_params];
+    // if (cudaGraphExecUpdate(instance, graph, NULL, &updateResult) != cudaSuccess) {
+    //   cudaGraphExecDestroy(instance);
+    //   captured = false;
+    // } else {
+    //   // if(shard_id == 0) {
+    //   //   printf("---------start to reuse the graph-------\n");
+    //   //   graph_params.Print();
+    //   //   // bc->print();
+    //   //   printf("---------end to reuse the graph-------\n");
+    //   // }
+    // }
+  }
+
+  if (!captured) {
+    cudaGraph_t graph;
+    {    
+      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+      int ioff = 0, woff = 0, ooff = 0;
+      for (int op = 0; op < fused->numOperators; op++) {
+        // Domain my_id[MAX_NUM_INPUTS];
+        // Domain my_wd[MAX_NUM_WEIGHTS];
+        // Domain my_od[MAX_NUM_OUTPUTS];
+        GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS];
+        GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS];
+        GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS];
+        for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+          int my_off = fused->op_input_idx[i + ioff];
+          if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+            // my_id[i] = input_domain[my_off];
+            assert(my_off < fused->numInputs);
+            my_input_accessor[i] = input_accessor[my_off];
+          } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+            // my_id[i] = output_domain[my_off];
+            assert(my_off < fused->numOutputs);
+            my_input_accessor[i] = output_accessor[my_off];
+          } else {
+            assert(false);
+          }
         }
-      }
-      for (int i = 0; i < fused->op_num_weights[op]; i++) {
-        assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-        // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
-        // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
-        assert(fused->op_weight_idx[i + woff] < fused->numWeights);
-        my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
-      }
-      for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-        int my_off = fused->op_output_idx[i + ooff];
-        assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
-        assert(my_off < fused->numOutputs);
-        // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
-        // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
-        my_output_accessor[i] = output_accessor[my_off];
-      }
-      switch (fused->op_op_type[op]) {
-        case OP_CONCAT: {
-          assert(fused->op_num_weights[op] == 0);
-          assert(fused->op_num_outputs[op] == 1);
-          ConcatMeta *m = (ConcatMeta *)metas->meta[op];
-          int num_inputs = fused->op_num_inputs[op];
-          Kernels::Concat::forward_kernel_wrapper(m,
-                                                  my_output_accessor[0],
-                                                  my_input_accessor,
-                                                  num_inputs,
-                                                  m->legion_axis);
-          break;
+        for (int i = 0; i < fused->op_num_weights[op]; i++) {
+          assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+          // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
+          // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
+          assert(fused->op_weight_idx[i + woff] < fused->numWeights);
+          my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
         }
-        case OP_BATCHNORM: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          assert(my_input_accessor[0].domain.get_dim() == 5);
-          assert(my_output_accessor[0].domain.get_dim() == 5);
-          assert(my_weight_accessor[0].domain.get_dim() == 2);
-          assert(my_weight_accessor[1].domain.get_dim() == 2);
-          BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
-          BatchNorm::forward_kernel(m,
-                                    my_input_accessor[0].get_float_ptr(),
-                                    my_output_accessor[0].get_float_ptr(),
-                                    my_weight_accessor[0].get_float_ptr(),
-                                    my_weight_accessor[1].get_float_ptr());
-          break;
+        for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+          int my_off = fused->op_output_idx[i + ooff];
+          assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT);
+          assert(my_off < fused->numOutputs);
+          // my_od[i] = output_domain[fused->op_output_idx[i + ooff]];
+          // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]];
+          my_output_accessor[i] = output_accessor[my_off];
         }
-        case OP_LINEAR: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          Domain kernel_domain = my_weight_accessor[0].domain;
-          int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
-          int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
-          int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
-          assert(my_output_accessor[0].domain.get_volume() ==
-                out_dim * batch_size);
-          assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
-          void const *bias_ptr = nullptr;
-          LinearMeta *m = (LinearMeta *)metas->meta[op];
-          if (fused->op_num_weights[op] == 2) {
-            assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-            if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
-              bias_ptr = my_weight_accessor[1].ptr;
+        switch (fused->op_op_type[op]) {
+          case OP_CONCAT: {
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            ConcatMeta *m = (ConcatMeta *)metas->meta[op];
+            int num_inputs = fused->op_num_inputs[op];
+            Kernels::Concat::forward_kernel_wrapper(m,
+                                                    my_output_accessor[0],
+                                                    my_input_accessor,
+                                                    num_inputs,
+                                                    m->legion_axis);
+            break;
+          }
+          case OP_BATCHNORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain.get_dim() == 5);
+            assert(my_output_accessor[0].domain.get_dim() == 5);
+            assert(my_weight_accessor[0].domain.get_dim() == 2);
+            assert(my_weight_accessor[1].domain.get_dim() == 2);
+            BatchNormMeta *m = (BatchNormMeta *)metas->meta[op];
+            BatchNorm::forward_kernel(m,
+                                      my_input_accessor[0].get_float_ptr(),
+                                      my_output_accessor[0].get_float_ptr(),
+                                      my_weight_accessor[0].get_float_ptr(),
+                                      my_weight_accessor[1].get_float_ptr());
+            break;
+          }
+          case OP_LINEAR: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            Domain kernel_domain = my_weight_accessor[0].domain;
+            int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1;
+            int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
+            int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
+            assert(my_output_accessor[0].domain.get_volume() ==
+                  out_dim * batch_size);
+            assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
+            void const *bias_ptr = nullptr;
+            LinearMeta *m = (LinearMeta *)metas->meta[op];
+            if (fused->op_num_weights[op] == 2) {
+              assert(my_weight_accessor[1].domain.get_volume() == out_dim);
+              if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+                bias_ptr = my_weight_accessor[1].ptr;
+              }
+            } else {
+              assert(fused->op_num_weights[op] == 1);
             }
-          } else {
-            assert(fused->op_num_weights[op] == 1);
+            assert(m->input_type[0] == my_input_accessor[0].data_type);
+            assert(m->input_type[0] == my_output_accessor[0].data_type);
+            batch_size = bc->num_active_tokens();
+            Kernels::Linear::forward_kernel_wrapper(m,
+                                                    my_input_accessor[0].ptr,
+                                                    my_output_accessor[0].ptr,
+                                                    my_weight_accessor[0].ptr,
+                                                    bias_ptr,
+                                                    in_dim,
+                                                    out_dim,
+                                                    batch_size);
+            break;
           }
-          assert(m->input_type[0] == my_input_accessor[0].data_type);
-          assert(m->input_type[0] == my_output_accessor[0].data_type);
-          batch_size = bc->num_active_tokens();
-          Kernels::Linear::forward_kernel_wrapper(m,
-                                                  my_input_accessor[0].ptr,
-                                                  my_output_accessor[0].ptr,
-                                                  my_weight_accessor[0].ptr,
-                                                  bias_ptr,
-                                                  in_dim,
-                                                  out_dim,
-                                                  batch_size);
-          break;
-        }
-        case OP_BATCHMATMUL: {
-          assert(fused->op_num_inputs[op] == 2);
-          assert(fused->op_num_weights[op] == 0);
-          assert(fused->op_num_outputs[op] == 1);
-          Domain out_domain = my_output_accessor[0].domain;
-          Domain a_domain = my_input_accessor[0].domain;
-          Domain b_domain = my_input_accessor[1].domain;
-          int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
-          assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
-          int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
-          assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
-          int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
-          assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
-          assert(a_domain.get_dim() == b_domain.get_dim());
-          assert(a_domain.get_dim() == out_domain.get_dim());
-          int batch = 1;
-          for (int i = 2; i < a_domain.get_dim(); i++) {
-            int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
-            assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
-            assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
-            batch *= dim_size;
+          case OP_BATCHMATMUL: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            Domain out_domain = my_output_accessor[0].domain;
+            Domain a_domain = my_input_accessor[0].domain;
+            Domain b_domain = my_input_accessor[1].domain;
+            int m = b_domain.hi()[0] - b_domain.lo()[0] + 1;
+            assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1);
+            int n = a_domain.hi()[1] - a_domain.lo()[1] + 1;
+            assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1);
+            int k = a_domain.hi()[0] - a_domain.lo()[0] + 1;
+            assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1);
+            assert(a_domain.get_dim() == b_domain.get_dim());
+            assert(a_domain.get_dim() == out_domain.get_dim());
+            int batch = 1;
+            for (int i = 2; i < a_domain.get_dim(); i++) {
+              int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1;
+              assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1);
+              assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1);
+              batch *= dim_size;
+            }
+            BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
+            Kernels::BatchMatmul::forward_kernel_wrapper(
+                meta,
+                my_output_accessor[0].get_float_ptr(),
+                my_input_accessor[0].get_float_ptr(),
+                my_input_accessor[1].get_float_ptr(),
+                (float const *)nullptr,
+                m,
+                n,
+                k,
+                batch,
+                meta->a_seq_length_dim,
+                meta->b_seq_length_dim,
+                fused->iter_config.seq_length);
+            break;
           }
-          BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op];
-          Kernels::BatchMatmul::forward_kernel_wrapper(
-              meta,
-              my_output_accessor[0].get_float_ptr(),
-              my_input_accessor[0].get_float_ptr(),
-              my_input_accessor[1].get_float_ptr(),
-              (float const *)nullptr,
-              m,
-              n,
-              k,
-              batch,
-              meta->a_seq_length_dim,
-              meta->b_seq_length_dim,
-              fused->iter_config.seq_length);
-          break;
-        }
-        case OP_EW_ADD:
-        case OP_EW_SUB:
-        case OP_EW_MUL:
-        case OP_EW_DIV:
-        case OP_EW_MAX:
-        case OP_EW_MIN: {
-          assert(fused->op_num_inputs[op] == 2);
-          assert(fused->op_num_weights[op] == 0);
-          assert(fused->op_num_outputs[op] == 1);
-          assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
-          assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-          ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
-          Kernels::ElementBinary::forward_kernel_wrapper(m,
-                                                        my_input_accessor[0],
-                                                        my_input_accessor[1],
-                                                        my_output_accessor[0]);
-          break;
-        }
-        case OP_EMBEDDING: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_weights[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
-          if (m->aggr == AGGR_MODE_NONE) {
-            // assert(kernel_domain.get_dim() == 2);
-            assert(my_input_accessor[0].domain.get_dim() + 1 ==
-                  my_output_accessor[0].domain.get_dim());
-            for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
-              assert(my_input_accessor[0].domain.hi()[i] ==
-                    my_output_accessor[0].domain.hi()[i + 1]);
-              assert(my_input_accessor[0].domain.lo()[i] ==
-                    my_output_accessor[0].domain.lo()[i + 1]);
+          case OP_EW_ADD:
+          case OP_EW_SUB:
+          case OP_EW_MUL:
+          case OP_EW_DIV:
+          case OP_EW_MAX:
+          case OP_EW_MIN: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
+            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+            ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
+            Kernels::ElementBinary::forward_kernel_wrapper(m,
+                                                          my_input_accessor[0],
+                                                          my_input_accessor[1],
+                                                          my_output_accessor[0]);
+            break;
+          }
+          case OP_EMBEDDING: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op];
+            if (m->aggr == AGGR_MODE_NONE) {
+              // assert(kernel_domain.get_dim() == 2);
+              assert(my_input_accessor[0].domain.get_dim() + 1 ==
+                    my_output_accessor[0].domain.get_dim());
+              for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
+                assert(my_input_accessor[0].domain.hi()[i] ==
+                      my_output_accessor[0].domain.hi()[i + 1]);
+                assert(my_input_accessor[0].domain.lo()[i] ==
+                      my_output_accessor[0].domain.lo()[i + 1]);
+              }
+              assert(my_weight_accessor[0].domain.hi()[0] -
+                        my_weight_accessor[0].domain.lo()[0] ==
+                    my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0]);
+            } else {
+              assert(my_input_accessor[0].domain.get_dim() ==
+                    my_output_accessor[0].domain.get_dim());
+              for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
+                assert(my_input_accessor[0].domain.hi()[i] ==
+                      my_output_accessor[0].domain.hi()[i]);
+                assert(my_input_accessor[0].domain.lo()[i] ==
+                      my_output_accessor[0].domain.lo()[i]);
+              }
+              assert(my_weight_accessor[0].domain.hi()[0] -
+                        my_weight_accessor[0].domain.lo()[0] ==
+                    my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0]);
             }
-            assert(my_weight_accessor[0].domain.hi()[0] -
-                      my_weight_accessor[0].domain.lo()[0] ==
-                  my_output_accessor[0].domain.hi()[0] -
-                      my_output_accessor[0].domain.lo()[0]);
-          } else {
-            assert(my_input_accessor[0].domain.get_dim() ==
-                  my_output_accessor[0].domain.get_dim());
-            for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
-              assert(my_input_accessor[0].domain.hi()[i] ==
-                    my_output_accessor[0].domain.hi()[i]);
-              assert(my_input_accessor[0].domain.lo()[i] ==
-                    my_output_accessor[0].domain.lo()[i]);
+            int in_dim, out_dim, effective_batch_size;
+            if (m->aggr == AGGR_MODE_NONE) {
+              in_dim = 1;
+              out_dim = my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0] + 1;
+              effective_batch_size =
+                  my_output_accessor[0].domain.get_volume() / out_dim;
+              assert(effective_batch_size * in_dim ==
+                    my_input_accessor[0].domain.get_volume());
+            } else {
+              assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
+              in_dim = my_input_accessor[0].domain.hi()[0] -
+                      my_input_accessor[0].domain.lo()[0] + 1;
+              out_dim = my_output_accessor[0].domain.hi()[0] -
+                        my_output_accessor[0].domain.lo()[0] + 1;
+              effective_batch_size =
+                  my_output_accessor[0].domain.get_volume() / out_dim;
+              assert(effective_batch_size * in_dim ==
+                    my_input_accessor[0].domain.get_volume());
             }
-            assert(my_weight_accessor[0].domain.hi()[0] -
-                      my_weight_accessor[0].domain.lo()[0] ==
-                  my_output_accessor[0].domain.hi()[0] -
-                      my_output_accessor[0].domain.lo()[0]);
+
+            assert(my_input_accessor[0].data_type == DT_INT32 ||
+                  my_input_accessor[0].data_type == DT_INT64);
+            Kernels::Embedding::forward_kernel_wrapper(m,
+                                                      my_input_accessor[0],
+                                                      my_output_accessor[0],
+                                                      my_weight_accessor[0],
+                                                      in_dim,
+                                                      out_dim,
+                                                      effective_batch_size);
+            break;
           }
-          int in_dim, out_dim, effective_batch_size;
-          if (m->aggr == AGGR_MODE_NONE) {
-            in_dim = 1;
-            out_dim = my_output_accessor[0].domain.hi()[0] -
-                      my_output_accessor[0].domain.lo()[0] + 1;
-            effective_batch_size =
-                my_output_accessor[0].domain.get_volume() / out_dim;
-            assert(effective_batch_size * in_dim ==
+          case OP_GELU:
+          case OP_RELU:
+          case OP_SIGMOID:
+          case OP_TANH:
+          case OP_ELU:
+          case OP_SCALAR_TRUE_DIV: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 0);
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
+            ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
+            if (m->data_type == DT_HALF) {
+              ElementUnary::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_half_ptr(),
+                  my_output_accessor[0].get_half_ptr(),
                   my_input_accessor[0].domain.get_volume());
-          } else {
-            assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
-            in_dim = my_input_accessor[0].domain.hi()[0] -
-                    my_input_accessor[0].domain.lo()[0] + 1;
-            out_dim = my_output_accessor[0].domain.hi()[0] -
-                      my_output_accessor[0].domain.lo()[0] + 1;
-            effective_batch_size =
-                my_output_accessor[0].domain.get_volume() / out_dim;
-            assert(effective_batch_size * in_dim ==
+            } else if (m->data_type == DT_FLOAT) {
+              ElementUnary::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_float_ptr(),
+                  my_output_accessor[0].get_float_ptr(),
                   my_input_accessor[0].domain.get_volume());
+            } else {
+              assert(false && "Unsupported data type in ElementUnary forward");
+            }
+            break;
           }
-
-          assert(my_input_accessor[0].data_type == DT_INT32 ||
-                my_input_accessor[0].data_type == DT_INT64);
-          Kernels::Embedding::forward_kernel_wrapper(m,
+          case OP_RMS_NORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
+            Kernels::RMSNorm::forward_kernel_wrapper(m,
                                                     my_input_accessor[0],
-                                                    my_output_accessor[0],
                                                     my_weight_accessor[0],
-                                                    in_dim,
-                                                    out_dim,
-                                                    effective_batch_size);
-          break;
-        }
-        case OP_GELU:
-        case OP_RELU:
-        case OP_SIGMOID:
-        case OP_TANH:
-        case OP_ELU:
-        case OP_SCALAR_TRUE_DIV: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_weights[op] == 0);
-          assert(fused->op_num_outputs[op] == 1);
-          assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
-          ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op];
-          if (m->data_type == DT_HALF) {
-            ElementUnary::forward_kernel_wrapper(
-                m,
-                my_input_accessor[0].get_half_ptr(),
-                my_output_accessor[0].get_half_ptr(),
-                my_input_accessor[0].domain.get_volume());
-          } else if (m->data_type == DT_FLOAT) {
-            ElementUnary::forward_kernel_wrapper(
+                                                    my_output_accessor[0]);
+            break;
+          }
+          case OP_RESIDUAL_RMS_NORM: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_weights[op] == 1);
+            assert(fused->op_num_outputs[op] == 2);
+            ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
+            Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
+                                                            my_input_accessor[0],
+                                                            my_input_accessor[1],
+                                                            my_weight_accessor[0],
+                                                            my_output_accessor[0],
+                                                            my_output_accessor[1]);
+            break;
+          }
+          case OP_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            IncMultiHeadSelfAttentionMeta const *m =
+                (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            IncMultiHeadSelfAttention::inference_kernel_wrapper(
                 m,
-                my_input_accessor[0].get_float_ptr(),
-                my_output_accessor[0].get_float_ptr(),
-                my_input_accessor[0].domain.get_volume());
-          } else {
-            assert(false && "Unsupported data type in ElementUnary forward");
+                bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
           }
-          break;
-        }
-        case OP_RMS_NORM: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_weights[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
-          Kernels::RMSNorm::forward_kernel_wrapper(m,
-                                                  my_input_accessor[0],
-                                                  my_weight_accessor[0],
-                                                  my_output_accessor[0]);
-          break;
-        }
-        case OP_RESIDUAL_RMS_NORM: {
-          assert(fused->op_num_inputs[op] == 2);
-          assert(fused->op_num_weights[op] == 1);
-          assert(fused->op_num_outputs[op] == 2);
-          ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
-          Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
-                                                          my_input_accessor[0],
-                                                          my_input_accessor[1],
-                                                          my_weight_accessor[0],
-                                                          my_output_accessor[0],
-                                                          my_output_accessor[1]);
-          break;
-        }
-        case OP_INC_MULTIHEAD_SELF_ATTENTION: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          IncMultiHeadSelfAttentionMeta const *m =
-              (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
-          assert(fused->op_num_weights[op] ==
-                (1 + (int)(*m->qkv_bias || *m->final_bias)));
-          GenericTensorAccessorR biases;
-          if (*m->qkv_bias || *m->final_bias) {
-            assert(fused->op_num_weights[op] == 2);
-            biases = my_weight_accessor[1];
+          case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            TreeIncMultiHeadSelfAttentionMeta *m =
+                (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            // TreeVerifyBatchConfig const *verify_bc =
+            //     (TreeVerifyBatchConfig *)task->args;
+            BatchConfig const &verify_bc =
+                Future(task->futures[0]).get_result<BatchConfig>();
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
+                m,
+                &verify_bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
           }
-          IncMultiHeadSelfAttention::inference_kernel_wrapper(
-              m,
-              bc,
-              task->index_point.point_data[0],
-              my_input_accessor[0],
-              my_weight_accessor[0],
-              my_output_accessor[0],
-              biases);
-          break;
-        }
-        case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          TreeIncMultiHeadSelfAttentionMeta *m =
-              (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-          // TreeVerifyBatchConfig const *verify_bc =
-          //     (TreeVerifyBatchConfig *)task->args;
-          BatchConfig const &verify_bc =
-              Future(task->futures[0]).get_result<BatchConfig>();
-          assert(fused->op_num_weights[op] ==
-                (1 + (int)(*m->qkv_bias || *m->final_bias)));
-          GenericTensorAccessorR biases;
-          if (*m->qkv_bias || *m->final_bias) {
-            assert(fused->op_num_weights[op] == 2);
-            biases = my_weight_accessor[1];
+          case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            SpecIncMultiHeadSelfAttentionMeta const *m =
+                (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
+            // TreeSearchBatchConfig const *search_bc =
+            //     (TreeSearchBatchConfig *)task->args;
+            BatchConfig const &search_bc =
+                Future(task->futures[0]).get_result<BatchConfig>();
+            assert(fused->op_num_weights[op] ==
+                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+            GenericTensorAccessorR biases;
+            if (*m->qkv_bias || *m->final_bias) {
+              assert(fused->op_num_weights[op] == 2);
+              biases = my_weight_accessor[1];
+            }
+            SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
+                m,
+                &search_bc,
+                task->index_point.point_data[0],
+                my_input_accessor[0],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                biases);
+            break;
           }
-          TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-              m,
-              &verify_bc,
-              task->index_point.point_data[0],
-              my_input_accessor[0],
-              my_weight_accessor[0],
-              my_output_accessor[0],
-              biases);
-          break;
-        }
-        case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          SpecIncMultiHeadSelfAttentionMeta const *m =
-              (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-          // TreeSearchBatchConfig const *search_bc =
-          //     (TreeSearchBatchConfig *)task->args;
-          BatchConfig const &search_bc =
-              Future(task->futures[0]).get_result<BatchConfig>();
-          assert(fused->op_num_weights[op] ==
-                (1 + (int)(*m->qkv_bias || *m->final_bias)));
-          GenericTensorAccessorR biases;
-          if (*m->qkv_bias || *m->final_bias) {
-            assert(fused->op_num_weights[op] == 2);
-            biases = my_weight_accessor[1];
+          case OP_LAYERNORM: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
+            if (m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[0];
+              if (m->use_bias) {
+                beta = my_weight_accessor[1];
+              }
+            }
+            LayerNorm::forward_kernel_wrapper(
+                m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
+            break;
           }
-          SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-              m,
-              &search_bc,
-              task->index_point.point_data[0],
-              my_input_accessor[0],
-              my_weight_accessor[0],
-              my_output_accessor[0],
-              biases);
-          break;
-        }
-        case OP_LAYERNORM: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op];
-          if (m->elementwise_affine) {
-            assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias));
+          case OP_RESIDUAL_LAYERNORM: {
+            assert(fused->op_num_outputs[op] == 2);
+            ResidualLayerNormMeta const *m =
+                (ResidualLayerNormMeta *)metas->meta[op];
+            if (m->use_two_residuals) {
+              assert(fused->op_num_inputs[op] == 3);
+            } else {
+              assert(fused->op_num_inputs[op] == 2);
+            }
+            if (!m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 0);
+            } else {
+              if (!m->use_bias) {
+                assert(fused->op_num_weights[op] == 1); // weight
+              } else {
+                assert(fused->op_num_weights[op] == 2); // weight + bias
+              }
+            }
+            GenericTensorAccessorR residual2;
+            if (m->use_two_residuals) {
+              residual2 = my_input_accessor[2];
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[0];
+              if (m->use_bias) {
+                beta = my_weight_accessor[1];
+              }
+            }
+            ResidualLayerNorm::inference_kernel_wrapper(m,
+                                                        my_input_accessor[0],
+                                                        my_input_accessor[1],
+                                                        residual2,
+                                                        my_output_accessor[0],
+                                                        my_output_accessor[1],
+                                                        gamma,
+                                                        beta);
+            break;
           }
-          GenericTensorAccessorR gamma, beta;
-          if (m->elementwise_affine) {
-            gamma = my_weight_accessor[0];
-            if (m->use_bias) {
-              beta = my_weight_accessor[1];
+          case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
+            assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_outputs[op] == 2);
+            AddBiasResidualLayerNormMeta const *m =
+                (AddBiasResidualLayerNormMeta *)metas->meta[op];
+            if (!m->elementwise_affine) {
+              assert(fused->op_num_weights[op] == 1); // attn bias
+            } else {
+              if (!m->use_bias) {
+                assert(fused->op_num_weights[op] == 2); // attn bias + weight
+              } else {
+                assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
+              }
+            }
+            GenericTensorAccessorR gamma, beta;
+            if (m->elementwise_affine) {
+              gamma = my_weight_accessor[1];
+              if (m->use_bias) {
+                beta = my_weight_accessor[2];
+              }
             }
+            Domain attn_bias_domain = my_weight_accessor[0].domain;
+            Domain residual_domain = my_input_accessor[1].domain;
+            int attn_bias_dim =
+                attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
+            int residual_volume = residual_domain.get_volume();
+            AddBiasResidualLayerNorm::inference_kernel_wrapper(
+                m,
+                attn_bias_dim,
+                residual_volume,
+                my_input_accessor[0],
+                my_output_accessor[0],
+                my_output_accessor[1],
+                my_input_accessor[1],
+                my_weight_accessor[0],
+                gamma,
+                beta);
+            break;
           }
-          LayerNorm::forward_kernel_wrapper(
-              m, my_input_accessor[0], my_output_accessor[0], gamma, beta);
-          break;
-        }
-        case OP_RESIDUAL_LAYERNORM: {
-          assert(fused->op_num_outputs[op] == 2);
-          ResidualLayerNormMeta const *m =
-              (ResidualLayerNormMeta *)metas->meta[op];
-          if (m->use_two_residuals) {
-            assert(fused->op_num_inputs[op] == 3);
-          } else {
+          case OP_SIGMOID_SILU_MULTI: {
             assert(fused->op_num_inputs[op] == 2);
+            assert(fused->op_num_outputs[op] == 1);
+            SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
+            SigmoidSiluMulti::inference_kernel_wrapper(m,
+                                                      my_input_accessor[0],
+                                                      my_input_accessor[1],
+                                                      my_output_accessor[0]);
+            break;
           }
-          if (!m->elementwise_affine) {
+          case OP_SOFTMAX: {
+            assert(fused->op_num_inputs[op] == 1);
             assert(fused->op_num_weights[op] == 0);
-          } else {
-            if (!m->use_bias) {
-              assert(fused->op_num_weights[op] == 1); // weight
-            } else {
-              assert(fused->op_num_weights[op] == 2); // weight + bias
+            assert(fused->op_num_outputs[op] == 1);
+            assert(my_input_accessor[0].domain.get_volume() ==
+                  my_output_accessor[0].domain.get_volume());
+            SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
+            if (m->input_type == DT_HALF) {
+              Kernels::Softmax::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_half_ptr(),
+                  my_output_accessor[0].get_half_ptr());
+            } else if (m->input_type == DT_FLOAT) {
+              Kernels::Softmax::forward_kernel_wrapper(
+                  m,
+                  my_input_accessor[0].get_float_ptr(),
+                  my_output_accessor[0].get_float_ptr());
             }
+            break;
           }
-          GenericTensorAccessorR residual2;
-          if (m->use_two_residuals) {
-            residual2 = my_input_accessor[2];
+          case OP_ALLREDUCE: {
+            assert(fused->op_num_inputs[op] == 1);
+            assert(fused->op_num_outputs[op] == 1);
+            AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
+            Kernels::AllReduce::inference_kernel_wrapper(
+                m, bc, my_input_accessor[0], my_output_accessor[0]);
+            break;
           }
-          GenericTensorAccessorR gamma, beta;
-          if (m->elementwise_affine) {
-            gamma = my_weight_accessor[0];
-            if (m->use_bias) {
-              beta = my_weight_accessor[1];
-            }
+          default: {
+            fprintf(stderr,
+                    "Fusion currently does not support type = %d\n",
+                    fused->op_op_type[op]);
+            assert(false && "Fusion currently does not support type");
           }
-          ResidualLayerNorm::inference_kernel_wrapper(m,
-                                                      my_input_accessor[0],
-                                                      my_input_accessor[1],
-                                                      residual2,
-                                                      my_output_accessor[0],
-                                                      my_output_accessor[1],
-                                                      gamma,
-                                                      beta);
-          break;
         }
-        case OP_ADD_BIAS_RESIDUAL_LAYERNORM: {
-          assert(fused->op_num_inputs[op] == 2);
-          assert(fused->op_num_outputs[op] == 2);
-          AddBiasResidualLayerNormMeta const *m =
-              (AddBiasResidualLayerNormMeta *)metas->meta[op];
-          if (!m->elementwise_affine) {
-            assert(fused->op_num_weights[op] == 1); // attn bias
-          } else {
-            if (!m->use_bias) {
-              assert(fused->op_num_weights[op] == 2); // attn bias + weight
+        if (metas->meta[op]->inference_debugging) {
+          std::vector<GenericTensorAccessorR> input_accessors_to_save;
+          std::vector<GenericTensorAccessorR> weight_accessors_to_save;
+          std::vector<GenericTensorAccessorR> output_accessors_to_save;
+          for (int i = 0; i < fused->op_num_inputs[op]; i++) {
+            int my_off = fused->op_input_idx[i + ioff];
+            if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
+              input_accessors_to_save.push_back(input_accessor[my_off]);
+            } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
+              input_accessors_to_save.push_back(output_accessor[my_off]);
             } else {
-              assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
+              assert(false);
             }
           }
-          GenericTensorAccessorR gamma, beta;
-          if (m->elementwise_affine) {
-            gamma = my_weight_accessor[1];
-            if (m->use_bias) {
-              beta = my_weight_accessor[2];
-            }
+          for (int i = 0; i < fused->op_num_weights[op]; i++) {
+            assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
+            weight_accessors_to_save.push_back(
+                weight_accessor[fused->op_weight_idx[i + woff]]);
           }
-          Domain attn_bias_domain = my_weight_accessor[0].domain;
-          Domain residual_domain = my_input_accessor[1].domain;
-          int attn_bias_dim =
-              attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1;
-          int residual_volume = residual_domain.get_volume();
-          AddBiasResidualLayerNorm::inference_kernel_wrapper(
-              m,
-              attn_bias_dim,
-              residual_volume,
-              my_input_accessor[0],
-              my_output_accessor[0],
-              my_output_accessor[1],
-              my_input_accessor[1],
-              my_weight_accessor[0],
-              gamma,
-              beta);
-          break;
-        }
-        case OP_SIGMOID_SILU_MULTI: {
-          assert(fused->op_num_inputs[op] == 2);
-          assert(fused->op_num_outputs[op] == 1);
-          SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
-          SigmoidSiluMulti::inference_kernel_wrapper(m,
-                                                    my_input_accessor[0],
-                                                    my_input_accessor[1],
-                                                    my_output_accessor[0]);
-          break;
-        }
-        case OP_SOFTMAX: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_weights[op] == 0);
-          assert(fused->op_num_outputs[op] == 1);
-          assert(my_input_accessor[0].domain.get_volume() ==
-                my_output_accessor[0].domain.get_volume());
-          SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
-          if (m->input_type == DT_HALF) {
-            Kernels::Softmax::forward_kernel_wrapper(
-                m,
-                my_input_accessor[0].get_half_ptr(),
-                my_output_accessor[0].get_half_ptr());
-          } else if (m->input_type == DT_FLOAT) {
-            Kernels::Softmax::forward_kernel_wrapper(
-                m,
-                my_input_accessor[0].get_float_ptr(),
-                my_output_accessor[0].get_float_ptr());
+          for (int i = 0; i < fused->op_num_outputs[op]; i++) {
+            output_accessors_to_save.push_back(output_accessor[i + ooff]);
           }
-          break;
-        }
-        case OP_ALLREDUCE: {
-          assert(fused->op_num_inputs[op] == 1);
-          assert(fused->op_num_outputs[op] == 1);
-          AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
-          Kernels::AllReduce::inference_kernel_wrapper(
-              m, bc, my_input_accessor[0], my_output_accessor[0]);
-          break;
-        }
-        default: {
-          fprintf(stderr,
-                  "Fusion currently does not support type = %d\n",
-                  fused->op_op_type[op]);
-          assert(false && "Fusion currently does not support type");
+          assert(task->index_point.get_dim() == 1);
+          int shard_id = task->index_point.point_data[0];
+          FusedOp::save_inference_tensors_to_file(metas->meta[op],
+                                                  shard_id,
+                                                  bc,
+                                                  input_accessors_to_save,
+                                                  weight_accessors_to_save,
+                                                  output_accessors_to_save);
         }
+        ioff += fused->op_num_inputs[op];
+        woff += fused->op_num_weights[op];
+        ooff += fused->op_num_outputs[op];
       }
-      if (metas->meta[op]->inference_debugging) {
-        std::vector<GenericTensorAccessorR> input_accessors_to_save;
-        std::vector<GenericTensorAccessorR> weight_accessors_to_save;
-        std::vector<GenericTensorAccessorR> output_accessors_to_save;
-        for (int i = 0; i < fused->op_num_inputs[op]; i++) {
-          int my_off = fused->op_input_idx[i + ioff];
-          if (fused->op_input_source[i + ioff] == SOURCE_INPUT) {
-            input_accessors_to_save.push_back(input_accessor[my_off]);
-          } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) {
-            input_accessors_to_save.push_back(output_accessor[my_off]);
-          } else {
-            assert(false);
-          }
-        }
-        for (int i = 0; i < fused->op_num_weights[op]; i++) {
-          assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT);
-          weight_accessors_to_save.push_back(
-              weight_accessor[fused->op_weight_idx[i + woff]]);
-        }
-        for (int i = 0; i < fused->op_num_outputs[op]; i++) {
-          output_accessors_to_save.push_back(output_accessor[i + ooff]);
-        }
-        assert(task->index_point.get_dim() == 1);
-        int shard_id = task->index_point.point_data[0];
-        FusedOp::save_inference_tensors_to_file(metas->meta[op],
-                                                shard_id,
-                                                bc,
-                                                input_accessors_to_save,
-                                                weight_accessors_to_save,
-                                                output_accessors_to_save);
-      }
-      ioff += fused->op_num_inputs[op];
-      woff += fused->op_num_weights[op];
-      ooff += fused->op_num_outputs[op];
+      // for (int i = 0; i < fused->numOutputs; i++)
+      //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
+      //   "[Fused:forward:output]");
+      cudaStreamEndCapture(stream, &graph);
     }
-    // for (int i = 0; i < fused->numOutputs; i++)
-    //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
-    //   "[Fused:forward:output]");
-    cudaStreamEndCapture(stream, &graph);
-  }
-
-  bool captured = false;
-
-  if(metas->graph_collections.count(graph_params)  != 0) {
-    captured = true;
-    instance = metas->graph_collections[graph_params];
-    if (cudaGraphExecUpdate(instance, graph, NULL, &updateResult) != cudaSuccess) {
-      cudaGraphExecDestroy(instance);
-      captured = false;
-    } else {
-      // if(shard_id == 0) {
-      //   printf("---------start to reuse the graph-------\n");
-      //   graph_params.Print();
-      //   // bc->print();
-      //   printf("---------end to reuse the graph-------\n");
-      // }
-    }
-  } 
-  
-  if (!captured) {
     cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
     metas->graph_collections[graph_params] = instance;
     // if(shard_id == 0) {
@@ -1169,11 +1168,11 @@ __host__ void
     //   // bc->print();
     //   printf("*************end cudaGraphInstantiate**********\n");
     // }
+    cudaGraphDestroy(graph);
   }
 
   assert(metas->graph_collections.find(graph_params) !=
         metas->graph_collections.end());
-  cudaGraphDestroy(graph);
   cudaGraphLaunch(instance, stream);
 }
 

From fed43a8c19bdf1a6e9736bd5c183f3e03d7c1498 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 22 May 2024 02:13:05 -0400
Subject: [PATCH 268/667] Eliminate redundant computation.

---
 src/ops/argmax.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index d9fb7198d..844d74686 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -23,10 +23,11 @@ __global__ void init_offset(int batch_size,
                             int vocab_size,
                             int total_eles,
                             int *d_offsets) {
-  CUDA_KERNEL_LOOP(i, total_eles + 1) {
-    if (i % vocab_size == 0) {
-      d_offsets[i / vocab_size] = i;
-    }
+  CUDA_KERNEL_LOOP(i, (total_eles + 1) / vocab_size) {
+    // if (i % vocab_size == 0) {
+    //   d_offsets[i / vocab_size] = i;
+    // }
+    d_offsets[i] = i * vocab_size;
   }
 }
 

From 07c58bf9dc702adf5be3ffa0256658ef7873cc75 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 23 May 2024 23:09:34 -0700
Subject: [PATCH 269/667] feat: reserve heap implementation of arg_topk

---
 src/ops/arg_topk.cc.backup | 510 +++++++++++++++++++++++++++++++++++
 src/ops/arg_topk.cu.backup | 525 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1035 insertions(+)
 create mode 100644 src/ops/arg_topk.cc.backup
 create mode 100644 src/ops/arg_topk.cu.backup

diff --git a/src/ops/arg_topk.cc.backup b/src/ops/arg_topk.cc.backup
new file mode 100644
index 000000000..706fbbc7a
--- /dev/null
+++ b/src/ops/arg_topk.cc.backup
@@ -0,0 +1,510 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/arg_topk.h"
+#include "flexflow/model.h"
+#include "flexflow/utils/hash_utils.h"
+#include "legion/legion_utilities.h"
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "flexflow/utils/cuda_helper.h"
+#else
+#include "flexflow/utils/hip_helper.h"
+#endif
+
+namespace FlexFlow {
+// declare Legion names
+using Legion::ArgumentMap;
+using Legion::Context;
+using Legion::coord_t;
+using Legion::Domain;
+using Legion::Future;
+using Legion::FutureMap;
+using Legion::IndexLauncher;
+using Legion::InlineLauncher;
+using Legion::Machine;
+using Legion::Memory;
+using Legion::PhysicalRegion;
+using Legion::Predicate;
+using Legion::Rect;
+using Legion::RegionRequirement;
+using Legion::Runtime;
+using Legion::Task;
+using Legion::TaskArgument;
+using Legion::TaskLauncher;
+using PCG::Node;
+
+// For an input tensor, computes the top k entries in each row
+// (resp. vector along the last dimension). Thus,
+// values.shape = indices.shape = input.shape[:-1] + [k]
+Tensor FFModel::arg_top_k(Tensor const input,
+                          int k,
+                          bool sorted,
+                          bool speculative_decoding,
+                          char const *name) {
+  Layer *li = new Layer(this,
+                        OP_ARG_TOPK,
+                        input->data_type,
+                        name,
+                        1 /*inputs*/,
+                        0 /*weights*/,
+                        speculative_decoding ? 2 : 1 /*outputs*/,
+                        input);
+  {
+    int numdims = input->num_dims;
+    int dims[MAX_TENSOR_DIM];
+    for (int i = 0; i < numdims; i++) {
+      dims[i] = input->dims[i];
+    }
+    dims[0] = k;
+    // li->outputs[0] = create_tensor_legion_ordering(
+    //     numdims, dims, input->data_type, li, 0, true /*create_grad*/);
+    li->outputs[0] = create_tensor_legion_ordering(
+        numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
+    if (speculative_decoding) {
+      li->outputs[1] = create_tensor_legion_ordering(
+          numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
+    }
+  }
+  li->add_int_property("k", k);
+  li->add_int_property("sorted", sorted);
+  li->add_int_property("speculative_decoding", speculative_decoding);
+  layers.push_back(li);
+  // outputs[0] = li->outputs[0];
+  // outputs[1] = li->outputs[1];
+  return li->outputs[0];
+}
+
+Op *ArgTopK::create_operator_from_layer(
+    FFModel &model,
+    Layer const *layer,
+    std::vector<ParallelTensor> const &inputs) {
+  long long value;
+  layer->get_int_property("k", value);
+  int k = value;
+  layer->get_int_property("sorted", value);
+  bool sorted = (bool)value;
+  layer->get_int_property("speculative_decoding", value);
+  bool speculative_decoding = (bool)value;
+
+  return new ArgTopK(model,
+                     layer->layer_guid,
+                     inputs[0],
+                     k,
+                     sorted,
+                     speculative_decoding,
+                     layer->name);
+}
+
+ArgTopKParams ArgTopK::get_params() const {
+  ArgTopKParams params;
+  params.k = this->k;
+  params.sorted = this->sorted;
+  params.speculative_decoding = this->speculative_decoding;
+  if (this->name != nullptr) {
+    strcpy(params.name, this->name);
+  }
+  return params;
+}
+
+bool ArgTopKParams::is_valid(ParallelTensorShape const &) const {
+  // topk is always valid
+  return true;
+}
+
+bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) {
+  return lhs.k == rhs.k && lhs.sorted == rhs.sorted &&
+         lhs.speculative_decoding == rhs.speculative_decoding;
+}
+
+ArgTopK::ArgTopK(FFModel &model,
+                 LayerID const &_layer_guid,
+                 ParallelTensor const _input,
+                 int _k,
+                 bool _sorted,
+                 bool _speculative_decoding,
+                 char const *name)
+    : Op(model,
+         OP_ARG_TOPK,
+         _input->data_type,
+         name,
+         1 /*inputs*/,
+         0 /*weights*/,
+         _speculative_decoding ? 2 : 1 /*outputs*/,
+         _input),
+      k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) {
+  // overwrite layer_guid
+  layer_guid = _layer_guid;
+  int numdim = inputs[0]->num_dims;
+  ParallelDim dims[MAX_TENSOR_DIM];
+  for (int i = 0; i < numdim; i++) {
+    dims[i] = inputs[0]->dims[i];
+  }
+
+  dims[0].size = k;
+  assert(inputs[0]->dims[0].degree == 1);
+  assert(inputs[0]->dims[0].parallel_idx == -1);
+
+  outputs[0] = model.create_parallel_tensor_legion_ordering(
+      numdim, dims, DT_INT32, this, 0 /*owner_idx*/);
+  if (_speculative_decoding) {
+    outputs[1] = model.create_parallel_tensor_legion_ordering(
+        numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
+  }
+}
+
+ArgTopK::ArgTopK(FFModel &model,
+                 LayerID const &layer_guid,
+                 ArgTopK const &other,
+                 ParallelTensor const input)
+    : ArgTopK(model,
+              layer_guid,
+              input,
+              other.k,
+              other.sorted,
+              other.speculative_decoding,
+              other.name) {}
+
+ArgTopK::ArgTopK(FFModel &model,
+                 ArgTopKParams const &params,
+                 ParallelTensor const input,
+                 char const *name)
+    : ArgTopK(model,
+              params.layer_guid,
+              input,
+              params.k,
+              params.sorted,
+              params.speculative_decoding,
+              params.name) {}
+
+void ArgTopK::init_inference(FFModel const &ff,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
+  assert(check_output_input_weight_same_parallel_is());
+  parallel_is = batch_outputs[0]->parallel_is;
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
+  size_t machine_view_hash = view->hash();
+  set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]);
+  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+                         parallel_is,
+                         TaskArgument(this, sizeof(ArgTopK)),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         machine_view_hash);
+  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    READ_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_inputs[0]->region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_outputs[0]->region));
+  launcher.add_field(1, FID_DATA);
+  //   launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
+  //                                                     0 /*projection id*/,
+  //                                                     WRITE_ONLY,
+  //                                                     EXCLUSIVE,
+  //                                                     batch_outputs[1]->region));
+  //   launcher.add_field(2, FID_DATA);
+  FutureMap fm = runtime->execute_index_space(ctx, launcher);
+  fm.wait_all_results();
+  set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
+}
+
+void ArgTopK::init(FFModel const &ff) {
+  assert(check_output_input_weight_same_parallel_is());
+  parallel_is = outputs[0]->parallel_is;
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  set_argumentmap_for_init(ff, argmap);
+  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
+                         parallel_is,
+                         TaskArgument(this, sizeof(ArgTopK)),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         outputs[0]->machine_view.hash());
+  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    READ_ONLY,
+                                                    EXCLUSIVE,
+                                                    inputs[0]->region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    outputs[0]->region));
+  launcher.add_field(1, FID_DATA);
+  //   launcher.add_region_requirement(RegionRequirement(outputs[1]->part,
+  //                                                     0 /*projection id*/,
+  //                                                     WRITE_ONLY,
+  //                                                     EXCLUSIVE,
+  //                                                     outputs[1]->region));
+  //   launcher.add_field(2, FID_DATA);
+  FutureMap fm = runtime->execute_index_space(ctx, launcher);
+  fm.wait_all_results();
+  set_opmeta_from_futuremap(ff, fm);
+}
+
+OpMeta *ArgTopK::init_task(Task const *task,
+                           std::vector<PhysicalRegion> const &regions,
+                           Context ctx,
+                           Runtime *runtime) {
+  ArgTopK *topk = (ArgTopK *)task->args;
+  FFHandler handle = *((FFHandler *)task->local_args);
+  ArgTopKMeta *m = new ArgTopKMeta(handle, topk);
+  m->profiling = topk->profiling;
+  m->inference_debugging = topk->inference_debugging;
+  m->sorted = topk->sorted;
+  m->k = topk->k;
+  std::strcpy(m->op_name, topk->name);
+  m->layer_guid = topk->layer_guid;
+  m->speculative_decoding = topk->speculative_decoding;
+  return m;
+}
+
+void ArgTopK::forward(FFModel const &ff) {
+  // ArgTopK does not support forward
+  assert(false);
+}
+
+FutureMap ArgTopK::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  parallel_is = batch_outputs[0]->parallel_is;
+  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
+  set_argumentmap_for_inference(ff, argmap, batch_outputs[0]);
+  size_t machine_view_hash = view->hash();
+  /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv
+            << std::endl; */
+  if (speculative_decoding) {
+    IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+                           parallel_is,
+                           TaskArgument(nullptr, 0),
+                           argmap,
+                           Predicate::TRUE_PRED,
+                           false /*must*/,
+                           0 /*mapper_id*/,
+                           machine_view_hash);
+    launcher.add_future(bc);
+    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                      0 /*projection id*/,
+                                                      READ_ONLY,
+                                                      EXCLUSIVE,
+                                                      batch_inputs[0]->region));
+    launcher.add_field(0, FID_DATA);
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[0]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[0]->region));
+    launcher.add_field(1, FID_DATA);
+
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[1]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[1]->region));
+    launcher.add_field(2, FID_DATA);
+    return runtime->execute_index_space(ctx, launcher);
+
+  } else {
+    IndexLauncher launcher(ARG_TOPK_INF_TASK_ID,
+                           parallel_is,
+                           TaskArgument(nullptr, 0),
+                           argmap,
+                           Predicate::TRUE_PRED,
+                           false /*must*/,
+                           0 /*mapper_id*/,
+                           machine_view_hash);
+    launcher.add_future(bc);
+    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                      0 /*projection id*/,
+                                                      READ_ONLY,
+                                                      EXCLUSIVE,
+                                                      batch_inputs[0]->region));
+    launcher.add_field(0, FID_DATA);
+    launcher.add_region_requirement(
+        RegionRequirement(batch_outputs[0]->part,
+                          0 /*projection id*/,
+                          WRITE_ONLY,
+                          EXCLUSIVE,
+                          batch_outputs[0]->region));
+    launcher.add_field(1, FID_DATA);
+    return runtime->execute_index_space(ctx, launcher);
+  }
+}
+
+InferenceResult
+    ArgTopK::inference_task(Task const *task,
+                            std::vector<PhysicalRegion> const &regions,
+                            Context ctx,
+                            Runtime *runtime) {
+  assert(regions.size() == 2);
+  assert(task->regions.size() == 2);
+  // const ArgTopK* topk = (const ArgTopK*) task->args;
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  if (bc->num_tokens == 0) {
+    // Directly return for empty batch config
+    InferenceResult ir;
+    return ir;
+  }
+  ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
+
+  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW probs;
+
+  int batch_size = bc->num_active_tokens();
+  ArgTopK::forward_kernel_wrapper(
+      m, input, probs, indices, batch_size, nullptr);
+
+  if (m->inference_debugging) {
+    assert(task->index_point.get_dim() == 1);
+    int shard_id = task->index_point.point_data[0];
+    ArgTopK::save_inference_tensors_to_file(
+        m, shard_id, bc, {input}, {}, {indices});
+  }
+
+  InferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size);
+  return ir;
+}
+
+InferenceResult ArgTopK::inference_speculative_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  assert(regions.size() == 3);
+  assert(task->regions.size() == 3);
+  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  if (bc.num_active_tokens() == 0) {
+    // Directly return for empty batch config
+    InferenceResult ir;
+    return ir;
+  }
+  ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
+
+  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
+      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO(
+      DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
+
+  int batch_size = bc.num_active_tokens();
+  ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
+
+  InferenceResult ir;
+  download_tensor<BatchConfig::TokenId>(
+      indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
+  download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
+  return ir;
+}
+
+void ArgTopK::backward(FFModel const &ff) {
+  // ArgTopK does not support backward
+  assert(false);
+}
+
+void ArgTopK::serialize(Legion::Serializer &sez) const {
+  sez.serialize(this->layer_guid.id);
+  sez.serialize(this->layer_guid.transformer_layer_id);
+  sez.serialize(this->layer_guid.model_id);
+  sez.serialize(this->k);
+  sez.serialize(this->sorted);
+  sez.serialize(this->speculative_decoding);
+  sez.serialize(strlen(this->name));
+  sez.serialize(this->name, strlen(this->name));
+}
+
+Node ArgTopK::deserialize(FFModel &ff,
+                          Legion::Deserializer &dez,
+                          ParallelTensor inputs[],
+                          int num_inputs) {
+  assert(num_inputs == 1);
+  size_t id, transformer_layer_id, deserialized_model_id;
+  dez.deserialize(id);
+  dez.deserialize(transformer_layer_id);
+  dez.deserialize(deserialized_model_id);
+  LayerID layer_guid(id, transformer_layer_id, deserialized_model_id);
+  int k;
+  bool sorted;
+  bool speculative_decoding;
+  dez.deserialize(k);
+  dez.deserialize(sorted);
+  dez.deserialize(speculative_decoding);
+  size_t name_len;
+  char name[MAX_OPNAME] = {0};
+  dez.deserialize(name_len);
+  dez.deserialize(name, name_len);
+  ArgTopKParams params;
+  params.layer_guid = layer_guid;
+  params.k = k;
+  params.sorted = sorted;
+  params.speculative_decoding = speculative_decoding;
+  strcpy(params.name, name);
+  return ff.get_or_create_node<ArgTopK>(inputs[0], params);
+}
+
+Op *ArgTopK::materialize(FFModel &ff,
+                         ParallelTensor inputs[],
+                         int num_inputs) const {
+  ArgTopKParams params = get_params();
+  return new ArgTopK(ff, params, inputs[0], this->name);
+}
+
+bool ArgTopK::measure_operator_cost(Simulator *sim,
+                                    MachineView const &mv,
+                                    CostMetrics &cost_metrics) const {
+  return false;
+}
+
+}; // namespace FlexFlow
+
+namespace std {
+size_t hash<FlexFlow::ArgTopKParams>::operator()(
+    FlexFlow::ArgTopKParams const &params) const {
+  size_t key = 0;
+  hash_combine(key, params.layer_guid.id);
+  hash_combine(key, params.k);
+  hash_combine(key, params.sorted);
+  hash_combine(key, params.speculative_decoding);
+  return key;
+}
+}; // namespace std
diff --git a/src/ops/arg_topk.cu.backup b/src/ops/arg_topk.cu.backup
new file mode 100644
index 000000000..491b255be
--- /dev/null
+++ b/src/ops/arg_topk.cu.backup
@@ -0,0 +1,525 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/arg_topk.h"
+#include "flexflow/utils/cuda_helper.h"
+
+namespace FlexFlow {
+// declare Legion names
+using Legion::coord_t;
+
+enum class HeapType { kMinHeap, kMaxHeap };
+enum class PreferIndices { kLower, kHigher };
+
+template <typename T>
+struct Entry {
+  int index;
+  T value;
+};
+
+template <typename T>
+struct LinearData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index];
+  }
+
+  __device__ int get_index(int i) const {
+    return data[i].index;
+  }
+  __device__ T get_value(int i) const {
+    return data[i].value;
+  }
+
+  Entry *const data;
+};
+
+template <typename T>
+struct IndirectLinearData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index];
+  }
+
+  __device__ int get_index(int i) const {
+    return backing_data[data[i].index].index;
+  }
+  __device__ T get_value(int i) const {
+    return data[i].value;
+  }
+
+  Entry *const data;
+  Entry *const backing_data;
+};
+
+template <typename T>
+struct StridedData {
+  typedef Entry<T> Entry;
+
+  __device__ Entry &operator[](std::size_t index) const {
+    return data[index * blockDim.x + threadIdx.x];
+  }
+
+  __device__ int get_index(int i) const {
+    return (*this)[i].index;
+  }
+  __device__ T get_value(int i) const {
+    return (*this)[i].value;
+  }
+
+  Entry *const data;
+};
+
+// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
+template <HeapType heapType,
+          PreferIndices preferIndices,
+          template <typename>
+          class Data,
+          typename T>
+struct IndexedHeap {
+  typedef typename Data<T>::Entry Entry;
+  Data<T> const data;
+  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
+
+  __device__ bool is_above(int left, int right) {
+    T left_value = data.get_value(left);
+    T right_value = data.get_value(right);
+    if (left_value == right_value) {
+      if (preferIndices == PreferIndices::kLower) {
+        return data.get_index(left) < data.get_index(right);
+      } else {
+        return data.get_index(left) > data.get_index(right);
+      }
+    }
+    if (heapType == HeapType::kMinHeap) {
+      return left_value < right_value;
+    } else {
+      return left_value > right_value;
+    }
+  }
+
+  __device__ void assign(int i, Entry const &entry) {
+    data[i] = entry;
+  }
+
+  __device__ void push_up(int i) {
+    int child = i;
+    int parent;
+    for (; child > 0; child = parent) {
+      parent = (child - 1) / 2;
+      if (!is_above(child, parent)) {
+        // Heap property satisfied.
+        break;
+      }
+      swap(child, parent);
+    }
+  }
+
+  __device__ void swap(int a, int b) {
+    auto tmp = data[b];
+    data[b] = data[a];
+    data[a] = tmp;
+  }
+
+  __device__ void push_root_down(int k) {
+    push_down(0, k);
+  }
+
+  // MAX-HEAPIFY in Cormen
+  __device__ void push_down(int node, int k) {
+    while (true) {
+      int const left = 2 * node + 1;
+      int const right = left + 1;
+      int smallest = node;
+      if (left < k && is_above(left, smallest)) {
+        smallest = left;
+      }
+      if (right < k && is_above(right, smallest)) {
+        smallest = right;
+      }
+      if (smallest == node) {
+        break;
+      }
+      swap(smallest, node);
+      node = smallest;
+    }
+  }
+
+  // BUILD-MAX-HEAPIFY in Cormen
+  __device__ void build(int k) {
+    for (int node = (k - 1) / 2; node >= 0; node--) {
+      push_down(node, k);
+    }
+  }
+
+  // HEAP-EXTRACT-MAX in Cormen
+  __device__ void remove_root(int k) {
+    data[0] = data[k - 1];
+    push_root_down(k - 1);
+  }
+
+  // in-place HEAPSORT in Cormen
+  // This method destroys the heap property.
+  __device__ void sort(int k) {
+    for (int slot = k - 1; slot > 0; slot--) {
+      // This is like remove_root but we insert the element at the end.
+      swap(slot, 0);
+      // Heap is now an element smaller.
+      push_root_down(/*k=*/slot);
+    }
+  }
+
+  __device__ void replace_root(Entry const &entry, int k) {
+    data[0] = entry;
+    push_root_down(k);
+  }
+
+  __device__ Entry const &root() {
+    return data[0];
+  }
+};
+
+template <HeapType heapType,
+          PreferIndices preferIndices,
+          template <typename>
+          class Data,
+          typename T>
+__device__ IndexedHeap<heapType, preferIndices, Data, T>
+    make_indexed_heap(typename Data<T>::Entry *data) {
+  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
+}
+
+// heapArgTopK walks over [input, input+length) with `step_size` stride starting
+// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
+// using `Accessor` to access elements in `heap_entries`. If sorted=true, the
+// elements will be sorted at the end.
+template <typename T, template <typename> class Data = LinearData>
+__device__ void heapArgTopK(T const *__restrict__ input,
+                            int length,
+                            int k,
+                            Entry<T> *__restrict__ heap_entries,
+                            bool sorted = false,
+                            int start_index = 0,
+                            int step_size = 1) {
+  assert(k <= length);
+
+  auto heap =
+      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
+          heap_entries);
+
+  int heap_end_index = start_index + k * step_size;
+  if (heap_end_index > length) {
+    heap_end_index = length;
+  }
+  // Initialize the min-heap.
+  for (int index = start_index, slot = 0; index < heap_end_index;
+       index += step_size, slot++) {
+    heap.assign(slot, {index, input[index]});
+  }
+
+  heap.build(k);
+
+  // Now iterate over the remaining items.
+  // If an item is smaller than the min element, it is not amongst the top k.
+  // Otherwise, replace the min element with it and push upwards.
+  for (int index = heap_end_index; index < length; index += step_size) {
+    // We prefer elements with lower indices. This is given here.
+    // Later elements automatically have higher indices, so can be discarded.
+    if (input[index] > heap.root().value) {
+      // This element should replace the min.
+      heap.replace_root({index, input[index]}, k);
+    }
+  }
+
+  // Sort if wanted.
+  if (sorted) {
+    heap.sort(k);
+  }
+}
+
+// mergeShards performs a top-k merge on `num_shards` many sorted streams that
+// are sorted and stored in `entries` in a strided way:
+// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
+// The overall top k elements are written to `top_k_values` and their indices
+// to top_k_indices.
+// `top_k_heap` is used as temporary storage for the merge heap.
+template <typename T>
+__device__ void mergeShards(int num_shards,
+                            int k,
+                            Entry<T> *__restrict__ entries,
+                            Entry<T> *__restrict__ top_k_heap,
+                            float *top_k_values,
+                            int *top_k_indices,
+                            bool speculative_decoding) {
+  // If k < num_shards, we can use a min-heap with k elements to get the top k
+  // of the sorted blocks.
+  // If k > num_shards, we can initialize a min-heap with the top element from
+  // each sorted block.
+  int const heap_size = k < num_shards ? k : num_shards;
+
+  // Min-heap part.
+  {
+    auto min_heap = IndexedHeap<HeapType::kMinHeap,
+                                PreferIndices::kHigher,
+                                IndirectLinearData,
+                                T>{IndirectLinearData<T>{top_k_heap, entries}};
+    // Initialize the heap as a min-heap.
+    for (int slot = 0; slot < heap_size; slot++) {
+      min_heap.assign(slot, {slot, entries[slot].value});
+    }
+    min_heap.build(heap_size);
+
+    // Now perform top k with the remaining shards (if num_shards > heap_size).
+    for (int shard = heap_size; shard < num_shards; shard++) {
+      auto const entry = entries[shard];
+      auto const root = min_heap.root();
+      if (entry.value < root.value) {
+        continue;
+      }
+      if (entry.value == root.value &&
+          entry.index > entries[root.index].index) {
+        continue;
+      }
+      // This element should replace the min.
+      min_heap.replace_root({shard, entry.value}, heap_size);
+    }
+  }
+
+  // Max-part.
+  {
+    // Turn the min-heap into a max-heap in-place.
+    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
+                                PreferIndices::kLower,
+                                IndirectLinearData,
+                                T>{IndirectLinearData<T>{top_k_heap, entries}};
+    // Heapify into a max heap.
+    max_heap.build(heap_size);
+
+    // Now extract the minimum k-1 times.
+    // k is treated specially.
+    int const last_k = k - 1;
+    for (int rank = 0; rank < last_k; rank++) {
+      Entry<T> const &max_element = max_heap.root();
+      if (speculative_decoding) {
+        assert(top_k_values != nullptr);
+        top_k_values[rank] = static_cast<float>(max_element.value);
+      }
+
+      int shard_index = max_element.index;
+      top_k_indices[rank] = entries[shard_index].index;
+      int next_shard_index = shard_index + num_shards;
+      // For rank < k-1, each top k heap still contains at least 1 element,
+      // so we can draw a replacement.
+      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
+                            heap_size);
+    }
+
+    // rank == last_k.
+    Entry<T> const &max_element = max_heap.root();
+    // top_k_values[last_k] = max_element.value;
+    int shard_index = max_element.index;
+    top_k_indices[last_k] = entries[shard_index].index;
+    top_k_values[last_k] = static_cast<float>(max_element.value);
+  }
+}
+
+template <typename T>
+__global__ void arg_topk_forward_kernel(T const *__restrict__ input,
+                                        size_t shared_memory_size,
+                                        int length,
+                                        int k,
+                                        bool sorted,
+                                        float *__restrict__ output,
+                                        int *__restrict__ indices,
+                                        bool speculative_decoding) {
+  __shared__ char shared_memory[48 << 10];
+  int const batch_index = blockIdx.x;
+  T const *batch_input = input + batch_index * length;
+  int const thread_index = threadIdx.x;
+  int const thread_count = blockDim.x;
+  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
+  heapArgTopK<T, StridedData>(
+      batch_input, length, k, shared_entries, true, thread_index, thread_count);
+  __syncthreads();
+  if (thread_index == 0) {
+    int const offset = batch_index * k;
+    auto batch_output = output + offset;
+    auto batch_indices = indices + offset;
+    Entry<T> *top_k_heap = shared_entries + thread_count * k;
+    mergeShards(thread_count,
+                k,
+                shared_entries,
+                top_k_heap,
+                batch_output,
+                batch_indices,
+                speculative_decoding);
+  }
+}
+
+/*static*/
+template <typename DT>
+void ArgTopK::forward_kernel(
+    ArgTopKMeta const *m,
+    DT const *input_ptr,
+    float *output_ptr,
+    int *indices_ptr,
+    size_t batch_size,
+    int length,
+    int k,
+    bool sorted,
+    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
+    cudaStream_t stream) {
+  // Adopted from TensorFlow's ArgTopK implementation
+  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
+  int num_shards = 0;
+  {
+    constexpr auto shared_memory_size = 48 << 10;
+    auto const heap_size = k * sizeof(Entry<DT>);
+    // shared_memory_size = (num_shards + 1) * heap_size <=>
+    num_shards = shared_memory_size / heap_size - 1;
+    assert(num_shards > 0);
+    if (num_shards > CUDA_NUM_THREADS) {
+      num_shards = CUDA_NUM_THREADS;
+    }
+  }
+  // We are limited by the amount of shared memory we have per block.
+  size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
+  // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
+  size_t num_blocks = batch_size;
+
+  // all requests share the same number of branches
+  if (m->speculative_decoding) {
+    assert(bc->num_active_requests() >= 0);
+    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+    num_shards = k;
+    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        input_ptr,
+        shared_memory_size,
+        length,
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
+        sorted,
+        output_ptr,
+        indices_ptr,
+        m->speculative_decoding);
+  } else {
+
+    assert(num_shards >= (size_t)k);
+    num_shards = k;
+    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+        input_ptr,
+        shared_memory_size,
+        length,
+        k,
+        sorted,
+        nullptr,
+        indices_ptr,
+        false);
+  }
+}
+
+/*static*/
+void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
+                                     GenericTensorAccessorR const &input,
+                                     // float *output_ptr,
+                                     GenericTensorAccessorW const &probs,
+                                     GenericTensorAccessorW const &indices,
+                                     int batch_size,
+                                     BatchConfig const *bc) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+
+  // Domain in1_domain = runtime->get_index_space_domain(
+  //     ctx, task->regions[0].region.get_index_space());
+  //   Domain out1_domain = runtime->get_index_space_domain(
+  //       ctx, task->regions[1].region.get_index_space());
+  // Domain out2_domain = runtime->get_index_space_domain(
+  //     ctx, task->regions[1].region.get_index_space());
+  int numdims = input.domain.get_dim();
+  assert(indices.domain.get_dim() == numdims);
+
+  int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+  // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1;
+  int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1;
+
+  // assert(out1_domain == out2_domain);
+  for (int i = 1; i < input.domain.get_dim(); i++) {
+    assert(input.domain.lo()[i] == indices.domain.lo()[i]);
+    assert(input.domain.hi()[i] == indices.domain.hi()[i]);
+  }
+  // float const *in_ptr = helperGetTensorPointerRO<float>(
+  //     regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  //   float *value_ptr = helperGetTensorPointerWO<float>(
+  //       regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  // int *index_ptr = helperGetTensorPointerWO<int>(
+  //    regions[1], task->regions[1], FID_DATA, ctx, runtime);
+
+  int length = input.domain.hi()[0] - input.domain.lo()[0] + 1;
+  int k = indices.domain.hi()[0] - indices.domain.lo()[0] +
+          1; /*TODO: This prints to 5*/
+
+  // batch_size = input.domain.get_volume() / length;
+  // assert(indices.domain.get_volume() / k == batch_size);
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+
+  if (input.data_type == DT_HALF) {
+    ArgTopK::forward_kernel(m,
+                            input.get_half_ptr(),
+                            m->speculative_decoding ? probs.get_float_ptr()
+                                                    : nullptr,
+                            indices.get_int32_ptr(),
+                            batch_size,
+                            length,
+                            k,
+                            m->sorted,
+                            m->speculative_decoding ? bc : nullptr,
+                            stream);
+  } else if (input.data_type == DT_FLOAT) {
+    ArgTopK::forward_kernel(m,
+                            input.get_float_ptr(),
+                            m->speculative_decoding ? probs.get_float_ptr()
+                                                    : nullptr,
+                            indices.get_int32_ptr(),
+                            batch_size,
+                            length,
+                            k,
+                            m->sorted,
+                            m->speculative_decoding ? bc : nullptr,
+                            stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    printf("[ArgTopK] forward time = %.2lfms\n", elapsed);
+  }
+}
+
+ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op)
+    : OpMeta(handler, op) {}
+
+}; // namespace FlexFlow

From f31fd4ee616498c89fc3bd9ed5f52ca4f97f426e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 00:26:08 -0700
Subject: [PATCH 270/667] feat: add submodule raft

---
 .gitmodules | 6 +++++-
 deps/raft   | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 160000 deps/raft

diff --git a/.gitmodules b/.gitmodules
index c68582d4a..a341e9396 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -22,4 +22,8 @@
 [submodule "deps/tokenizers-cpp"]
 	path = deps/tokenizers-cpp
 	url = https://github.com/mlc-ai/tokenizers-cpp.git
-	fetchRecurseSubmodules = true
\ No newline at end of file
+	fetchRecurseSubmodules = true
+[submodule "deps/raft"]
+	path = deps/raft
+	url = git@github.com:rapidsai/raft.git
+	branch = branch-23.04
diff --git a/deps/raft b/deps/raft
new file mode 160000
index 000000000..994e6c8b5
--- /dev/null
+++ b/deps/raft
@@ -0,0 +1 @@
+Subproject commit 994e6c8b504289df72050cb9f3cadc8a452df7f3

From 59d864d4f4654dd4f4b028eabe691eb396a007ea Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 03:31:29 -0700
Subject: [PATCH 271/667] feat: add submodules rmm, spdlog

---
 .gitmodules | 6 ++++++
 deps/rmm    | 1 +
 deps/spdlog | 1 +
 3 files changed, 8 insertions(+)
 create mode 160000 deps/rmm
 create mode 160000 deps/spdlog

diff --git a/.gitmodules b/.gitmodules
index a341e9396..7b5ed2295 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -27,3 +27,9 @@
 	path = deps/raft
 	url = git@github.com:rapidsai/raft.git
 	branch = branch-23.04
+[submodule "deps/spdlog"]
+	path = deps/spdlog
+	url = git@github.com:gabime/spdlog.git
+[submodule "deps/rmm"]
+	path = deps/rmm
+	url = git@github.com:rapidsai/rmm.git
diff --git a/deps/rmm b/deps/rmm
new file mode 160000
index 000000000..dc1e17a03
--- /dev/null
+++ b/deps/rmm
@@ -0,0 +1 @@
+Subproject commit dc1e17a03ed2dbc9329ccecc27922e414250f45a
diff --git a/deps/spdlog b/deps/spdlog
new file mode 160000
index 000000000..c3aed4b68
--- /dev/null
+++ b/deps/spdlog
@@ -0,0 +1 @@
+Subproject commit c3aed4b68373955e1cc94307683d44dca1515d2b

From a9e40ff3108af9063eede616fa0b3b7dbff1f570 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 03:33:15 -0700
Subject: [PATCH 272/667] chore: minor

---
 .gitmodules | 4 ----
 deps/raft   | 1 -
 2 files changed, 5 deletions(-)
 delete mode 160000 deps/raft

diff --git a/.gitmodules b/.gitmodules
index 7b5ed2295..2fde8be2e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,10 +23,6 @@
 	path = deps/tokenizers-cpp
 	url = https://github.com/mlc-ai/tokenizers-cpp.git
 	fetchRecurseSubmodules = true
-[submodule "deps/raft"]
-	path = deps/raft
-	url = git@github.com:rapidsai/raft.git
-	branch = branch-23.04
 [submodule "deps/spdlog"]
 	path = deps/spdlog
 	url = git@github.com:gabime/spdlog.git
diff --git a/deps/raft b/deps/raft
deleted file mode 160000
index 994e6c8b5..000000000
--- a/deps/raft
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 994e6c8b504289df72050cb9f3cadc8a452df7f3

From b73c9b52b5220d7bb4e3c5c484b3d9e1f5bd3d07 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 03:35:24 -0700
Subject: [PATCH 273/667] feat: submodules

---
 .gitmodules | 3 +++
 deps/raft   | 1 +
 deps/rmm    | 2 +-
 deps/spdlog | 2 +-
 4 files changed, 6 insertions(+), 2 deletions(-)
 create mode 160000 deps/raft

diff --git a/.gitmodules b/.gitmodules
index 2fde8be2e..48cb47c2e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -29,3 +29,6 @@
 [submodule "deps/rmm"]
 	path = deps/rmm
 	url = git@github.com:rapidsai/rmm.git
+[submodule "deps/raft"]
+	path = deps/raft
+	url = git@github.com:rapidsai/raft.git
diff --git a/deps/raft b/deps/raft
new file mode 160000
index 000000000..7d1057e77
--- /dev/null
+++ b/deps/raft
@@ -0,0 +1 @@
+Subproject commit 7d1057e77c71c0cb9d28043e3f1db036995ffe56
diff --git a/deps/rmm b/deps/rmm
index dc1e17a03..6797909d5 160000
--- a/deps/rmm
+++ b/deps/rmm
@@ -1 +1 @@
-Subproject commit dc1e17a03ed2dbc9329ccecc27922e414250f45a
+Subproject commit 6797909d5304be6ee56c09c0156252e19f712639
diff --git a/deps/spdlog b/deps/spdlog
index c3aed4b68..100f30043 160000
--- a/deps/spdlog
+++ b/deps/spdlog
@@ -1 +1 @@
-Subproject commit c3aed4b68373955e1cc94307683d44dca1515d2b
+Subproject commit 100f30043f33277122e0991c83845a2617172ffd

From 41836d674524b09e77fb19f5e5d9ad6dbe184901 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 05:15:41 -0700
Subject: [PATCH 274/667] feat: add radix_topk

---
 .gitmodules         |  1 +
 CMakeLists.txt      |  9 +++++++++
 FlexFlow.mk         |  6 ++++--
 config/config.linux |  2 ++
 config/raft.patch   | 11 +++++++++++
 5 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 config/raft.patch

diff --git a/.gitmodules b/.gitmodules
index 48cb47c2e..afa9aa9f3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -32,3 +32,4 @@
 [submodule "deps/raft"]
 	path = deps/raft
 	url = git@github.com:rapidsai/raft.git
+	ignore = untracked
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43ce4f704..d90465b4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,9 @@ list(APPEND CC_FLAGS
 list(APPEND NVCC_FLAGS
   -std=c++17)
 
+list(APPEND NVCC_FLAGS
+  --expt-relaxed-constexpr
+  --extended-lambda)
 
 add_compile_options(${CC_FLAGS})
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
@@ -201,6 +204,12 @@ if(NOT BUILD_LEGION_ONLY)
   # optional
   include(optional)
 
+  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/include)
+
+  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/rmm/include)
+
+  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/spdlog/include)
+
   if (FF_GPU_BACKEND STREQUAL "cuda")
     list(APPEND FF_CC_FLAGS
       -DFF_USE_CUDA)
diff --git a/FlexFlow.mk b/FlexFlow.mk
index 14f32a763..cf92d8270 100644
--- a/FlexFlow.mk
+++ b/FlexFlow.mk
@@ -95,9 +95,11 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
 endif
 
 
-INC_FLAGS	+= -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
+INC_FLAGS	+= -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src \
+				-I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include
 CC_FLAGS	+= -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
-NVCC_FLAGS	+= -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
+NVCC_FLAGS	+= -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 \
+			    --expt-relaxed-constexpr --extended-lambda
 HIPCC_FLAGS     += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
 GASNET_FLAGS	+=
 # For Point and Rect typedefs
diff --git a/config/config.linux b/config/config.linux
index acffc210f..13cbefbc9 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,6 +111,8 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
+patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh raft.patch
+
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc
     # Passing CMAKE_FLAGS or CUDA_PATH as $1 will print the value of the CMAKE_FLAGS/CUDA_PATH variable, 
diff --git a/config/raft.patch b/config/raft.patch
new file mode 100644
index 000000000..e587a590c
--- /dev/null
+++ b/config/raft.patch
@@ -0,0 +1,11 @@
+--- raft/cpp/include/raft/matrix/detail/select_radix.cuh	2023-04-12 07:29:14.000000000 -0700
++++ raft/cpp/include/raft/matrix/detail/select_radix_update.cuh	2023-04-20 19:06:53.323031000 -0700
+@@ -110,7 +110,7 @@
+   // When writing is not skipped, read `in_buf`(T) and `in_idx_buf`(IdxT), and write `out_buf`(T)
+   // and `out_idx_buf`(IdxT).
+   // The ratio between these cases determines whether to skip writing and hence the buffer size.
+-  constexpr float ratio = 2 + sizeof(IdxT) * 2.0 / sizeof(T);
++  constexpr float ratio = 128;
+   return len / ratio;
+ }
+ 

From 952a78288f47e61b60cbe51d9414275fca6f39e0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 05:19:01 -0700
Subject: [PATCH 275/667] chore: minor

---
 .gitmodules         | 2 +-
 config/config.linux | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index afa9aa9f3..115712b36 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -32,4 +32,4 @@
 [submodule "deps/raft"]
 	path = deps/raft
 	url = git@github.com:rapidsai/raft.git
-	ignore = untracked
+	ignore = dirty
diff --git a/config/config.linux b/config/config.linux
index 13cbefbc9..b2c1e2703 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,7 +111,7 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
-patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh raft.patch
+patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/raft.patch
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc

From 75fe5d8a3566816bd87ed01c64152abbaae2caa9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 05:51:11 -0700
Subject: [PATCH 276/667] feat: add arg_topk

---
 src/ops/arg_topk.cc |   1 +
 src/ops/arg_topk.cu | 467 +++++++-------------------------------------
 2 files changed, 74 insertions(+), 394 deletions(-)

diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 706fbbc7a..669284573 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -366,6 +366,7 @@ FutureMap ArgTopK::inference(
   }
 }
 
+// just output the indices
 InferenceResult
     ArgTopK::inference_task(Task const *task,
                             std::vector<PhysicalRegion> const &regions,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 491b255be..247c447ba 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -15,359 +15,58 @@
 
 #include "flexflow/ops/arg_topk.h"
 #include "flexflow/utils/cuda_helper.h"
+#include "raft/matrix/detail/select_radix.cuh"
 
 namespace FlexFlow {
 // declare Legion names
 using Legion::coord_t;
 
-enum class HeapType { kMinHeap, kMaxHeap };
-enum class PreferIndices { kLower, kHigher };
-
-template <typename T>
-struct Entry {
-  int index;
-  T value;
-};
-
-template <typename T>
-struct LinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return data[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-};
-
-template <typename T>
-struct IndirectLinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return backing_data[data[i].index].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-  Entry *const backing_data;
-};
-
-template <typename T>
-struct StridedData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index * blockDim.x + threadIdx.x];
-  }
-
-  __device__ int get_index(int i) const {
-    return (*this)[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return (*this)[i].value;
-  }
-
-  Entry *const data;
-};
-
-// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-struct IndexedHeap {
-  typedef typename Data<T>::Entry Entry;
-  Data<T> const data;
-  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
-
-  __device__ bool is_above(int left, int right) {
-    T left_value = data.get_value(left);
-    T right_value = data.get_value(right);
-    if (left_value == right_value) {
-      if (preferIndices == PreferIndices::kLower) {
-        return data.get_index(left) < data.get_index(right);
-      } else {
-        return data.get_index(left) > data.get_index(right);
-      }
-    }
-    if (heapType == HeapType::kMinHeap) {
-      return left_value < right_value;
-    } else {
-      return left_value > right_value;
-    }
-  }
-
-  __device__ void assign(int i, Entry const &entry) {
-    data[i] = entry;
-  }
-
-  __device__ void push_up(int i) {
-    int child = i;
-    int parent;
-    for (; child > 0; child = parent) {
-      parent = (child - 1) / 2;
-      if (!is_above(child, parent)) {
-        // Heap property satisfied.
-        break;
-      }
-      swap(child, parent);
-    }
-  }
-
-  __device__ void swap(int a, int b) {
-    auto tmp = data[b];
-    data[b] = data[a];
-    data[a] = tmp;
-  }
-
-  __device__ void push_root_down(int k) {
-    push_down(0, k);
-  }
-
-  // MAX-HEAPIFY in Cormen
-  __device__ void push_down(int node, int k) {
-    while (true) {
-      int const left = 2 * node + 1;
-      int const right = left + 1;
-      int smallest = node;
-      if (left < k && is_above(left, smallest)) {
-        smallest = left;
-      }
-      if (right < k && is_above(right, smallest)) {
-        smallest = right;
-      }
-      if (smallest == node) {
-        break;
-      }
-      swap(smallest, node);
-      node = smallest;
-    }
-  }
-
-  // BUILD-MAX-HEAPIFY in Cormen
-  __device__ void build(int k) {
-    for (int node = (k - 1) / 2; node >= 0; node--) {
-      push_down(node, k);
-    }
-  }
-
-  // HEAP-EXTRACT-MAX in Cormen
-  __device__ void remove_root(int k) {
-    data[0] = data[k - 1];
-    push_root_down(k - 1);
-  }
-
-  // in-place HEAPSORT in Cormen
-  // This method destroys the heap property.
-  __device__ void sort(int k) {
-    for (int slot = k - 1; slot > 0; slot--) {
-      // This is like remove_root but we insert the element at the end.
-      swap(slot, 0);
-      // Heap is now an element smaller.
-      push_root_down(/*k=*/slot);
-    }
-  }
-
-  __device__ void replace_root(Entry const &entry, int k) {
-    data[0] = entry;
-    push_root_down(k);
-  }
-
-  __device__ Entry const &root() {
-    return data[0];
-  }
-};
-
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-__device__ IndexedHeap<heapType, preferIndices, Data, T>
-    make_indexed_heap(typename Data<T>::Entry *data) {
-  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
-}
-
-// heapArgTopK walks over [input, input+length) with `step_size` stride starting
-// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
-// using `Accessor` to access elements in `heap_entries`. If sorted=true, the
-// elements will be sorted at the end.
-template <typename T, template <typename> class Data = LinearData>
-__device__ void heapArgTopK(T const *__restrict__ input,
-                            int length,
-                            int k,
-                            Entry<T> *__restrict__ heap_entries,
-                            bool sorted = false,
-                            int start_index = 0,
-                            int step_size = 1) {
-  assert(k <= length);
-
-  auto heap =
-      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
-          heap_entries);
-
-  int heap_end_index = start_index + k * step_size;
-  if (heap_end_index > length) {
-    heap_end_index = length;
-  }
-  // Initialize the min-heap.
-  for (int index = start_index, slot = 0; index < heap_end_index;
-       index += step_size, slot++) {
-    heap.assign(slot, {index, input[index]});
-  }
-
-  heap.build(k);
-
-  // Now iterate over the remaining items.
-  // If an item is smaller than the min element, it is not amongst the top k.
-  // Otherwise, replace the min element with it and push upwards.
-  for (int index = heap_end_index; index < length; index += step_size) {
-    // We prefer elements with lower indices. This is given here.
-    // Later elements automatically have higher indices, so can be discarded.
-    if (input[index] > heap.root().value) {
-      // This element should replace the min.
-      heap.replace_root({index, input[index]}, k);
-    }
-  }
-
-  // Sort if wanted.
-  if (sorted) {
-    heap.sort(k);
-  }
-}
-
-// mergeShards performs a top-k merge on `num_shards` many sorted streams that
-// are sorted and stored in `entries` in a strided way:
-// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
-// The overall top k elements are written to `top_k_values` and their indices
-// to top_k_indices.
-// `top_k_heap` is used as temporary storage for the merge heap.
-template <typename T>
-__device__ void mergeShards(int num_shards,
-                            int k,
-                            Entry<T> *__restrict__ entries,
-                            Entry<T> *__restrict__ top_k_heap,
-                            float *top_k_values,
-                            int *top_k_indices,
-                            bool speculative_decoding) {
-  // If k < num_shards, we can use a min-heap with k elements to get the top k
-  // of the sorted blocks.
-  // If k > num_shards, we can initialize a min-heap with the top element from
-  // each sorted block.
-  int const heap_size = k < num_shards ? k : num_shards;
-
-  // Min-heap part.
-  {
-    auto min_heap = IndexedHeap<HeapType::kMinHeap,
-                                PreferIndices::kHigher,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Initialize the heap as a min-heap.
-    for (int slot = 0; slot < heap_size; slot++) {
-      min_heap.assign(slot, {slot, entries[slot].value});
-    }
-    min_heap.build(heap_size);
-
-    // Now perform top k with the remaining shards (if num_shards > heap_size).
-    for (int shard = heap_size; shard < num_shards; shard++) {
-      auto const entry = entries[shard];
-      auto const root = min_heap.root();
-      if (entry.value < root.value) {
-        continue;
-      }
-      if (entry.value == root.value &&
-          entry.index > entries[root.index].index) {
-        continue;
-      }
-      // This element should replace the min.
-      min_heap.replace_root({shard, entry.value}, heap_size);
-    }
-  }
-
-  // Max-part.
-  {
-    // Turn the min-heap into a max-heap in-place.
-    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
-                                PreferIndices::kLower,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Heapify into a max heap.
-    max_heap.build(heap_size);
-
-    // Now extract the minimum k-1 times.
-    // k is treated specially.
-    int const last_k = k - 1;
-    for (int rank = 0; rank < last_k; rank++) {
-      Entry<T> const &max_element = max_heap.root();
-      if (speculative_decoding) {
-        assert(top_k_values != nullptr);
-        top_k_values[rank] = static_cast<float>(max_element.value);
-      }
-
-      int shard_index = max_element.index;
-      top_k_indices[rank] = entries[shard_index].index;
-      int next_shard_index = shard_index + num_shards;
-      // For rank < k-1, each top k heap still contains at least 1 element,
-      // so we can draw a replacement.
-      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
-                            heap_size);
-    }
-
-    // rank == last_k.
-    Entry<T> const &max_element = max_heap.root();
-    // top_k_values[last_k] = max_element.value;
-    int shard_index = max_element.index;
-    top_k_indices[last_k] = entries[shard_index].index;
-    top_k_values[last_k] = static_cast<float>(max_element.value);
-  }
+// Adopted from Raft's select_k
+// https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
+template<typename T, typename idxT>
+void raft_radix_11bits_kernel(const T* in,
+                       int batch_size,
+                       idxT len,
+                       idxT k,
+                       T* out,
+                       idxT* out_idx = nullptr,
+                       bool greater = true,
+                       cudaStream_t stream = 0) {
+    raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
+        in,
+        static_cast<idxT*>(nullptr),
+        batch_size,
+        len,
+        k,
+        out,
+        out_idx,
+        !greater,
+        true,  // fused_last_filter
+        stream);
 }
 
-template <typename T>
-__global__ void arg_topk_forward_kernel(T const *__restrict__ input,
-                                        size_t shared_memory_size,
-                                        int length,
-                                        int k,
-                                        bool sorted,
-                                        float *__restrict__ output,
-                                        int *__restrict__ indices,
-                                        bool speculative_decoding) {
-  __shared__ char shared_memory[48 << 10];
-  int const batch_index = blockIdx.x;
-  T const *batch_input = input + batch_index * length;
-  int const thread_index = threadIdx.x;
-  int const thread_count = blockDim.x;
-  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
-  heapArgTopK<T, StridedData>(
-      batch_input, length, k, shared_entries, true, thread_index, thread_count);
-  __syncthreads();
-  if (thread_index == 0) {
-    int const offset = batch_index * k;
-    auto batch_output = output + offset;
-    auto batch_indices = indices + offset;
-    Entry<T> *top_k_heap = shared_entries + thread_count * k;
-    mergeShards(thread_count,
-                k,
-                shared_entries,
-                top_k_heap,
-                batch_output,
-                batch_indices,
-                speculative_decoding);
-  }
+// Adopted from Raft's select_k
+// https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
+template<typename T, typename idxT>
+void raft_radix_11bits_extra_pass_kernel(const T* in,
+                                  int batch_size,
+                                  idxT len,
+                                  idxT k,
+                                  T* out,
+                                  idxT* out_idx = nullptr,
+                                  bool greater = true,
+                                  cudaStream_t stream = 0) {
+    raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
+        in,
+        static_cast<idxT*>(nullptr),
+        batch_size,
+        len,
+        k,
+        out,
+        out_idx,
+        !greater,
+        false,  // fused_last_filter
+        stream);
 }
 
 /*static*/
@@ -383,51 +82,30 @@ void ArgTopK::forward_kernel(
     bool sorted,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
-  // Adopted from TensorFlow's ArgTopK implementation
-  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
-  int num_shards = 0;
-  {
-    constexpr auto shared_memory_size = 48 << 10;
-    auto const heap_size = k * sizeof(Entry<DT>);
-    // shared_memory_size = (num_shards + 1) * heap_size <=>
-    num_shards = shared_memory_size / heap_size - 1;
-    assert(num_shards > 0);
-    if (num_shards > CUDA_NUM_THREADS) {
-      num_shards = CUDA_NUM_THREADS;
-    }
-  }
-  // We are limited by the amount of shared memory we have per block.
-  size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
-  // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
-  size_t num_blocks = batch_size;
-
-  // all requests share the same number of branches
+  printf("ArgTopK::forward_kernel\n");
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
-    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
-    num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
+    printf("ArgTopK::forward_kernel: speculative_decoding\n");
+    raft_radix_11bits_extra_pass_kernel<DT, int>(
         input_ptr,
-        shared_memory_size,
+        batch_size,
         length,
         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
-        sorted,
         output_ptr,
         indices_ptr,
-        m->speculative_decoding);
+        true,
+        stream);
   } else {
-
-    assert(num_shards >= (size_t)k);
-    num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
-        input_ptr,
-        shared_memory_size,
-        length,
-        k,
-        sorted,
-        nullptr,
-        indices_ptr,
-        false);
+    // raft_radix_11bits_extra_pass_kernel<DT, int>(
+    //     input_ptr,
+    //     batch_size,
+    //     length,
+    //     k,
+    //     static_cast<float*>(nullptr),
+    //     indices_ptr,
+    //     true,
+    //     stream);
+    assert(false && "Not in speculative decoding mode");
   }
 }
 
@@ -481,17 +159,18 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
   }
 
   if (input.data_type == DT_HALF) {
-    ArgTopK::forward_kernel(m,
-                            input.get_half_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
-                                                    : nullptr,
-                            indices.get_int32_ptr(),
-                            batch_size,
-                            length,
-                            k,
-                            m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
-                            stream);
+    // ArgTopK::forward_kernel(m,
+    //                         input.get_half_ptr(),
+    //                         m->speculative_decoding ? probs.get_float_ptr()
+    //                                                 : nullptr,
+    //                         indices.get_int32_ptr(),
+    //                         batch_size,
+    //                         length,
+    //                         k,
+    //                         m->sorted,
+    //                         m->speculative_decoding ? bc : nullptr,
+    //                         stream);
+    assert(false && "Unsupported data type");
   } else if (input.data_type == DT_FLOAT) {
     ArgTopK::forward_kernel(m,
                             input.get_float_ptr(),

From 992b6a8b301d0f1086eada590fd86e3e8e9b064a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 06:35:21 -0700
Subject: [PATCH 277/667] chore: eliminate intermediate output

---
 src/ops/arg_topk.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 247c447ba..feb0bebe6 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -82,10 +82,8 @@ void ArgTopK::forward_kernel(
     bool sorted,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
-  printf("ArgTopK::forward_kernel\n");
   if (m->speculative_decoding) {
     assert(bc->num_active_requests() >= 0);
-    printf("ArgTopK::forward_kernel: speculative_decoding\n");
     raft_radix_11bits_extra_pass_kernel<DT, int>(
         input_ptr,
         batch_size,

From 6b8e1d3e4f7c85b8dd8bf69d47b4c01ddd8a7ae4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 24 May 2024 06:36:49 -0700
Subject: [PATCH 278/667] feat: disable cudaGraph bcz can't adopted on
 specinfer now

---
 src/ops/fused.cu | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 7d1c2944d..03838cd49 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -630,10 +630,10 @@ __host__ void
     // }
   }
 
-  if (!captured) {
-    cudaGraph_t graph;
-    {    
-      cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+  // if (!captured) {
+  //   cudaGraph_t graph;
+  //   {    
+  //     cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
       int ioff = 0, woff = 0, ooff = 0;
       for (int op = 0; op < fused->numOperators; op++) {
         // Domain my_id[MAX_NUM_INPUTS];
@@ -1158,22 +1158,22 @@ __host__ void
       // for (int i = 0; i < fused->numOutputs; i++)
       //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
       //   "[Fused:forward:output]");
-      cudaStreamEndCapture(stream, &graph);
-    }
-    cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
-    metas->graph_collections[graph_params] = instance;
-    // if(shard_id == 0) {
-    //   printf("*************start cudaGraphInstantiate**********\n");
-    //   graph_params.Print();
-    //   // bc->print();
-    //   printf("*************end cudaGraphInstantiate**********\n");
-    // }
-    cudaGraphDestroy(graph);
-  }
+  //     cudaStreamEndCapture(stream, &graph);
+  //   }
+  //   cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+  //   metas->graph_collections[graph_params] = instance;
+  //   // if(shard_id == 0) {
+  //   //   printf("*************start cudaGraphInstantiate**********\n");
+  //   //   graph_params.Print();
+  //   //   // bc->print();
+  //   //   printf("*************end cudaGraphInstantiate**********\n");
+  //   // }
+  //   cudaGraphDestroy(graph);
+  // }
 
-  assert(metas->graph_collections.find(graph_params) !=
-        metas->graph_collections.end());
-  cudaGraphLaunch(instance, stream);
+  // assert(metas->graph_collections.find(graph_params) !=
+  //       metas->graph_collections.end());
+  // cudaGraphLaunch(instance, stream);
 }
 
 /*

From 3bc3ddd975f1e0d2abad15247fe35e0e2a850141 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 24 May 2024 13:52:53 -0400
Subject: [PATCH 279/667] Format.

---
 src/runtime/request_manager.cc | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7d9a6982e..77bc14f58 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -465,18 +465,18 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     if (outputFile.is_open()) {
       outputFile << "Request " << guid << " profiling: " << std::endl;
       outputFile << "Decoding time: "
-                  << (profile_info.finish_time -
-                  profile_info.start_decoding_time) * 1e-3
-                  << "ms" << std::endl;
+                 << (profile_info.finish_time -
+                     profile_info.start_decoding_time) *
+                        1e-3
+                 << "ms" << std::endl;
       outputFile << "Total time: "
-                  << (profile_info.finish_time -
-                      profile_info.start_time) * 1e-3
-                  << "ms" << std::endl;
-      outputFile << "LLM decoding steps: "
-                   << profile_info.llm_decoding_steps << std::endl;
+                 << (profile_info.finish_time - profile_info.start_time) * 1e-3
+                 << "ms" << std::endl;
+      outputFile << "LLM decoding steps: " << profile_info.llm_decoding_steps
+                 << std::endl;
       if (decoding_mode == SPECULATIVE_DECODING) {
-        outputFile << "SSM decoding steps: "
-                    << profile_info.ssm_decoding_steps << std::endl;
+        outputFile << "SSM decoding steps: " << profile_info.ssm_decoding_steps
+                   << std::endl;
       }
       outputFile << output << std::endl << std::endl;
       outputFile.close();
@@ -1747,7 +1747,6 @@ void RequestManager::serve_decoding(FFModel *llm) {
     assert(fm.get_future_map_domain().get_volume() == 1);
     InferenceResultFuture irf = fm.get_future(0);
     batch_pipeline.push(irf);
-    last_irf = irf;
     runtime->end_trace(ctx, 12346 /*trace_id*/);
   }
 }

From b6e671634643e6b91efe9caf634b8e6a4727599c Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 25 May 2024 02:19:28 -0400
Subject: [PATCH 280/667] Adepted add_tokens_to_spec_token_tree to the new
 version of arg_topk.

---
 src/runtime/request_manager.cc | 41 ++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 77bc14f58..7bc4c4e8f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1928,16 +1928,38 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
       if (!parent_ptr->pruned) {
+        // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
+        float parent_log_prob = parent_ptr->log_accumulated_prob;
+        int child_start_idx =
+            result_offset +
+            parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+        std::vector<std::pair<float, int>> child_probs(
+            BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
         for (int child_pos = 0;
              child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
              child_pos++) {
-          int result_idx =
-              result_offset +
-              parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES +
-              child_pos;
-          float log_prob = log(ssm_inference_result.probs[result_idx]);
-          float log_accumulated_prob =
-              log_prob + parent_ptr->log_accumulated_prob;
+          int result_idx = child_start_idx + child_pos;
+          child_probs[child_pos] = std::make_pair(
+              log(ssm_inference_result.probs[result_idx]), result_idx);
+        }
+        // Sort in descending order
+        std::sort(child_probs.begin(),
+                  child_probs.end(),
+                  std::greater<std::pair<float, int>>());
+
+        // for (int child_pos = 0;
+        //      child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+        //      child_pos++) {
+        for (auto const &child_prob : child_probs) {
+
+          //   int result_idx =
+          //       result_offset +
+          //       parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES +
+          //       child_pos;
+
+          float log_prob = child_prob.first;
+          float log_accumulated_prob = log_prob + parent_log_prob;
+          int result_idx = child_prob.second;
 
           //   std::cout << "Probability at result index " << result_idx << ": "
           //             << ssm_inference_result.probs[result_idx] << "\t";
@@ -2058,8 +2080,9 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
     int token_pos = 0;
     for (auto const &node : layer) {
       if (!node->pruned) {
-        os << "token pos: " << token_pos << "token id: " << node->id << "\t"
-           << "parent pos: " << node->parent_pos << "\t" << std::endl;
+        os << "token pos: " << token_pos << "\ttoken id: " << node->id
+           << "\tparent pos: " << node->parent_pos
+           << "\tlog prob: " << node->log_accumulated_prob << std::endl;
       }
       token_pos++;
     }

From b14ab582c44fff4a1c2692481ef6dd471a4eae00 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 25 May 2024 16:33:13 -0400
Subject: [PATCH 281/667] Modified the output strings after the request
 completes.

---
 src/runtime/request_manager.cc | 73 +++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 19 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7bc4c4e8f..c9fe9cc74 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -460,32 +460,36 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   std::cout << "Request " << guid << " completed: " << std::endl
             << output << std::endl;
   ProfileInfo profile_info = profiling_requests[guid];
+
+  std::ostream *os = &std::cout;
+  std::ofstream output_file;
   if (!output_filepath.empty()) {
-    std::ofstream outputFile(output_filepath, std::ios::app);
-    if (outputFile.is_open()) {
-      outputFile << "Request " << guid << " profiling: " << std::endl;
-      outputFile << "Decoding time: "
-                 << (profile_info.finish_time -
-                     profile_info.start_decoding_time) *
-                        1e-3
-                 << "ms" << std::endl;
-      outputFile << "Total time: "
-                 << (profile_info.finish_time - profile_info.start_time) * 1e-3
-                 << "ms" << std::endl;
-      outputFile << "LLM decoding steps: " << profile_info.llm_decoding_steps
-                 << std::endl;
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        outputFile << "SSM decoding steps: " << profile_info.ssm_decoding_steps
-                   << std::endl;
-      }
-      outputFile << output << std::endl << std::endl;
-      outputFile.close();
+    output_file.open(output_filepath, std::ios::app);
+    if (output_file.is_open()) {
+      os = &output_file;
     } else {
       std::cout << "Unable to open the output file: " << output_filepath
                 << std::endl;
       assert(false);
     }
   }
+  *os << "Request " << guid << " profiling: " << std::endl;
+  *os << "Decoding time: "
+      << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
+      << "ms" << std::endl;
+  *os << "Total time: "
+      << (profile_info.finish_time - profile_info.start_time) * 1e-3 << "ms"
+      << std::endl;
+  *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
+  if (decoding_mode == SPECULATIVE_DECODING) {
+    *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
+        << std::endl;
+  }
+  *os << output << std::endl << std::endl;
+
+  if (!output_filepath.empty()) {
+    output_file.close();
+  }
 
   trigger_request_completion_future(guid);
 }
@@ -1820,6 +1824,37 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
       assert(false && "Invalid request manager status");
     }
   }
+
+  //   while (!is_background_server_terminated()) {
+  //     // last_irf.get_void_result();
+  //     BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
+  //     bcf.get_void_result();
+  //     // time_2 = Realm::Clock::current_time_in_microseconds();
+  //     // std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
+  //     //           << std::endl;
+
+  //     // time_1 = Realm::Clock::current_time_in_microseconds();
+  //     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
+  //         request_manager_status == LLM_VERIFY) {
+  //       //   std::cout << "Branch 1" << std::endl;
+  //       runtime->begin_trace(ctx, 12345 /*trace_id*/);
+  //       FutureMap fm = im->inference(llm, 0, bcf);
+  //       //   assert(fm.get_future_map_domain().get_volume() == 1);
+  //       last_irf = fm.get_future(0);
+  //       runtime->end_trace(ctx, 12345 /*trace_id*/);
+  //     } else if ((request_manager_status == PREFILLING and
+  //                 prefill_model == SSM) or
+  //                request_manager_status == SSM_SPEC) {
+  //       //   std::cout << "Branch 2" << std::endl;
+  //       runtime->begin_trace(ctx, 23456 /*trace_id*/);
+  //       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
+  //       //   assert(fm.get_future_map_domain().get_volume() == 1);
+  //       last_irf = fm.get_future(0);
+  //       runtime->end_trace(ctx, 23456 /*trace_id*/);
+  //     } else {
+  //       assert(false && "Invalid request manager status");
+  //     }
+  //   }
 }
 
 void RequestManager::trigger_request_completion_future(

From 7dbb30f941a5027438df01485e151e71716dcc76 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 27 May 2024 03:32:42 -0400
Subject: [PATCH 282/667] Complete on EOS.

---
 src/runtime/request_manager.cc | 117 +++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 43 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c9fe9cc74..f03f1c36e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -456,9 +456,26 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   num_available_requests--;
   request.status = Request::COMPLETED;
 
-  std::string output = this->tokenizer_->Decode(request.tokens);
+  // Find the sos and eos in the sequence
+  auto bos_it = std::find(
+      request.tokens.begin(), request.tokens.end(), this->bos_token_id);
+  auto eos_rit = std::find(
+      request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id);
+  std::vector<int>::iterator eos_it;
+  if (eos_rit != request.tokens.rend()) {
+    eos_it = eos_rit.base();
+  } else {
+    eos_it = request.tokens.end();
+  }
+  std::string output =
+      this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
+
   std::cout << "Request " << guid << " completed: " << std::endl
-            << output << std::endl;
+            << "<bos>" << output;
+  if (eos_rit != request.tokens.rend()) {
+    std::cout << "<eos>";
+  }
+  std::cout << std::endl;
   ProfileInfo profile_info = profiling_requests[guid];
 
   std::ostream *os = &std::cout;
@@ -516,8 +533,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
         if (update_llm_prefill_results(result)) {
-          // This indicates that the prefilling of the current request finishes
-          // Reset the prefill_request
+          // This indicates that the prefilling of the current request
+          // finishes Reset the prefill_request
           prefill_request = nullptr;
 
           // Check if there are more empty slots
@@ -694,7 +711,8 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
         result.token_ids[request.first_token_offset_in_batch]);
 
     profiling_requests[guid].llm_decoding_steps++;
-    if (request.tokens.size() >= get_max_sequence_length()) {
+    if (request.tokens.back() == eos_token_id or
+        request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
       request_complete_clean_up(request_index);
     }
@@ -1044,8 +1062,8 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
 /***** Speculative Decoding Phase *****/
 BatchConfig RequestManager::prepare_next_spec_batch_config() {
   if (verbose) {
-    std::cout
-        << "\n############### prepare_next_spec_batch_config ###############\n";
+    std::cout << "\n############### prepare_next_spec_batch_config "
+                 "###############\n";
     std::cout << "Current tree depth: " << current_speculation_step + 1 << "\n";
   }
 
@@ -1309,7 +1327,14 @@ bool RequestManager::update_llm_verify_results(
 
     // Check if the request is completed. If its completed, clean up the
     // metainfo stored in the RequestManager. Otherwise, update its bitmask.
-    if (request.tokens.size() >= get_max_sequence_length()) {
+    bool eos_token_found = false;
+    for (auto const &committed_token : request.committed_tokens) {
+      if (committed_token.token_id == eos_token_id) {
+        eos_token_found = true;
+        break;
+      }
+    }
+    if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index);
@@ -1825,35 +1850,40 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     }
   }
 
+  //   BatchConfigFuture bcf;
+
+  //   std::queue<InferenceResultFuture> infer_result_future_pipeline;
+  //   { infer_result_future_pipeline.push(last_irf); }
+
   //   while (!is_background_server_terminated()) {
-  //     // last_irf.get_void_result();
-  //     BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
-  //     bcf.get_void_result();
-  //     // time_2 = Realm::Clock::current_time_in_microseconds();
-  //     // std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
-  //     //           << std::endl;
-
-  //     // time_1 = Realm::Clock::current_time_in_microseconds();
-  //     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
-  //         request_manager_status == LLM_VERIFY) {
-  //       //   std::cout << "Branch 1" << std::endl;
-  //       runtime->begin_trace(ctx, 12345 /*trace_id*/);
-  //       FutureMap fm = im->inference(llm, 0, bcf);
-  //       //   assert(fm.get_future_map_domain().get_volume() == 1);
-  //       last_irf = fm.get_future(0);
-  //       runtime->end_trace(ctx, 12345 /*trace_id*/);
-  //     } else if ((request_manager_status == PREFILLING and
-  //                 prefill_model == SSM) or
-  //                request_manager_status == SSM_SPEC) {
-  //       //   std::cout << "Branch 2" << std::endl;
-  //       runtime->begin_trace(ctx, 23456 /*trace_id*/);
+  //     if (infer_result_future_pipeline.size() >= 4) {
+  //       // Block here to avoid launching too many batches
+  //       auto const &ir = infer_result_future_pipeline.front();
+  //       ir.get_void_result();
+  //     }
+  //     // deque finished batches
+  //     while (infer_result_future_pipeline.size() > 1) {
+  //       auto const &ir = infer_result_future_pipeline.front();
+  //       if (ir.is_ready()) {
+  //         infer_result_future_pipeline.pop();
+  //       } else {
+  //         break;
+  //       }
+  //     }
+
+  //     runtime->begin_trace(ctx, 12345 /*trace_id*/);
+  //     for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth();
+  //     ssm_step_i++) {
+  //       last_irf = infer_result_future_pipeline.back();
+  //       bcf = get_next_batch_config(last_irf, ctx, runtime);
   //       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-  //       //   assert(fm.get_future_map_domain().get_volume() == 1);
-  //       last_irf = fm.get_future(0);
-  //       runtime->end_trace(ctx, 23456 /*trace_id*/);
-  //     } else {
-  //       assert(false && "Invalid request manager status");
+  //       infer_result_future_pipeline.push(fm.get_future(0));
   //     }
+  //     last_irf = infer_result_future_pipeline.back();
+  //     bcf = get_next_batch_config(last_irf, ctx, runtime);
+  //     FutureMap fm = im->inference(llm, 0, bcf);
+  //     infer_result_future_pipeline.push(fm.get_future(0));
+  //     runtime->end_trace(ctx, 12345 /*trace_id*/);
   //   }
 }
 
@@ -1936,8 +1966,8 @@ bool RequestManager::add_tokens_to_spec_token_tree(
 
     int parent_num = request.num_tokens_in_batch;
     if (parent_num == 0) {
-      // The request has no committed tokens, we don't need to add tokens to the
-      // token tree
+      // The request has no committed tokens, we don't need to add tokens to
+      // the token tree
       continue;
     }
     int result_offset = request.first_token_offset_in_batch *
@@ -1996,7 +2026,8 @@ bool RequestManager::add_tokens_to_spec_token_tree(
           float log_accumulated_prob = log_prob + parent_log_prob;
           int result_idx = child_prob.second;
 
-          //   std::cout << "Probability at result index " << result_idx << ": "
+          //   std::cout << "Probability at result index " << result_idx << ":
+          //   "
           //             << ssm_inference_result.probs[result_idx] << "\t";
           //   std::cout << "Token id: "
           //             << ssm_inference_result.token_ids[result_idx] <<
@@ -2016,10 +2047,10 @@ bool RequestManager::add_tokens_to_spec_token_tree(
                      log_accumulated_prob <= token_tree_node_pool.top()
                                                  .first->log_accumulated_prob) {
             // The token tree is not full, but the token pool is full, and the
-            // new token has a lower joint probability than the minimum node in
-            // the pool, we don't need to add the new token and the following
-            // tokens belong to the same parent to the tree, because the tokens
-            // are sorted by their probability
+            // new token has a lower joint probability than the minimum node
+            // in the pool, we don't need to add the new token and the
+            // following tokens belong to the same parent to the tree, because
+            // the tokens are sorted by their probability
             break;
           } else {
             std::shared_ptr<TokenTreeNode> node_ptr =
@@ -2032,8 +2063,8 @@ bool RequestManager::add_tokens_to_spec_token_tree(
                     (*tokens.begin())->log_accumulated_prob) {
               // The token tree is full, and the new token has a higher joint
               // probability than the minimum node in the pool, we need to
-              // remove the minimum node from the pool and add the new token to
-              // the tree
+              // remove the minimum node from the pool and add the new token
+              // to the tree
               tokens.erase(tokens.begin());
             }
             tokens.insert(node_ptr);

From 721e1ff63ff86b225d408b2d20d4aafb6ef0a9a9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 27 May 2024 06:08:57 -0700
Subject: [PATCH 283/667] feat: add argtopk half_precision case

---
 include/flexflow/ops/arg_topk.h |  9 +++++-
 src/ops/arg_topk.cc             |  7 +++-
 src/ops/arg_topk.cu             | 57 +++++++++++++++++++++++++--------
 3 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 935aa9ff9..af17b3f7e 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -5,15 +5,22 @@
 #include "flexflow/model.h"
 #include "flexflow/node.h"
 #include "flexflow/ops/arg_topk_params.h"
+#include "flexflow/utils/memory_allocator.h"
 
 namespace FlexFlow {
 
 class ArgTopKMeta : public OpMeta {
 public:
-  ArgTopKMeta(FFHandler handle, Op const *op);
   bool sorted;
   int k;
   bool speculative_decoding;
+  Realm::RegionInstance reserveInst;
+  float *full_precision_input;
+  int max_input_size;
+  ArgTopKMeta(FFHandler handle,
+              Op const *op,
+              MemoryAllocator &gpu_mem_allocator);
+  ~ArgTopKMeta(void);
 };
 
 class ArgTopK : public Op {
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 669284573..83813b578 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -275,7 +275,12 @@ OpMeta *ArgTopK::init_task(Task const *task,
                            Runtime *runtime) {
   ArgTopK *topk = (ArgTopK *)task->args;
   FFHandler handle = *((FFHandler *)task->local_args);
-  ArgTopKMeta *m = new ArgTopKMeta(handle, topk);
+  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
+                       .only_kind(Memory::GPU_FB_MEM)
+                       .best_affinity_to(task->target_proc)
+                       .first();
+  MemoryAllocator gpu_mem_allocator(gpu_mem);
+  ArgTopKMeta *m = new ArgTopKMeta(handle, topk, gpu_mem_allocator);
   m->profiling = topk->profiling;
   m->inference_debugging = topk->inference_debugging;
   m->sorted = topk->sorted;
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index feb0bebe6..f21bc561c 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -69,6 +69,18 @@ void raft_radix_11bits_extra_pass_kernel(const T* in,
         stream);
 }
 
+__global__ void half2float_kernel(const half* __restrict__ in, float* __restrict__ out, int size) {
+  // int stride = blockDim.x * gridDim.x,
+  //     tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // for (int i = tid; i < size; i += stride) {
+  //   out[i] = __half2float(in[i]);
+  // }
+  CUDA_KERNEL_LOOP(i, size) {
+    out[i] = __half2float(in[i]);
+  }
+}
+
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
@@ -157,18 +169,24 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
   }
 
   if (input.data_type == DT_HALF) {
-    // ArgTopK::forward_kernel(m,
-    //                         input.get_half_ptr(),
-    //                         m->speculative_decoding ? probs.get_float_ptr()
-    //                                                 : nullptr,
-    //                         indices.get_int32_ptr(),
-    //                         batch_size,
-    //                         length,
-    //                         k,
-    //                         m->sorted,
-    //                         m->speculative_decoding ? bc : nullptr,
-    //                         stream);
-    assert(false && "Unsupported data type");
+    // transfer data from half to float (input to full_precision_input)
+    // printf("ArgTopK: length = %d, batch_size = %d\n", length, batch_size);
+    int size = length * batch_size;
+    half2float_kernel<<<GET_BLOCKS(size),
+                        min((int)CUDA_NUM_THREADS, size),
+                        0,
+                        stream>>>(input.get_half_ptr(), m->full_precision_input, size);
+    ArgTopK::forward_kernel(m,
+                            m->full_precision_input,
+                            m->speculative_decoding ? probs.get_float_ptr()
+                                                    : nullptr,
+                            indices.get_int32_ptr(),
+                            batch_size,
+                            length,
+                            k,
+                            m->sorted,
+                            m->speculative_decoding ? bc : nullptr,
+                            stream);
   } else if (input.data_type == DT_FLOAT) {
     ArgTopK::forward_kernel(m,
                             input.get_float_ptr(),
@@ -196,7 +214,18 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
   }
 }
 
-ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op)
-    : OpMeta(handler, op) {}
+ArgTopKMeta::ArgTopKMeta(FFHandler handler,
+                          Op const *op,
+                          MemoryAllocator &gpu_mem_allocator)
+    : OpMeta(handler, op) {
+  max_input_size = BatchConfig::MAX_NUM_TOKENS * 32000; // TODO: use vocab_size
+  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float) * max_input_size);
+  full_precision_input = gpu_mem_allocator.allocate_instance<float>(max_input_size);
+}
 
+ArgTopKMeta::~ArgTopKMeta() {
+  if (reserveInst != Realm::RegionInstance::NO_INST) {
+    reserveInst.destroy();
+  }
+}
 }; // namespace FlexFlow

From 802c600a6b3f19505a7840743a74d7f86b3cdc30 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 27 May 2024 09:24:55 -0700
Subject: [PATCH 284/667] feat: support half precision

---
 include/flexflow/ops/arg_topk.h |  4 ++--
 src/ops/arg_topk.cu             | 24 +++++++++++++-----------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index af17b3f7e..06bd4b84f 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -15,7 +15,7 @@ class ArgTopKMeta : public OpMeta {
   int k;
   bool speculative_decoding;
   Realm::RegionInstance reserveInst;
-  float *full_precision_input;
+  void *half_precision_output;
   int max_input_size;
   ArgTopKMeta(FFHandler handle,
               Op const *op,
@@ -90,7 +90,7 @@ class ArgTopK : public Op {
   template <typename DT>
   static void forward_kernel(ArgTopKMeta const *m,
                              DT const *input_ptr,
-                             float *output_ptr,
+                             DT *output_ptr,
                              int *indices_ptr,
                              size_t batch_size,
                              int length,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index f21bc561c..4a8a569c1 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -86,7 +86,7 @@ template <typename DT>
 void ArgTopK::forward_kernel(
     ArgTopKMeta const *m,
     DT const *input_ptr,
-    float *output_ptr,
+    DT *output_ptr,
     int *indices_ptr,
     size_t batch_size,
     int length,
@@ -169,16 +169,10 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
   }
 
   if (input.data_type == DT_HALF) {
-    // transfer data from half to float (input to full_precision_input)
     // printf("ArgTopK: length = %d, batch_size = %d\n", length, batch_size);
-    int size = length * batch_size;
-    half2float_kernel<<<GET_BLOCKS(size),
-                        min((int)CUDA_NUM_THREADS, size),
-                        0,
-                        stream>>>(input.get_half_ptr(), m->full_precision_input, size);
     ArgTopK::forward_kernel(m,
-                            m->full_precision_input,
-                            m->speculative_decoding ? probs.get_float_ptr()
+                            input.get_half_ptr(),
+                            m->speculative_decoding ? (half *)m->half_precision_output
                                                     : nullptr,
                             indices.get_int32_ptr(),
                             batch_size,
@@ -187,6 +181,14 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                             m->sorted,
                             m->speculative_decoding ? bc : nullptr,
                             stream);
+    if (m->speculative_decoding) {
+      // transfer data from half to float (half_precision_output to output)
+      int size = length * batch_size;
+      half2float_kernel<<<GET_BLOCKS(size),
+                          min((int)CUDA_NUM_THREADS, size),
+                          0,
+                          stream>>>((const half *)m->half_precision_output, probs.get_float_ptr(), size);
+    }
   } else if (input.data_type == DT_FLOAT) {
     ArgTopK::forward_kernel(m,
                             input.get_float_ptr(),
@@ -219,8 +221,8 @@ ArgTopKMeta::ArgTopKMeta(FFHandler handler,
                           MemoryAllocator &gpu_mem_allocator)
     : OpMeta(handler, op) {
   max_input_size = BatchConfig::MAX_NUM_TOKENS * 32000; // TODO: use vocab_size
-  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float) * max_input_size);
-  full_precision_input = gpu_mem_allocator.allocate_instance<float>(max_input_size);
+  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(half) * max_input_size);
+  half_precision_output = gpu_mem_allocator.allocate_instance_untyped(sizeof(half) * max_input_size);
 }
 
 ArgTopKMeta::~ArgTopKMeta() {

From d8fdc7ca0281ff2599cb1e59a4bf0a2e5981f5ac Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 27 May 2024 09:50:07 -0700
Subject: [PATCH 285/667] feat: add orignal argtopk backup

---
 include/flexflow/ops/arg_topk.h.backup | 110 +++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 include/flexflow/ops/arg_topk.h.backup

diff --git a/include/flexflow/ops/arg_topk.h.backup b/include/flexflow/ops/arg_topk.h.backup
new file mode 100644
index 000000000..935aa9ff9
--- /dev/null
+++ b/include/flexflow/ops/arg_topk.h.backup
@@ -0,0 +1,110 @@
+#ifndef _FLEXFLOW_ARG_TOPK_H_
+#define _FLEXFLOW_ARG_TOPK_H_
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/node.h"
+#include "flexflow/ops/arg_topk_params.h"
+
+namespace FlexFlow {
+
+class ArgTopKMeta : public OpMeta {
+public:
+  ArgTopKMeta(FFHandler handle, Op const *op);
+  bool sorted;
+  int k;
+  bool speculative_decoding;
+};
+
+class ArgTopK : public Op {
+public:
+  using Params = ArgTopKParams;
+  using Input = ParallelTensor;
+  ArgTopK(FFModel &model,
+          LayerID const &layer_guid,
+          ParallelTensor const input,
+          int k,
+          bool sorted,
+          bool speculative_decoding,
+          char const *name);
+  ArgTopK(FFModel &model,
+          LayerID const &layer_guid,
+          ArgTopK const &other,
+          ParallelTensor const input);
+  ArgTopK(FFModel &model,
+          Params const &params,
+          Input const input,
+          char const *name = nullptr);
+  void init(FFModel const &) override;
+  void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+                      MachineView const *mv = nullptr) override;
+  void forward(FFModel const &) override;
+  void backward(FFModel const &) override;
+  Legion::FutureMap inference(FFModel const &,
+                              BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+                              MachineView const *mv = nullptr) override;
+  void print_layer(FFModel const &model) override {
+    assert(0);
+  }
+  static Op *
+      create_operator_from_layer(FFModel &model,
+                                 Layer const *layer,
+                                 std::vector<ParallelTensor> const &inputs);
+
+  static OpMeta *init_task(Legion::Task const *task,
+                           std::vector<Legion::PhysicalRegion> const &regions,
+                           Legion::Context ctx,
+                           Legion::Runtime *runtime);
+  static InferenceResult
+      inference_task(Legion::Task const *task,
+                     std::vector<Legion::PhysicalRegion> const &regions,
+                     Legion::Context ctx,
+                     Legion::Runtime *runtime);
+  static InferenceResult inference_speculative_task(
+      Legion::Task const *task,
+      std::vector<Legion::PhysicalRegion> const &regions,
+      Legion::Context ctx,
+      Legion::Runtime *runtime);
+  void serialize(Legion::Serializer &s) const override;
+  static PCG::Node deserialize(FFModel &ff,
+                               Legion::Deserializer &d,
+                               ParallelTensor inputs[],
+                               int num_inputs);
+  Op *materialize(FFModel &ff,
+                  ParallelTensor inputs[],
+                  int num_inputs) const override;
+  bool measure_operator_cost(Simulator *sim,
+                             MachineView const &pc,
+                             CostMetrics &cost_metrics) const override;
+  template <typename DT>
+  static void forward_kernel(ArgTopKMeta const *m,
+                             DT const *input_ptr,
+                             float *output_ptr,
+                             int *indices_ptr,
+                             size_t batch_size,
+                             int length,
+                             int k,
+                             bool sorted,
+                             BatchConfig const *bc,
+                             ffStream_t stream);
+  static void forward_kernel_wrapper(ArgTopKMeta const *m,
+                                     GenericTensorAccessorR const &input,
+                                     GenericTensorAccessorW const &prob,
+                                     GenericTensorAccessorW const &indices,
+                                     int batch_size,
+                                     BatchConfig const *bc);
+  Params get_params() const;
+
+public:
+  int k;
+  bool sorted;
+  bool speculative_decoding;
+};
+
+}; // namespace FlexFlow
+
+#endif

From 315d729bf3528f2d0ecc4f308ee6f7afc0aa11c1 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 27 May 2024 13:48:49 -0400
Subject: [PATCH 286/667] Fixed a bug in argmax.cu.

---
 src/ops/argmax.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index 844d74686..e7baef6d1 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -23,7 +23,7 @@ __global__ void init_offset(int batch_size,
                             int vocab_size,
                             int total_eles,
                             int *d_offsets) {
-  CUDA_KERNEL_LOOP(i, (total_eles + 1) / vocab_size) {
+  CUDA_KERNEL_LOOP(i, (total_eles) / vocab_size + 1) {
     // if (i % vocab_size == 0) {
     //   d_offsets[i / vocab_size] = i;
     // }

From d3e76d6a4dcabdf3a7b2cfc2141f9e992e024d00 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 27 May 2024 14:17:15 -0400
Subject: [PATCH 287/667] Optimized metadata transfer from host to GPU.

---
 src/runtime/request_manager.cu | 88 ++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 30 deletions(-)

diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 67f2c8713..326ebc5d2 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -81,19 +81,29 @@ void RequestManager::load_batch_config_task(
   // copy meta data to workSpace
   FFHandler handle = *((FFHandler const *)task->local_args);
   size_t total_copy_size = 0;
-  checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata,
-                            &(batch_config->tokensInfo),
-                            sizeof(BatchConfig::tokensInfo),
-                            cudaMemcpyHostToDevice,
-                            stream));
+  if (batch_config->num_tokens > 0) {
+    // The tokensInfo is compact
+    checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata,
+                              &(batch_config->tokensInfo),
+                              batch_config->num_tokens *
+                                  sizeof(BatchConfig::PerTokenInfo),
+                              cudaMemcpyHostToDevice,
+                              stream));
+  }
   total_copy_size += sizeof(BatchConfig::tokensInfo);
 
-  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
-                                total_copy_size,
-                            &(batch_config->requestsInfo),
-                            sizeof(BatchConfig::requestsInfo),
-                            cudaMemcpyHostToDevice,
-                            stream));
+  for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch();
+       request_idx++) {
+    if (batch_config->request_available[request_idx]) {
+      checkCUDA(cudaMemcpyAsync(
+          static_cast<char *>(handle.batch_config_metadata) + total_copy_size +
+              request_idx * sizeof(BatchConfig::PerRequestInfo),
+          &(batch_config->requestsInfo[request_idx]),
+          sizeof(BatchConfig::PerRequestInfo),
+          cudaMemcpyHostToDevice,
+          stream));
+    }
+  }
   total_copy_size += sizeof(BatchConfig::requestsInfo);
 
   checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
@@ -106,28 +116,46 @@ void RequestManager::load_batch_config_task(
 
   // load speculative metadata
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->causalMask),
-        sizeof(BatchConfig::causalMask),
-        cudaMemcpyHostToDevice,
-        stream));
+    for (int request_idx = 0;
+         request_idx < BatchConfig::max_requests_per_batch();
+         request_idx++) {
+      if (batch_config->request_available[request_idx]) {
+        checkCUDA(cudaMemcpyAsync(
+            static_cast<char *>(handle.batch_config_metadata) +
+                total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+            &(batch_config->causalMask[request_idx]),
+            sizeof(BatchConfig::BitMask),
+            cudaMemcpyHostToDevice,
+            stream));
+      }
+    }
     total_copy_size += sizeof(BatchConfig::causalMask);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->causalMask),
-        sizeof(BatchConfig::causalMask),
-        cudaMemcpyHostToDevice,
-        stream));
+    for (int request_idx = 0;
+         request_idx < BatchConfig::max_requests_per_batch();
+         request_idx++) {
+      if (batch_config->request_available[request_idx]) {
+        checkCUDA(cudaMemcpyAsync(
+            static_cast<char *>(handle.batch_config_metadata) +
+                total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+            &(batch_config->causalMask[request_idx]),
+            sizeof(BatchConfig::BitMask),
+            cudaMemcpyHostToDevice,
+            stream));
+      }
+    }
     total_copy_size += sizeof(BatchConfig::causalMask);
-    checkCUDA(cudaMemcpyAsync(
-        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-        &(batch_config->committed_tokens),
-        sizeof(BatchConfig::committed_tokens),
-        cudaMemcpyHostToDevice,
-        stream));
-    total_copy_size += sizeof(BatchConfig::committed_tokens);
+
+    if (batch_config->num_tokens_to_commit > 0) {
+      checkCUDA(cudaMemcpyAsync(
+          static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+          &(batch_config->committed_tokens),
+          batch_config->num_tokens_to_commit *
+              sizeof(BatchConfig::CommittedTokensInfo),
+          cudaMemcpyHostToDevice,
+          stream));
+      total_copy_size += sizeof(BatchConfig::committed_tokens);
+    }
   }
 
   // add a size check

From 0acdff29492bb305985a3be1fe949fb1a8d096ce Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 28 May 2024 11:09:57 -0700
Subject: [PATCH 288/667] chore: add sort in argtopk

---
 src/ops/arg_topk.cu | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 4a8a569c1..50d2bb8d3 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -81,6 +81,29 @@ __global__ void half2float_kernel(const half* __restrict__ in, float* __restrict
   }
 }
 
+template <typename DT>
+__global__ void insertion_sort_kernel(DT* topk_values, int* topk_indices, int batch_size, int k) {
+    int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (batch_index < batch_size) {
+        DT* values = topk_values + batch_index * k;
+        int* indices = topk_indices + batch_index * k;
+
+        for (int i = 1; i < k; i++) {
+            DT key_val = values[i];
+            int key_idx = indices[i];
+            int j = i - 1;
+            while (j >= 0 && values[j] < key_val) {
+                values[j + 1] = values[j];
+                indices[j + 1] = indices[j];
+                j = j - 1;
+            }
+            values[j + 1] = key_val;
+            indices[j + 1] = key_idx;
+        }
+    }
+}
+
+
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
@@ -105,6 +128,13 @@ void ArgTopK::forward_kernel(
         indices_ptr,
         true,
         stream);
+    if (sorted) {
+      assert(output_ptr != nullptr);
+      insertion_sort_kernel<<<GET_BLOCKS(batch_size),
+                             min((size_t)CUDA_NUM_THREADS, batch_size),
+                             0,
+                             stream>>>(output_ptr, indices_ptr, batch_size, k);
+    }
   } else {
     // raft_radix_11bits_extra_pass_kernel<DT, int>(
     //     input_ptr,

From b1dac12a21cfca2258dd466cf40ae1de92cdf071 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 29 May 2024 01:15:46 -0400
Subject: [PATCH 289/667] Added a version of async specinfer. The old sync
 version is rewrite into the API serve_spec_infer_sync.

---
 include/flexflow/request_manager.h |   6 +-
 src/runtime/request_manager.cc     | 225 +++++++++++++++++------------
 2 files changed, 138 insertions(+), 93 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index cc4d77fec..9dc30f9f6 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -235,6 +235,7 @@ class RequestManager {
   FFModel *get_ssm_model(int model_id);
 
   void serve_spec_infer(FFModel *model);
+  void serve_spec_infer_sync(FFModel *model);
   void serve_decoding(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
   RequestGuid register_new_request(std::string const &prompt);
@@ -317,11 +318,12 @@ class RequestManager {
   // first small model inference results, the step equals to 1. That is, every
   // time a small model inference task is launched, the step is increased
   // by 1.
-  int current_speculation_step = 0;
+  int current_ssm_step = 0;
   // Maps the index of the request in the batch config to the request guid.
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
   int num_available_requests = 0;
+  int ssm_completed = true;
 
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
@@ -367,7 +369,7 @@ class RequestManager {
   bool update_llm_verify_results(InferenceResult const &llm_verify_result);
   bool
       update_ssm_inference_results(InferenceResult const &ssm_inference_result);
-  bool update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
+  void update_ssm_prefill_results(InferenceResult const &ssm_prefill_result);
   // Prepare the next speculation batch config. This function is called before
   // the second step of the speculation.
   BatchConfig prepare_next_spec_batch_config();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f03f1c36e..58ba7bf34 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -524,6 +524,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       request_manager_status = PREFILLING;
       if (decoding_mode == SPECULATIVE_DECODING) {
         prefill_model = SSM;
+        current_ssm_step = 0;
       }
     }
     return;
@@ -551,12 +552,18 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         // Not completed, continue prefilling
       } else if (decoding_mode == SPECULATIVE_DECODING) {
         if (prefill_model == SSM) {
-          if (update_ssm_prefill_results(result)) {
-            // This indicates that the prefilling phase for SSM finishes
-            // We need to start the LLM prefilling
+          // A single iteration contains max_tree_depth SSM steps and a single
+          // LLM step. To align with this structure, we have to create
+          // max_tree_depth - 1 empty SSM steps during the prefilling phase.
+          if (current_ssm_step == 0) {
+            update_ssm_prefill_results(result);
+          }
+          // Except for the first step, we do nothing.
+          current_ssm_step++;
+
+          if (current_ssm_step == get_max_tree_depth()) {
             prefill_model = LLM;
           }
-          // Not completed, continue SSM prefilling
         } else if (prefill_model == LLM) {
           if (update_llm_prefill_results(result)) {
             // This indicates that the prefilling phase finishes
@@ -566,16 +573,20 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
                 !pending_request_queue.empty()) {
               // Load the pending request to the batch
               load_pending_reqeust_to_batch();
-              request_manager_status = PREFILLING;
               prefill_model = SSM;
+              current_ssm_step = 0;
             } else {
               // No more empty slots, start the speculation
               request_manager_status = SSM_SPEC;
               // Reset the prefill_request
-              current_speculation_step = 0;
+              current_ssm_step = 0;
+              ssm_completed = false;
             }
+          } else {
+            // Not completed, start the next iteration of prefilling
+            prefill_model = SSM;
+            current_ssm_step = 0;
           }
-          // Not completed, continue LLM prefilling
         } else {
           assert(false && "Invalid prefill model.");
         }
@@ -601,23 +612,32 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         if (pending_request_queue.empty()) {
           // No pending request to process, continue the speculation
           request_manager_status = SSM_SPEC;
-          current_speculation_step = 0;
+          current_ssm_step = 0;
+          ssm_completed = false;
         } else {
-          request_manager_status = PREFILLING;
           load_pending_reqeust_to_batch();
+          request_manager_status = PREFILLING;
           prefill_model = SSM;
+          current_ssm_step = 0;
         }
       } else {
         request_manager_status = SSM_SPEC;
-        current_speculation_step = 0;
+        current_ssm_step = 0;
+        ssm_completed = false;
       }
       break;
     case SSM_SPEC:
-      if (update_ssm_inference_results(result)) {
-        // Stop condition for the speculation phase has been reached
+      // Update current_ssm_step first because when we first call
+      // update_ssm_inference_results, there's already a step of small model
+      // inference
+      current_ssm_step++;
+      if (!ssm_completed) {
+        ssm_completed = update_ssm_inference_results(result);
+      }
+
+      if (current_ssm_step == get_max_tree_depth()) {
         request_manager_status = LLM_VERIFY;
       }
-      // else, keep the current status
       break;
     default:
       assert(false && "Invalid request manager status.");
@@ -726,7 +746,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   return request_completed;
 }
 
-bool RequestManager::update_ssm_prefill_results(
+void RequestManager::update_ssm_prefill_results(
     InferenceResult const &ssm_prefill_result) {
   // This function is called by update_inference_results when the
   // request_manager_status is PREFILLING and the prefill_model is SSM.
@@ -734,11 +754,6 @@ bool RequestManager::update_ssm_prefill_results(
   prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
 
   profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
-
-  if (prefill_request->ssm_cache_size == prefill_request->tokens.size()) {
-    return true;
-  }
-  return false;
 }
 
 BatchConfig RequestManager::prepare_next_batch() {
@@ -748,7 +763,12 @@ BatchConfig RequestManager::prepare_next_batch() {
         return prepare_llm_prefilling_batch();
       } else if (decoding_mode == SPECULATIVE_DECODING) {
         if (prefill_model == SSM) {
-          return prepare_ssm_prefilling_batch();
+          if (current_ssm_step == 0) {
+            return prepare_ssm_prefilling_batch();
+          } else {
+            // Return an empty batch config
+            return BatchConfig();
+          }
         } else if (prefill_model == LLM) {
           return prepare_llm_prefilling_batch();
         } else {
@@ -761,10 +781,13 @@ BatchConfig RequestManager::prepare_next_batch() {
     case DECODING:
       return prepare_decoding_batch();
     case SSM_SPEC:
-      if (current_speculation_step == 0) {
+      if (current_ssm_step == 0) {
         return prepare_first_spec_batch_config();
-      } else {
+      } else if (!ssm_completed) {
         return prepare_next_spec_batch_config();
+      } else {
+        // Return an empty batch config
+        return BatchConfig();
       }
     case LLM_VERIFY:
       return prepare_verify_batch_config();
@@ -987,7 +1010,7 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
   // information of the committed tokens into BatchConfig.TokensInfo.
   // 2. Maintain BatchConfig::RequestsInfo and all other fields of
   // BatchConfig.
-  assert(current_speculation_step == 0);
+  assert(current_ssm_step == 0);
 
   BatchConfig new_bc;
   new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
@@ -1064,7 +1087,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
   if (verbose) {
     std::cout << "\n############### prepare_next_spec_batch_config "
                  "###############\n";
-    std::cout << "Current tree depth: " << current_speculation_step + 1 << "\n";
+    std::cout << "Current tree depth: " << current_ssm_step + 1 << "\n";
   }
 
   // Prepare the next batch for existing requests
@@ -1090,7 +1113,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
 
     // Fill in the tokens
     TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
-    if (token_tree.tree_layers.size() <= current_speculation_step) {
+    if (token_tree.tree_layers.size() <= current_ssm_step) {
       // This request has no token to decode in this and the following small
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
@@ -1124,7 +1147,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
             new_bc.requestsInfo[request_index].first_token_index_in_request +
             child_index;
         new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-            request.tokens.size() - 1 + current_speculation_step;
+            request.tokens.size() - 1 + current_ssm_step;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
 
         new_bc.num_tokens++;
@@ -1352,9 +1375,8 @@ bool RequestManager::update_ssm_inference_results(
     InferenceResult const &ssm_inference_result) {
   // This function returns false if no tokens are added to the token tree,
   // which indicates that the ssm inference phase is done.
-  assert(current_speculation_step >= 0 &&
-         "The current speculation step should be no less than 0");
-  current_speculation_step++;
+  assert(current_ssm_step >= 1 &&
+         "The current speculation step should be no less than 1");
 
   int num_branches = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
   int result_index = 0;
@@ -1375,7 +1397,7 @@ bool RequestManager::update_ssm_inference_results(
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    if (current_speculation_step == 1) {
+    if (current_ssm_step == 1) {
       request.ssm_committed = true;
       // Check if both the KV cache of SSM and LLM are committed, because
       // sometimes the LLM KV cache is committed by a verifying batch config,
@@ -1389,7 +1411,7 @@ bool RequestManager::update_ssm_inference_results(
       request.ssm_cache_size = request.tokens.size();
     }
 
-    if (current_speculation_step == 1) {
+    if (current_ssm_step == 1) {
       init_bitmask_spec(guid);
     }
     append_bitmask(guid);
@@ -1398,8 +1420,7 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  return all_request_last_layer_empty ||
-         current_speculation_step > get_max_tree_depth();
+  return all_request_last_layer_empty;
 }
 
 /* --------- Bitmask Related Functions --------- */
@@ -1447,8 +1468,7 @@ void RequestManager::init_bitmask_spec(RequestGuid guid) {
   // 1. Clear the causal mask and add a root into it, because the tree is
   // currently empty but we have a root.
   // 2. Maintain all other fields.
-  assert(current_speculation_step == 1 &&
-         "The current speculation step should be 1");
+  assert(current_ssm_step == 1 && "The current speculation step should be 1");
   Request &request = all_requests[guid];
   request.causal_mask = BatchConfig::BitMask();
   // Set the mask for the root
@@ -1462,14 +1482,14 @@ void RequestManager::append_bitmask(RequestGuid guid) {
   // This method changes the bitmask in place
   // This method is called by update_ssm_inference_results(), after the new
   // tokens are added to the token tree
-  assert(current_speculation_step >= 1 &&
+  assert(current_ssm_step >= 1 &&
          "The current speculation step should be no less than 1");
 
   Request &request = all_requests[guid];
   BatchConfig::BitMask &bitmask = request.causal_mask;
   TokenTree &token_tree = request.speculative_token_trees[0];
 
-  if (token_tree.tree_layers.size() <= current_speculation_step) {
+  if (token_tree.tree_layers.size() <= current_ssm_step) {
     // This request has no token added in this and the following small model
     // inference steps, skip it
     return;
@@ -1807,84 +1827,107 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     im->init_operators_inference(ssm);
   }
 
-  InferenceResultFuture last_irf;
+  InferenceResultFuture irf_0;
   {
     // Initialize futures for incr decoding
-    InferenceResult ir;
-    last_irf = Future::from_value<InferenceResult>(ir);
+    InferenceResult ir_0;
+    irf_0 = Future::from_value<InferenceResult>(ir_0);
   }
 
   request_manager_status = PREFILLING;
   prefill_model = SSM;
 
-  // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
+  std::queue<InferenceResultFuture> infer_result_future_pipeline;
+  infer_result_future_pipeline.push(irf_0);
 
   while (!is_background_server_terminated()) {
-    // last_irf.get_void_result();
-    BatchConfigFuture bcf = get_next_batch_config(last_irf, ctx, runtime);
-    bcf.get_void_result();
-    // time_2 = Realm::Clock::current_time_in_microseconds();
-    // std::cout << "Iteration time: " << (time_2 - time_1) * 1e-3 << "ms"
-    //           << std::endl;
+    if (infer_result_future_pipeline.size() >= 4) {
+      // Block here to avoid launching too many batches
+      auto const &ir = infer_result_future_pipeline.front();
+      ir.get_void_result();
+    }
+    // deque finished batches
+    while (infer_result_future_pipeline.size() > 1) {
+      auto const &ir = infer_result_future_pipeline.front();
+      if (ir.is_ready()) {
+        infer_result_future_pipeline.pop();
+      } else {
+        break;
+      }
+    }
+
+    runtime->begin_trace(ctx, 12345 /*trace_id*/);
+    for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth(); ssm_step_i++) {
+      InferenceResultFuture irf = infer_result_future_pipeline.back();
+      BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime);
+      FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
+      infer_result_future_pipeline.push(fm.get_future(0));
+    }
+    InferenceResultFuture irf = infer_result_future_pipeline.back();
+    BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime);
+    FutureMap fm = im->inference(llm, 0, bcf);
+    infer_result_future_pipeline.push(fm.get_future(0));
+    runtime->end_trace(ctx, 12345 /*trace_id*/);
+  }
+}
+
+/*static*/
+void RequestManager::serve_spec_infer_sync(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  {
+    // Compile the llm
+    im->compile_model_and_allocate_buffer(llm);
+    assert(im->model_weights_loaders.find(llm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[llm]->load_weights(llm);
+    // init operators
+    im->init_operators_inference(llm);
+  }
+  for (size_t i = 0; i < get_num_ssms(); i++) {
+    // Compile the i-th ssm
+    FFModel *ssm = get_ssm_model(i);
+    im->compile_model_and_allocate_buffer(ssm);
+    assert(im->model_weights_loaders.find(ssm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[ssm]->load_weights(ssm);
+    // init operators
+    im->init_operators_inference(ssm);
+  }
 
-    // time_1 = Realm::Clock::current_time_in_microseconds();
+  InferenceResultFuture irf_0;
+  {
+    // Initialize futures for incr decoding
+    InferenceResult ir_0;
+    irf_0 = Future::from_value<InferenceResult>(ir_0);
+  }
+
+  request_manager_status = PREFILLING;
+  prefill_model = SSM;
+
+  while (!is_background_server_terminated()) {
+    BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime);
+    bcf.get_void_result();
     if ((request_manager_status == PREFILLING and prefill_model == LLM) or
         request_manager_status == LLM_VERIFY) {
-      //   std::cout << "Branch 1" << std::endl;
       runtime->begin_trace(ctx, 12345 /*trace_id*/);
       FutureMap fm = im->inference(llm, 0, bcf);
-      //   assert(fm.get_future_map_domain().get_volume() == 1);
-      last_irf = fm.get_future(0);
+      irf_0 = fm.get_future(0);
       runtime->end_trace(ctx, 12345 /*trace_id*/);
     } else if ((request_manager_status == PREFILLING and
                 prefill_model == SSM) or
                request_manager_status == SSM_SPEC) {
-      //   std::cout << "Branch 2" << std::endl;
       runtime->begin_trace(ctx, 23456 /*trace_id*/);
       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-      //   assert(fm.get_future_map_domain().get_volume() == 1);
-      last_irf = fm.get_future(0);
+      irf_0 = fm.get_future(0);
       runtime->end_trace(ctx, 23456 /*trace_id*/);
     } else {
       assert(false && "Invalid request manager status");
     }
   }
-
-  //   BatchConfigFuture bcf;
-
-  //   std::queue<InferenceResultFuture> infer_result_future_pipeline;
-  //   { infer_result_future_pipeline.push(last_irf); }
-
-  //   while (!is_background_server_terminated()) {
-  //     if (infer_result_future_pipeline.size() >= 4) {
-  //       // Block here to avoid launching too many batches
-  //       auto const &ir = infer_result_future_pipeline.front();
-  //       ir.get_void_result();
-  //     }
-  //     // deque finished batches
-  //     while (infer_result_future_pipeline.size() > 1) {
-  //       auto const &ir = infer_result_future_pipeline.front();
-  //       if (ir.is_ready()) {
-  //         infer_result_future_pipeline.pop();
-  //       } else {
-  //         break;
-  //       }
-  //     }
-
-  //     runtime->begin_trace(ctx, 12345 /*trace_id*/);
-  //     for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth();
-  //     ssm_step_i++) {
-  //       last_irf = infer_result_future_pipeline.back();
-  //       bcf = get_next_batch_config(last_irf, ctx, runtime);
-  //       FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-  //       infer_result_future_pipeline.push(fm.get_future(0));
-  //     }
-  //     last_irf = infer_result_future_pipeline.back();
-  //     bcf = get_next_batch_config(last_irf, ctx, runtime);
-  //     FutureMap fm = im->inference(llm, 0, bcf);
-  //     infer_result_future_pipeline.push(fm.get_future(0));
-  //     runtime->end_trace(ctx, 12345 /*trace_id*/);
-  //   }
 }
 
 void RequestManager::trigger_request_completion_future(
@@ -2112,7 +2155,7 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     assert(request.status == Request::RUNNING);
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
 
-    if (spec_token_tree.tree_layers.size() <= current_speculation_step) {
+    if (spec_token_tree.tree_layers.size() <= current_ssm_step) {
       // This request has no token added in this layer, skip it
       continue;
     }

From e674f7123322ce40266cca9554749179aab624ee Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 29 May 2024 01:18:28 -0400
Subject: [PATCH 290/667] Fixed load token task.

---
 src/runtime/request_manager.cu | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 326ebc5d2..ac5cc5e88 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -48,21 +48,23 @@ void RequestManager::load_tokens_task(
     printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 
-  for (int i = 0; i < batch_config->num_tokens; i++) {
-    dram_copy[i] = batch_config->tokensInfo[i].token_id;
+  if (batch_config->num_tokens > 0) {
+    for (int i = 0; i < batch_config->num_tokens; i++) {
+      dram_copy[i] = batch_config->tokensInfo[i].token_id;
+    }
+    TokenId *fb_ptr = helperGetTensorPointerWO<TokenId>(
+        regions[0], task->regions[0], FID_DATA, ctx, runtime);
+    Domain domain = runtime->get_index_space_domain(
+        ctx, task->regions[0].region.get_index_space());
+    assert(batch_config->num_tokens <= domain.get_volume());
+    cudaStream_t stream;
+    checkCUDA(get_legion_stream(&stream));
+    checkCUDA(cudaMemcpyAsync(fb_ptr,
+                              dram_copy,
+                              sizeof(TokenId) * batch_config->num_tokens,
+                              cudaMemcpyHostToDevice,
+                              stream));
   }
-  TokenId *fb_ptr = helperGetTensorPointerWO<TokenId>(
-      regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  Domain domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  assert(batch_config->num_tokens <= domain.get_volume());
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkCUDA(cudaMemcpyAsync(fb_ptr,
-                            dram_copy,
-                            sizeof(TokenId) * batch_config->num_tokens,
-                            cudaMemcpyHostToDevice,
-                            stream));
 }
 
 void RequestManager::load_batch_config_task(

From cf18e8a4c7b7d95ceabf8c2bc37490a26764ea4d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 29 May 2024 07:59:00 -0700
Subject: [PATCH 291/667] chore: minor output format for alignment checking

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 58ba7bf34..907e5e900 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -502,7 +502,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
         << std::endl;
   }
-  *os << output << std::endl << std::endl;
+  *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
 
   if (!output_filepath.empty()) {
     output_file.close();

From 0f66b083ca31b17212362dcc2915284ae91d6100 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 30 May 2024 02:00:42 -0400
Subject: [PATCH 292/667] Fixed a bug.

---
 src/runtime/request_manager.cc | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 58ba7bf34..14a4c02fd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -411,8 +411,17 @@ BatchConfig RequestManager::get_next_batch_config_task(
     Context ctx,
     Runtime *runtime) {
   RequestManager *rm = *((RequestManager **)task->args);
+  if (rm->request_manager_status == PREFILLING and rm->prefill_model == SSM and
+      rm->current_ssm_step != 0) {
+    // Return an empty batch config
+    return rm->get_next_batch_config(InferenceResult());
+  } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) {
+    return rm->get_next_batch_config(InferenceResult());
+  }
+
   InferenceResult const &result =
       Future(task->futures[0]).get_result<InferenceResult>();
+  int t_1 = Realm::Clock::current_time_in_microseconds();
   return rm->get_next_batch_config(result);
 }
 
@@ -470,12 +479,12 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   std::string output =
       this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
 
-  std::cout << "Request " << guid << " completed: " << std::endl
-            << "<bos>" << output;
+  std::cout << "Request " << guid << " completed: " << std::endl << std::endl;
+  std::cout << "<bos>" << output;
   if (eos_rit != request.tokens.rend()) {
     std::cout << "<eos>";
   }
-  std::cout << std::endl;
+  std::cout << std::endl << std::endl;
   ProfileInfo profile_info = profiling_requests[guid];
 
   std::ostream *os = &std::cout;
@@ -491,18 +500,21 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     }
   }
   *os << "Request " << guid << " profiling: " << std::endl;
-  *os << "Decoding time: "
-      << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
-      << "ms" << std::endl;
+  if (profile_info.start_decoding_time != 0) {
+    *os << "Decoding time: "
+        << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
+        << " ms" << std::endl;
+  } else {
+    *os << "Decoding time: 0 ms" << std::endl;
+  }
   *os << "Total time: "
-      << (profile_info.finish_time - profile_info.start_time) * 1e-3 << "ms"
+      << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
       << std::endl;
   *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
   if (decoding_mode == SPECULATIVE_DECODING) {
     *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
         << std::endl;
   }
-  *os << output << std::endl << std::endl;
 
   if (!output_filepath.empty()) {
     output_file.close();
@@ -671,6 +683,10 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
     prefill_completed = true;
 
+    if (prefill_request->tokens.back() == eos_token_id) {
+      request_complete_clean_up(prefill_request->batch_index);
+    }
+
     if (decoding_mode == SPECULATIVE_DECODING) {
       // Add the last token to the token tree
       prefill_request->committed_tokens.push_back(

From 6c608aca525d96e43e5886436803cc193ea1e098 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 29 May 2024 23:20:06 -0700
Subject: [PATCH 293/667] chore: minor output format for alignment checking

---
 src/runtime/request_manager.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 14a4c02fd..72577dc75 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -515,6 +515,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
         << std::endl;
   }
+  *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
 
   if (!output_filepath.empty()) {
     output_file.close();

From a654262dd5be460b3f9cb3ef6168fdcc74bce645 Mon Sep 17 00:00:00 2001
From: Remi <54138269+Flechman@users.noreply.github.com>
Date: Thu, 30 May 2024 11:11:51 -0400
Subject: [PATCH 294/667] Update write_to_output_file()

---
 src/runtime/request_manager.cc | 78 +++++++++++-----------------------
 1 file changed, 25 insertions(+), 53 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4bded8280..3c5732c1d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -33,17 +33,22 @@ using tokenizers::Tokenizer;
 LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
 void write_to_output_file(std::string const &output_filepath, std::string const &str) {
+  std::ostream *os = &std::cout;
+  std::ofstream output_file;
   if (!output_filepath.empty()) {
-    std::ofstream outputFile(output_filepath, std::ios::app);
-    if (outputFile.is_open()) {
-      outputFile << str << std::endl;
-      outputFile.close();
+    output_file.open(output_filepath, std::ios::app);
+    if (output_file.is_open()) {
+      os = &output_file;
     } else {
       std::cout << "Unable to open the output file: " << output_filepath
                 << std::endl;
       assert(false);
     }
   }
+  *os << str << std::endl;
+  if (!output_filepath.empty()) {
+    output_file.close();
+  }
 }
 
 std::string LoadBytesFromFile(std::string const &path) {
@@ -500,57 +505,24 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     std::cout << "<eos>";
   }
   std::cout << std::endl << std::endl;
-  ProfileInfo profile_info = profiling_requests[guid];
-
-  // TODO: merge write_to_output_file() with *os logic
-  std::ostream *os = &std::cout;
-  std::ofstream output_file;
-  if (!output_filepath.empty()) {
-    output_file.open(output_filepath, std::ios::app);
-    if (output_file.is_open()) {
-      os = &output_file;
-    } else {
-      std::cout << "Unable to open the output file: " << output_filepath
-                << std::endl;
-      assert(false);
-    }
-  }
-  *os << "Request " << guid << " profiling: " << std::endl;
-  if (profile_info.start_decoding_time != 0) {
-    *os << "Decoding time: "
-        << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
-        << " ms" << std::endl;
-  } else {
-    *os << "Decoding time: 0 ms" << std::endl;
-  }
-  *os << "Total time: "
-      << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
-      << std::endl;
-  *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
+  RequestProfileInfo profile_info = profiling_requests[guid];
+  std::string str = "[" + std::to_string(guid) + "] Request completed:" + 
+                      " decoding_time_ms(" + std::to_string(
+                        (profile_info.finish_time-
+                          profile_info.start_decoding_time)
+                          *1e-3) + ")" + 
+                      " total_time_ms(" + std::to_string(
+                        (profile_info.finish_time-
+                          profile_info.start_time)
+                          *1e-3) + ")" + 
+                      " LLM_decoding_steps(" + std::to_string(
+                        profile_info.llm_decoding_steps) 
+                        + ")";
   if (decoding_mode == SPECULATIVE_DECODING) {
-    *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
-        << std::endl;
+    str = str + " SSM_decoding_steps(" + std::to_string(
+      profile_info.ssm_decoding_steps) 
+      + ")";
   }
-  *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
-
-  if (!output_filepath.empty()) {
-    output_file.close();
-  }
-  std::string str = "[" + std::to_string(guid) + "] Request completed: " + 
-                      "decoding_time_ms(" + std::to_string(
-                        (profiling_requests[guid].finish_time-
-                          profiling_requests[guid].start_decoding_time)
-                          *1e-3) + ") " + 
-                      "total_time_ms(" + std::to_string(
-                        (profiling_requests[guid].finish_time-
-                          profiling_requests[guid].start_time)
-                          *1e-3) + ") " + 
-                      "LLM_decoding_steps(" + std::to_string(
-                        profiling_requests[guid].llm_decoding_steps) 
-                        + ") " + 
-                      "SSM_decoding_steps(" + std::to_string(
-                        profiling_requests[guid].ssm_decoding_steps) 
-                        + ")";
   write_to_output_file(output_filepath, str);
 
   trigger_request_completion_future(guid);

From 31abfbfe56862daf7b195351eecfbe7d3c4c90d1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 30 May 2024 12:25:22 -0700
Subject: [PATCH 295/667] feat: partial enable cudaGraph

---
 src/ops/fused.cu | 49 ++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 03838cd49..a89b16d43 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -612,9 +612,10 @@ __host__ void
   //graph_params.Print();
   // int shard_id = task->index_point.point_data[0];
 
+  bool use_cuda_graph = (bc->prompt_phase == false && bc->get_mode() == TREE_SEARCH_MODE);
   bool captured = false;
 
-  if(metas->graph_collections.count(graph_params)  != 0) {
+  if(use_cuda_graph && metas->graph_collections.count(graph_params)  != 0) {
     captured = true;
     instance = metas->graph_collections[graph_params];
     // if (cudaGraphExecUpdate(instance, graph, NULL, &updateResult) != cudaSuccess) {
@@ -630,10 +631,12 @@ __host__ void
     // }
   }
 
-  // if (!captured) {
-  //   cudaGraph_t graph;
-  //   {    
-  //     cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+  if (!captured) {
+    cudaGraph_t graph;
+    {    
+      if (use_cuda_graph) {
+        cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+      }
       int ioff = 0, woff = 0, ooff = 0;
       for (int op = 0; op < fused->numOperators; op++) {
         // Domain my_id[MAX_NUM_INPUTS];
@@ -1158,22 +1161,28 @@ __host__ void
       // for (int i = 0; i < fused->numOutputs; i++)
       //   print_tensor<float>(output_ptr[i], output_domain[i].get_volume(),
       //   "[Fused:forward:output]");
-  //     cudaStreamEndCapture(stream, &graph);
-  //   }
-  //   cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
-  //   metas->graph_collections[graph_params] = instance;
-  //   // if(shard_id == 0) {
-  //   //   printf("*************start cudaGraphInstantiate**********\n");
-  //   //   graph_params.Print();
-  //   //   // bc->print();
-  //   //   printf("*************end cudaGraphInstantiate**********\n");
-  //   // }
-  //   cudaGraphDestroy(graph);
-  // }
+      if (use_cuda_graph) {
+        cudaStreamEndCapture(stream, &graph);
+      }
+    }
+    if (use_cuda_graph) { 
+      cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+      metas->graph_collections[graph_params] = instance;
+      // if(shard_id == 0) {
+      //   printf("*************start cudaGraphInstantiate**********\n");
+      //   graph_params.Print();
+      //   // bc->print();
+      //   printf("*************end cudaGraphInstantiate**********\n");
+      // }
+      cudaGraphDestroy(graph);
+    }
+  }
 
-  // assert(metas->graph_collections.find(graph_params) !=
-  //       metas->graph_collections.end());
-  // cudaGraphLaunch(instance, stream);
+  if (use_cuda_graph) {
+    assert(metas->graph_collections.find(graph_params) !=
+          metas->graph_collections.end());
+    cudaGraphLaunch(instance, stream);
+  }
 }
 
 /*

From c9bff3c3425a8bc55f200b1c0494a4b429b6a2e6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 31 May 2024 00:07:28 -0700
Subject: [PATCH 296/667] chore: align tree_inc_attn w/ inc_attn

---
 src/ops/tree_inc_multihead_self_attention.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 273de4d5f..0aaa0f1c8 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -112,7 +112,7 @@ __global__ void compute_attention_kernel_fused_kernel(
   extern __shared__ char smem_[];
 
   float *qk_smem = reinterpret_cast<float *>(smem_);
-  float *out_smem = reinterpret_cast<float *>(smem_ + qk_smem_sz);
+  float *out_smem = reinterpret_cast<float *>(smem_);
 
   float qk_max = -FLT_MAX;
 
@@ -160,7 +160,7 @@ __global__ void compute_attention_kernel_fused_kernel(
     for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
       K_vec k[K_VECS_PER_THREAD];
       int const ti_circ = ti % max_seq_length;
-
+#pragma unroll
       for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
         int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
         if (ti < tlength) {

From afb9624dada4841a521efe1f583f2b27eef4bc51 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 31 May 2024 00:56:34 -0700
Subject: [PATCH 297/667] chore: time measurement in tree_inc

---
 src/ops/tree_inc_multihead_self_attention.cu | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 0aaa0f1c8..2b178cc2b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -849,10 +849,16 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
           BatchConfig::max_spec_tree_token_num(),
       m->hidden_size);
 
+  // cudaEvent_t t_start, t_end;
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
+
   dim3 grid(m->num_q_heads, bc->num_active_requests());
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
   // 0->qk production size, 1->total shared size
+  // per_head_size: 128, thd_per_v:32, prompt_phase: 0
   int smem_sz[2];
   if (per_head_size == 64) {
     constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
@@ -865,6 +871,15 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   } else {
     assert(false && "a unsupported head size");
   }
+
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // float elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+
 }
 
 template <typename DT>

From 042d6a97a102b0a7a303ebe8e4e7e67c4e8ef9af Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 31 May 2024 05:34:34 -0400
Subject: [PATCH 298/667] Move the bitmask to shared memory to improve
 performance.

---
 .../inc_multihead_self_attention_utils.cuh    |  4 +-
 src/ops/spec_inc_multihead_self_attention.cu  | 83 +++++++++----------
 src/ops/tree_inc_multihead_self_attention.cu  | 74 +++++++++--------
 3 files changed, 78 insertions(+), 83 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 546d5e9a9..65e4bc962 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -493,7 +493,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head,
   }
 
   // todo fix this
-  int max_qk_length = max_query_length;
+  int max_qk_length = max_total_length;
 
   // The amount of shared memory needed to store the Q*K^T values in float.
   size_t qk_sz = div_up(max_qk_length + 1, 4) * 16;
@@ -521,7 +521,7 @@ struct threads_per_value_t {
 };
 
 #define test_bit(bit_mask, idx, pos)                                           \
-  (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
+  (((bit_mask)[idx][(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
 } // namespace FlexFlow
 #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index c89f5b2bd..6342a5191 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -83,25 +83,23 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     }
   }
 
-  // threads converge
-  //   __syncthreads();
-
-  // request_idx = re
-
-  // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
-  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
+  int non_tree_cache_size =
+      causalMask[requext_idx_in_batch].non_tree_cache_size;
+  int tree_or_prompt_size =
+      causalMask[requext_idx_in_batch].tree_or_prompt_size;
+  int current_layer_size = causalMask[requext_idx_in_batch].current_layer_size;
+
+  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
+                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
+  for (int i = tidx; i < tree_or_prompt_size; i += THREADS_PER_BLOCK) {
+    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
+      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
+    }
+  }
 
   int const first_step = 0;
 
-  // int const tlength =
-  //     request_infos[requext_idx_in_batch].first_token_depth_in_request +
-  //     request_infos[requext_idx_in_batch].num_tokens_in_batch;
-
-  //   int const totalCacheSize = bitmask->non_tree_cache_size +
-  //                              bitmask->tree_or_prompt_size +
-  //                              bitmask->prompt_size - 1;
-  int const totalCacheSize =
-      bitmask->non_tree_cache_size + bitmask->tree_or_prompt_size;
+  int const totalCacheSize = non_tree_cache_size + tree_or_prompt_size;
 
   int const first_token_idx =
       request_infos[requext_idx_in_batch].first_token_offset_in_batch;
@@ -151,11 +149,7 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
           ii * THREADS_PER_KEY * K_VEC_SIZE);
     }
 
-    // int const query_token = bitmask->prompt_size +
-    // bitmask->tree_or_prompt_size
-    // -
-    //                         1 - tree_branch_num + qi;
-    int const query_token = bitmask->tree_or_prompt_size - tree_branch_num + qi;
+    int const query_token = tree_or_prompt_size - tree_branch_num + qi;
 
     __syncthreads();
     for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
@@ -176,12 +170,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
       if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) {
         // todo add alobi here
         // bool const mask = ti_circ >= totalCacheSize;
-        bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                           (!test_bit(bitmask->bit_mask,
-                                      query_token,
-                                      ti - bitmask->non_tree_cache_size)));
-        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-        //   (1 << query_token))));
+        bool const mask =
+            (ti >= non_tree_cache_size &&
+             (!test_bit(bit_mask, query_token, ti - non_tree_cache_size)));
 
         // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) {
         //   printf("spec inc attn qkqkqk  request id %d,  %.10f, %d\n",
@@ -231,12 +222,9 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
     float exp_sum = 0.f;
     for (int ti = first_step + tidx; ti < totalCacheSize;
          ti += THREADS_PER_BLOCK) {
-      bool const mask = (ti >= bitmask->non_tree_cache_size &&
-                         (!test_bit(bitmask->bit_mask,
-                                    query_token,
-                                    ti - bitmask->non_tree_cache_size)));
-      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-      //   (1 << query_token))));
+      bool const mask =
+          (ti >= non_tree_cache_size &&
+           (!test_bit(bit_mask, query_token, ti - non_tree_cache_size)));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -397,19 +385,22 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                                                       Dh_MAX,                  \
                                                       THDS_PER_KEY,            \
                                                       THREADS_PER_VALUE>       \
-      <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                             \
-          static_cast<DT *>(m->devQKVProjArray),                               \
-          static_cast<DT *>(m->keyCache),                                      \
-          static_cast<DT *>(m->valueCache),                                    \
-          output_ptr,                                                          \
-          scale,                                                               \
-          BatchConfig::max_sequence_length() +                                 \
-              BatchConfig::max_spec_tree_token_num(),                          \
-          m->qProjSize,                                                        \
-          m->hidden_size,                                                      \
-          m->request_infos,                                                    \
-          m->causalMask,                                                       \
-          m->request_available)
+      <<<grid,                                                                 \
+         THDS_PER_BLOCK,                                                       \
+         smem_sz + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM *                      \
+                       BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 8,               \
+         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
+                   static_cast<DT *>(m->keyCache),                             \
+                   static_cast<DT *>(m->valueCache),                           \
+                   output_ptr,                                                 \
+                   scale,                                                      \
+                   BatchConfig::max_sequence_length() +                        \
+                       BatchConfig::max_spec_tree_token_num(),                 \
+                   m->qProjSize,                                               \
+                   m->hidden_size,                                             \
+                   m->request_infos,                                           \
+                   m->causalMask,                                              \
+                   m->request_available)
 
 template <typename DT>
 void compute_spec_inc_attention_kernel_generation(
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 273de4d5f..05be94853 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -99,8 +99,16 @@ __global__ void compute_attention_kernel_fused_kernel(
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
   int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
-  // BatchConfig::BitMask bitmask = causalMask[requext_idx_in_batch];
-  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
+  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
+                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
+  for (int i = tidx; i < qlength; i += THREADS_PER_BLOCK) {
+    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
+      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
+    }
+  }
+
+  int non_tree_cache_size =
+      causalMask[requext_idx_in_batch].non_tree_cache_size;
 
   int const first_token_idx =
       request_infos[requext_idx_in_batch].first_token_offset_in_batch;
@@ -173,13 +181,10 @@ __global__ void compute_attention_kernel_fused_kernel(
 
       if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
         bool const mask =
-            prompt_phase ? (qi + q_start < ti)
-                         : (ti >= bitmask->non_tree_cache_size &&
-                            (!test_bit(bitmask->bit_mask,
-                                       qi,
-                                       ti - bitmask->non_tree_cache_size)));
-        // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-        //    (1 << qi))));
+            prompt_phase
+                ? (qi + q_start < ti)
+                : (ti >= non_tree_cache_size &&
+                   (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -233,14 +238,10 @@ __global__ void compute_attention_kernel_fused_kernel(
 
     float exp_sum = 0.f;
     for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      bool const mask = prompt_phase
-                            ? (q_start + qi < ti)
-                            : (ti >= bitmask->non_tree_cache_size &&
-                               (!test_bit(bitmask->bit_mask,
-                                          qi,
-                                          ti - bitmask->non_tree_cache_size)));
-      // (!(bitmask->mask[ti - bitmask->non_tree_cache_size] &
-      //    (1 << qi))));
+      bool const mask =
+          prompt_phase ? (q_start + qi < ti)
+                       : (ti >= non_tree_cache_size &&
+                          (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -803,24 +804,27 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
                                         Dh_MAX,                                \
                                         THDS_PER_KEY,                          \
                                         THDS_PER_VALUE>                        \
-      <<<grid, THDS_PER_BLOCK, smem_sz[1], stream>>>(                          \
-          static_cast<DT *>(m->devQKVProjArray),                               \
-          static_cast<DT *>(m->keyCache),                                      \
-          static_cast<DT *>(m->valueCache),                                    \
-          output_ptr,                                                          \
-          scale,                                                               \
-          BatchConfig::max_sequence_length() +                                 \
-              BatchConfig::max_spec_tree_token_num(),                          \
-          BatchConfig::max_tokens_per_batch(),                                 \
-          m->qProjSize,                                                        \
-          m->hidden_size,                                                      \
-          m->request_infos,                                                    \
-          m->num_q_heads,                                                      \
-          bc->num_active_requests(),                                           \
-          m->causalMask,                                                       \
-          m->request_available,                                                \
-          smem_sz[0],                                                          \
-          prompt_phase)
+      <<<grid,                                                                 \
+         THDS_PER_BLOCK,                                                       \
+         smem_sz[1] + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM *                   \
+                          BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 8,            \
+         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
+                   static_cast<DT *>(m->keyCache),                             \
+                   static_cast<DT *>(m->valueCache),                           \
+                   output_ptr,                                                 \
+                   scale,                                                      \
+                   BatchConfig::max_sequence_length() +                        \
+                       BatchConfig::max_spec_tree_token_num(),                 \
+                   BatchConfig::max_tokens_per_batch(),                        \
+                   m->qProjSize,                                               \
+                   m->hidden_size,                                             \
+                   m->request_infos,                                           \
+                   m->num_q_heads,                                             \
+                   bc->num_active_requests(),                                  \
+                   m->causalMask,                                              \
+                   m->request_available,                                       \
+                   smem_sz[0],                                                 \
+                   prompt_phase)
 
 template <typename DT>
 void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,

From b68ce45b290e79df87e24b4e034cc10bb1cb7d78 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 31 May 2024 05:35:57 -0400
Subject: [PATCH 299/667] Removed an unused line.

---
 src/runtime/request_manager.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 14a4c02fd..a2ce4669f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -421,7 +421,6 @@ BatchConfig RequestManager::get_next_batch_config_task(
 
   InferenceResult const &result =
       Future(task->futures[0]).get_result<InferenceResult>();
-  int t_1 = Realm::Clock::current_time_in_microseconds();
   return rm->get_next_batch_config(result);
 }
 

From 10c4f655a484e37f649674b1c80b017fd5800cec Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 31 May 2024 03:34:35 -0700
Subject: [PATCH 300/667] chore: remove unused

---
 src/ops/spec_inc_multihead_self_attention.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 6342a5191..6331b3acd 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -87,7 +87,6 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
       causalMask[requext_idx_in_batch].non_tree_cache_size;
   int tree_or_prompt_size =
       causalMask[requext_idx_in_batch].tree_or_prompt_size;
-  int current_layer_size = causalMask[requext_idx_in_batch].current_layer_size;
 
   __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
                               [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];

From d4b0bafcd8ea0710538c70007d6bcca7ef28a6aa Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 31 May 2024 04:09:51 -0700
Subject: [PATCH 301/667] chore: minor

---
 .../ops/kernels/inc_multihead_self_attention_utils.cuh         | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu                   | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 65e4bc962..481243867 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -512,7 +512,7 @@ inline void smem_size_in_bytes_tree(int hidden_size_per_head,
   size_t red_sz = rows_per_red * hidden_size_per_head * sizeof(float) / 2;
   // The max.
   shared_mem[0] = qk_sz;
-  shared_mem[1] = softmax_sz + red_sz + q_size;
+  shared_mem[1] = max(softmax_sz, red_sz) + q_size;
 }
 
 template <typename T, int Dh>
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index cee2fe7f4..c022fabcf 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -806,8 +806,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
                                         THDS_PER_VALUE>                        \
       <<<grid,                                                                 \
          THDS_PER_BLOCK,                                                       \
-         smem_sz[1] + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM *                   \
-                          BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 8,            \
+         smem_sz[1],                                                           \
          stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
                    static_cast<DT *>(m->keyCache),                             \
                    static_cast<DT *>(m->valueCache),                           \

From 653069b1e0408be4e2b682b375032be4366feb1a Mon Sep 17 00:00:00 2001
From: Remi Delacourt <rdelacou@catalyst-cluster.cs.cmu.edu>
Date: Fri, 31 May 2024 10:14:11 -0400
Subject: [PATCH 302/667] Fix raft patch for Docker

---
 config/config.linux | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/config/config.linux b/config/config.linux
index b2c1e2703..397c80924 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,7 +111,9 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
-patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/raft.patch
+echo $(dirname $0)
+echo "====================="
+patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/../config/raft.patch
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc

From 040525bce4841f2d276bea12f19118231da36840 Mon Sep 17 00:00:00 2001
From: Remi <54138269+Flechman@users.noreply.github.com>
Date: Fri, 31 May 2024 11:36:32 -0400
Subject: [PATCH 303/667] Update config.linux

---
 config/config.linux | 2 --
 1 file changed, 2 deletions(-)

diff --git a/config/config.linux b/config/config.linux
index 397c80924..873f74783 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,8 +111,6 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
-echo $(dirname $0)
-echo "====================="
 patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/../config/raft.patch
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then

From 67117e208beb4f7c0a7989bf4399604de28daff3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 31 May 2024 23:24:19 -0400
Subject: [PATCH 304/667] Added a copy constructor for BatchConfig.

---
 include/flexflow/batch_config.h |  1 +
 src/runtime/batch_config.cc     | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 5ac175e00..e0f02ff20 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -33,6 +33,7 @@ class BatchConfig {
   using TokenId = int;
   BatchConfig(InferenceMode inference_mode = INC_DECODING_MODE,
               int model_id = 0);
+  BatchConfig(BatchConfig const &other);
   int num_active_requests() const;
   int num_active_tokens() const;
   static int max_requests_per_batch();
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index efbf82ab0..15b14e547 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -33,6 +33,31 @@ BatchConfig::BatchConfig(InferenceMode inference_mode_, int model_id_)
   // Other fields are already initialized to proper value.
 }
 
+BatchConfig::BatchConfig(BatchConfig const &rhs) {
+  model_id = rhs.model_id;
+  inference_mode = rhs.inference_mode;
+  num_available_requests = rhs.num_available_requests;
+  num_tokens = rhs.num_tokens;
+  prompt_phase = rhs.prompt_phase;
+  num_tokens_to_commit = rhs.num_tokens_to_commit;
+  for (int token_idx = 0; token_idx < num_tokens; token_idx++) {
+    tokensInfo[token_idx] = rhs.tokensInfo[token_idx];
+  }
+  for (int request_idx = 0; request_idx < max_requests_per_batch();
+       request_idx++) {
+    if (rhs.request_available[request_idx]) {
+      request_available[request_idx] = true;
+      requestsInfo[request_idx] = rhs.requestsInfo[request_idx];
+      causalMask[request_idx] = rhs.causalMask[request_idx];
+    }
+  }
+  for (int committed_token_idx = 0; committed_token_idx < num_tokens_to_commit;
+       committed_token_idx++) {
+    committed_tokens[committed_token_idx] =
+        rhs.committed_tokens[committed_token_idx];
+  }
+}
+
 /*static*/
 BatchConfig const *BatchConfig::from_future(BatchConfigFuture const &future) {
   return static_cast<BatchConfig const *>(

From 3edf56f4d4a165fcb7d4a3105e86ddee21d70196 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 1 Jun 2024 14:42:43 -0400
Subject: [PATCH 305/667] Optimized mask loading.

---
 src/ops/spec_inc_multihead_self_attention.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 6342a5191..40a631ab4 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -88,10 +88,12 @@ __global__ void compute_spec_inc_attention_kernel_generation_kernel(
   int tree_or_prompt_size =
       causalMask[requext_idx_in_batch].tree_or_prompt_size;
   int current_layer_size = causalMask[requext_idx_in_batch].current_layer_size;
+  int start_offset = tree_or_prompt_size - current_layer_size;
 
   __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
                               [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
-  for (int i = tidx; i < tree_or_prompt_size; i += THREADS_PER_BLOCK) {
+  for (int i = start_offset + tidx; i < tree_or_prompt_size;
+       i += THREADS_PER_BLOCK) {
     for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
       bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
     }

From 6ac1d7a41856cbc2275405e1d9791beb2287c4c9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 1 Jun 2024 18:41:32 -0400
Subject: [PATCH 306/667] Removed the parameter speculative_decoding from
 arg_top_k. Added a parameter renormalize to arg_top_k.

---
 include/flexflow/flexflow_c.h          |   2 +-
 include/flexflow/model.h               |  12 +-
 include/flexflow/ops/arg_topk.h        |   6 +-
 include/flexflow/ops/arg_topk_params.h |   2 +-
 inference/models/llama.cc              |   2 +-
 inference/models/opt.cc                |   2 +-
 inference/models/starcoder.cc          |   2 +-
 src/c/flexflow_c.cc                    |   5 +-
 src/ops/arg_topk.cc                    | 146 +++++++-----------
 src/ops/arg_topk.cu                    | 197 +++++++++++--------------
 10 files changed, 160 insertions(+), 216 deletions(-)

diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index aec5b0d57..0a6eebb18 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -571,7 +571,7 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
                                                flexflow_tensor_t const input_,
                                                int k,
                                                bool sorted,
-                                               bool speculative_decoding,
+                                               bool renormalize,
                                                char const *name);
 
 // flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 0346cf5cf..2f2706693 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -681,16 +681,16 @@ class FFModel {
              bool sorted,
              char const *name = NULL);
   Tensor gumbel_top_k(Tensor const input,
-                   // Tensor *outputs,
-                   int k,
-                   bool sorted,
-                   bool speculative_decoding,
-                   char const *name = NULL);
+                      // Tensor *outputs,
+                      int k,
+                      bool sorted,
+                      bool speculative_decoding,
+                      char const *name = NULL);
   Tensor arg_top_k(Tensor const input,
                    // Tensor *outputs,
                    int k,
                    bool sorted,
-                   bool speculative_decoding,
+                   bool renormalize,
                    char const *name = NULL);
   Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL);
   Tensor sampling(Tensor const input, float top_p, char const *name = NULL);
diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 06bd4b84f..721ccd501 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -13,7 +13,7 @@ class ArgTopKMeta : public OpMeta {
 public:
   bool sorted;
   int k;
-  bool speculative_decoding;
+  bool renormalize;
   Realm::RegionInstance reserveInst;
   void *half_precision_output;
   int max_input_size;
@@ -32,7 +32,7 @@ class ArgTopK : public Op {
           ParallelTensor const input,
           int k,
           bool sorted,
-          bool speculative_decoding,
+          bool renormalize,
           char const *name);
   ArgTopK(FFModel &model,
           LayerID const &layer_guid,
@@ -109,7 +109,7 @@ class ArgTopK : public Op {
 public:
   int k;
   bool sorted;
-  bool speculative_decoding;
+  bool renormalize;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/arg_topk_params.h b/include/flexflow/ops/arg_topk_params.h
index b2876c011..306ce9dd1 100644
--- a/include/flexflow/ops/arg_topk_params.h
+++ b/include/flexflow/ops/arg_topk_params.h
@@ -11,7 +11,7 @@ struct ArgTopKParams {
   LayerID layer_guid;
   int k;
   bool sorted;
-  bool speculative_decoding;
+  bool renormalize;
   char name[MAX_OPNAME];
   bool is_valid(ParallelTensorShape const &) const;
 };
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 365722578..19bdd85fd 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -249,7 +249,7 @@ void LLAMA::create_llama_model(FFModel &ff,
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(dense, -1);
-    output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, true);
+    output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, false);
     // output = ff.top_k(softmax, )
   } else {
     // Tensor softmax = ff.softmax(dense, -1);
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index c29b53d10..9c563f9c2 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -248,7 +248,7 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
-    output = ff.arg_top_k(softmax, opt_config.k_of_arg_topk, false, true);
+    output = ff.arg_top_k(softmax, opt_config.k_of_arg_topk, false, false);
   } else {
     // output = ff.arg_top_k(lm_head, /*k=*/1, false);
     output = ff.argmax(lm_head, /*beam_Search*/ false);
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 0eaf8731a..3bc150865 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -208,7 +208,7 @@ void STARCODER::create_starcoder_model(
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(lm_head, -1);
     output =
-        ff.arg_top_k(softmax, startcoder_config.k_of_arg_topk, false, true);
+        ff.arg_top_k(softmax, startcoder_config.k_of_arg_topk, false, false);
   } else {
     // Tensor softmax = ff.softmax(dense, -1);
     if (generationConfig.do_sample) {
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 72a9f66d3..455cb131c 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1490,12 +1490,11 @@ flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
                                                flexflow_tensor_t const input_,
                                                int k,
                                                bool sorted,
-                                               bool speculative_decoding,
+                                               bool renormalize,
                                                char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
-  Tensor tensor =
-      handle->arg_top_k(input, k, sorted, speculative_decoding, name);
+  Tensor tensor = handle->arg_top_k(input, k, sorted, name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 83813b578..1ebf1257c 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -51,7 +51,7 @@ using PCG::Node;
 Tensor FFModel::arg_top_k(Tensor const input,
                           int k,
                           bool sorted,
-                          bool speculative_decoding,
+                          bool renormalize,
                           char const *name) {
   Layer *li = new Layer(this,
                         OP_ARG_TOPK,
@@ -59,7 +59,7 @@ Tensor FFModel::arg_top_k(Tensor const input,
                         name,
                         1 /*inputs*/,
                         0 /*weights*/,
-                        speculative_decoding ? 2 : 1 /*outputs*/,
+                        2 /*outputs*/,
                         input);
   {
     int numdims = input->num_dims;
@@ -72,14 +72,12 @@ Tensor FFModel::arg_top_k(Tensor const input,
     //     numdims, dims, input->data_type, li, 0, true /*create_grad*/);
     li->outputs[0] = create_tensor_legion_ordering(
         numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
-    if (speculative_decoding) {
-      li->outputs[1] = create_tensor_legion_ordering(
-          numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
-    }
+    li->outputs[1] = create_tensor_legion_ordering(
+        numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
   }
   li->add_int_property("k", k);
   li->add_int_property("sorted", sorted);
-  li->add_int_property("speculative_decoding", speculative_decoding);
+  li->add_int_property("renormalize", renormalize);
   layers.push_back(li);
   // outputs[0] = li->outputs[0];
   // outputs[1] = li->outputs[1];
@@ -95,23 +93,18 @@ Op *ArgTopK::create_operator_from_layer(
   int k = value;
   layer->get_int_property("sorted", value);
   bool sorted = (bool)value;
-  layer->get_int_property("speculative_decoding", value);
-  bool speculative_decoding = (bool)value;
-
-  return new ArgTopK(model,
-                     layer->layer_guid,
-                     inputs[0],
-                     k,
-                     sorted,
-                     speculative_decoding,
-                     layer->name);
+  layer->get_int_property("renormalize", value);
+  bool renormalize = (bool)value;
+
+  return new ArgTopK(
+      model, layer->layer_guid, inputs[0], k, sorted, renormalize, layer->name);
 }
 
 ArgTopKParams ArgTopK::get_params() const {
   ArgTopKParams params;
   params.k = this->k;
   params.sorted = this->sorted;
-  params.speculative_decoding = this->speculative_decoding;
+  params.renormalize = this->renormalize;
   if (this->name != nullptr) {
     strcpy(params.name, this->name);
   }
@@ -125,7 +118,7 @@ bool ArgTopKParams::is_valid(ParallelTensorShape const &) const {
 
 bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) {
   return lhs.k == rhs.k && lhs.sorted == rhs.sorted &&
-         lhs.speculative_decoding == rhs.speculative_decoding;
+         lhs.renormalize == rhs.renormalize;
 }
 
 ArgTopK::ArgTopK(FFModel &model,
@@ -133,7 +126,7 @@ ArgTopK::ArgTopK(FFModel &model,
                  ParallelTensor const _input,
                  int _k,
                  bool _sorted,
-                 bool _speculative_decoding,
+                 bool _renormalize,
                  char const *name)
     : Op(model,
          OP_ARG_TOPK,
@@ -141,9 +134,9 @@ ArgTopK::ArgTopK(FFModel &model,
          name,
          1 /*inputs*/,
          0 /*weights*/,
-         _speculative_decoding ? 2 : 1 /*outputs*/,
+         2 /*outputs*/,
          _input),
-      k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) {
+      k(_k), sorted(_sorted), renormalize(_renormalize) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
   int numdim = inputs[0]->num_dims;
@@ -158,10 +151,8 @@ ArgTopK::ArgTopK(FFModel &model,
 
   outputs[0] = model.create_parallel_tensor_legion_ordering(
       numdim, dims, DT_INT32, this, 0 /*owner_idx*/);
-  if (_speculative_decoding) {
-    outputs[1] = model.create_parallel_tensor_legion_ordering(
-        numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
-  }
+  outputs[1] = model.create_parallel_tensor_legion_ordering(
+      numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
 }
 
 ArgTopK::ArgTopK(FFModel &model,
@@ -173,7 +164,7 @@ ArgTopK::ArgTopK(FFModel &model,
               input,
               other.k,
               other.sorted,
-              other.speculative_decoding,
+              other.renormalize,
               other.name) {}
 
 ArgTopK::ArgTopK(FFModel &model,
@@ -185,7 +176,7 @@ ArgTopK::ArgTopK(FFModel &model,
               input,
               params.k,
               params.sorted,
-              params.speculative_decoding,
+              params.renormalize,
               params.name) {}
 
 void ArgTopK::init_inference(FFModel const &ff,
@@ -287,7 +278,7 @@ OpMeta *ArgTopK::init_task(Task const *task,
   m->k = topk->k;
   std::strcpy(m->op_name, topk->name);
   m->layer_guid = topk->layer_guid;
-  m->speculative_decoding = topk->speculative_decoding;
+  m->renormalize = topk->renormalize;
   return m;
 }
 
@@ -311,64 +302,35 @@ FutureMap ArgTopK::inference(
   size_t machine_view_hash = view->hash();
   /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv
             << std::endl; */
-  if (speculative_decoding) {
-    IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
-                           parallel_is,
-                           TaskArgument(nullptr, 0),
-                           argmap,
-                           Predicate::TRUE_PRED,
-                           false /*must*/,
-                           0 /*mapper_id*/,
-                           machine_view_hash);
-    launcher.add_future(bc);
-    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      batch_inputs[0]->region));
-    launcher.add_field(0, FID_DATA);
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[0]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[0]->region));
-    launcher.add_field(1, FID_DATA);
-
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[1]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[1]->region));
-    launcher.add_field(2, FID_DATA);
-    return runtime->execute_index_space(ctx, launcher);
-
-  } else {
-    IndexLauncher launcher(ARG_TOPK_INF_TASK_ID,
-                           parallel_is,
-                           TaskArgument(nullptr, 0),
-                           argmap,
-                           Predicate::TRUE_PRED,
-                           false /*must*/,
-                           0 /*mapper_id*/,
-                           machine_view_hash);
-    launcher.add_future(bc);
-    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      batch_inputs[0]->region));
-    launcher.add_field(0, FID_DATA);
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[0]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[0]->region));
-    launcher.add_field(1, FID_DATA);
-    return runtime->execute_index_space(ctx, launcher);
-  }
+  IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+                         parallel_is,
+                         TaskArgument(nullptr, 0),
+                         argmap,
+                         Predicate::TRUE_PRED,
+                         false /*must*/,
+                         0 /*mapper_id*/,
+                         machine_view_hash);
+  launcher.add_future(bc);
+  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    READ_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_inputs[0]->region));
+  launcher.add_field(0, FID_DATA);
+  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_outputs[0]->region));
+  launcher.add_field(1, FID_DATA);
+
+  launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
+                                                    0 /*projection id*/,
+                                                    WRITE_ONLY,
+                                                    EXCLUSIVE,
+                                                    batch_outputs[1]->region));
+  launcher.add_field(2, FID_DATA);
+  return runtime->execute_index_space(ctx, launcher);
 }
 
 // just output the indices
@@ -454,7 +416,7 @@ void ArgTopK::serialize(Legion::Serializer &sez) const {
   sez.serialize(this->layer_guid.model_id);
   sez.serialize(this->k);
   sez.serialize(this->sorted);
-  sez.serialize(this->speculative_decoding);
+  sez.serialize(this->renormalize);
   sez.serialize(strlen(this->name));
   sez.serialize(this->name, strlen(this->name));
 }
@@ -471,10 +433,10 @@ Node ArgTopK::deserialize(FFModel &ff,
   LayerID layer_guid(id, transformer_layer_id, deserialized_model_id);
   int k;
   bool sorted;
-  bool speculative_decoding;
+  bool renormalize;
   dez.deserialize(k);
   dez.deserialize(sorted);
-  dez.deserialize(speculative_decoding);
+  dez.deserialize(renormalize);
   size_t name_len;
   char name[MAX_OPNAME] = {0};
   dez.deserialize(name_len);
@@ -483,7 +445,7 @@ Node ArgTopK::deserialize(FFModel &ff,
   params.layer_guid = layer_guid;
   params.k = k;
   params.sorted = sorted;
-  params.speculative_decoding = speculative_decoding;
+  params.renormalize = renormalize;
   strcpy(params.name, name);
   return ff.get_or_create_node<ArgTopK>(inputs[0], params);
 }
@@ -510,7 +472,7 @@ size_t hash<FlexFlow::ArgTopKParams>::operator()(
   hash_combine(key, params.layer_guid.id);
   hash_combine(key, params.k);
   hash_combine(key, params.sorted);
-  hash_combine(key, params.speculative_decoding);
+  hash_combine(key, params.renormalize);
   return key;
 }
 }; // namespace std
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 50d2bb8d3..50b57b9af 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -23,53 +23,55 @@ using Legion::coord_t;
 
 // Adopted from Raft's select_k
 // https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
-template<typename T, typename idxT>
-void raft_radix_11bits_kernel(const T* in,
-                       int batch_size,
-                       idxT len,
-                       idxT k,
-                       T* out,
-                       idxT* out_idx = nullptr,
-                       bool greater = true,
-                       cudaStream_t stream = 0) {
-    raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
-        in,
-        static_cast<idxT*>(nullptr),
-        batch_size,
-        len,
-        k,
-        out,
-        out_idx,
-        !greater,
-        true,  // fused_last_filter
-        stream);
+template <typename T, typename idxT>
+void raft_radix_11bits_kernel(T const *in,
+                              int batch_size,
+                              idxT len,
+                              idxT k,
+                              T *out,
+                              idxT *out_idx = nullptr,
+                              bool greater = true,
+                              cudaStream_t stream = 0) {
+  raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
+      in,
+      static_cast<idxT *>(nullptr),
+      batch_size,
+      len,
+      k,
+      out,
+      out_idx,
+      !greater,
+      true, // fused_last_filter
+      stream);
 }
 
 // Adopted from Raft's select_k
 // https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
-template<typename T, typename idxT>
-void raft_radix_11bits_extra_pass_kernel(const T* in,
-                                  int batch_size,
-                                  idxT len,
-                                  idxT k,
-                                  T* out,
-                                  idxT* out_idx = nullptr,
-                                  bool greater = true,
-                                  cudaStream_t stream = 0) {
-    raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
-        in,
-        static_cast<idxT*>(nullptr),
-        batch_size,
-        len,
-        k,
-        out,
-        out_idx,
-        !greater,
-        false,  // fused_last_filter
-        stream);
+template <typename T, typename idxT>
+void raft_radix_11bits_extra_pass_kernel(T const *in,
+                                         int batch_size,
+                                         idxT len,
+                                         idxT k,
+                                         T *out,
+                                         idxT *out_idx = nullptr,
+                                         bool greater = true,
+                                         cudaStream_t stream = 0) {
+  raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
+      in,
+      static_cast<idxT *>(nullptr),
+      batch_size,
+      len,
+      k,
+      out,
+      out_idx,
+      !greater,
+      false, // fused_last_filter
+      stream);
 }
 
-__global__ void half2float_kernel(const half* __restrict__ in, float* __restrict__ out, int size) {
+__global__ void half2float_kernel(half const *__restrict__ in,
+                                  float *__restrict__ out,
+                                  int size) {
   // int stride = blockDim.x * gridDim.x,
   //     tid = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -82,28 +84,30 @@ __global__ void half2float_kernel(const half* __restrict__ in, float* __restrict
 }
 
 template <typename DT>
-__global__ void insertion_sort_kernel(DT* topk_values, int* topk_indices, int batch_size, int k) {
-    int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (batch_index < batch_size) {
-        DT* values = topk_values + batch_index * k;
-        int* indices = topk_indices + batch_index * k;
-
-        for (int i = 1; i < k; i++) {
-            DT key_val = values[i];
-            int key_idx = indices[i];
-            int j = i - 1;
-            while (j >= 0 && values[j] < key_val) {
-                values[j + 1] = values[j];
-                indices[j + 1] = indices[j];
-                j = j - 1;
-            }
-            values[j + 1] = key_val;
-            indices[j + 1] = key_idx;
-        }
+__global__ void insertion_sort_kernel(DT *topk_values,
+                                      int *topk_indices,
+                                      int batch_size,
+                                      int k) {
+  int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (batch_index < batch_size) {
+    DT *values = topk_values + batch_index * k;
+    int *indices = topk_indices + batch_index * k;
+
+    for (int i = 1; i < k; i++) {
+      DT key_val = values[i];
+      int key_idx = indices[i];
+      int j = i - 1;
+      while (j >= 0 && values[j] < key_val) {
+        values[j + 1] = values[j];
+        indices[j + 1] = indices[j];
+        j = j - 1;
+      }
+      values[j + 1] = key_val;
+      indices[j + 1] = key_idx;
     }
+  }
 }
 
-
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
@@ -117,35 +121,15 @@ void ArgTopK::forward_kernel(
     bool sorted,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
-  if (m->speculative_decoding) {
-    assert(bc->num_active_requests() >= 0);
-    raft_radix_11bits_extra_pass_kernel<DT, int>(
-        input_ptr,
-        batch_size,
-        length,
-        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
-        output_ptr,
-        indices_ptr,
-        true,
-        stream);
-    if (sorted) {
-      assert(output_ptr != nullptr);
-      insertion_sort_kernel<<<GET_BLOCKS(batch_size),
-                             min((size_t)CUDA_NUM_THREADS, batch_size),
-                             0,
-                             stream>>>(output_ptr, indices_ptr, batch_size, k);
-    }
-  } else {
-    // raft_radix_11bits_extra_pass_kernel<DT, int>(
-    //     input_ptr,
-    //     batch_size,
-    //     length,
-    //     k,
-    //     static_cast<float*>(nullptr),
-    //     indices_ptr,
-    //     true,
-    //     stream);
-    assert(false && "Not in speculative decoding mode");
+  assert(bc->num_active_requests() >= 0);
+  raft_radix_11bits_extra_pass_kernel<DT, int>(
+      input_ptr, batch_size, length, k, output_ptr, indices_ptr, true, stream);
+  if (sorted) {
+    assert(output_ptr != nullptr);
+    insertion_sort_kernel<<<GET_BLOCKS(batch_size),
+                            min((size_t)CUDA_NUM_THREADS, batch_size),
+                            0,
+                            stream>>>(output_ptr, indices_ptr, batch_size, k);
   }
 }
 
@@ -202,34 +186,31 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
     // printf("ArgTopK: length = %d, batch_size = %d\n", length, batch_size);
     ArgTopK::forward_kernel(m,
                             input.get_half_ptr(),
-                            m->speculative_decoding ? (half *)m->half_precision_output
-                                                    : nullptr,
+                            (half *)m->half_precision_output,
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
                             k,
                             m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
+                            bc,
                             stream);
-    if (m->speculative_decoding) {
-      // transfer data from half to float (half_precision_output to output)
-      int size = length * batch_size;
-      half2float_kernel<<<GET_BLOCKS(size),
-                          min((int)CUDA_NUM_THREADS, size),
-                          0,
-                          stream>>>((const half *)m->half_precision_output, probs.get_float_ptr(), size);
-    }
+    // transfer data from half to float (half_precision_output to output)
+    int size = length * batch_size;
+    half2float_kernel<<<GET_BLOCKS(size),
+                        min((int)CUDA_NUM_THREADS, size),
+                        0,
+                        stream>>>(
+        (half const *)m->half_precision_output, probs.get_float_ptr(), size);
   } else if (input.data_type == DT_FLOAT) {
     ArgTopK::forward_kernel(m,
                             input.get_float_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
-                                                    : nullptr,
+                            probs.get_float_ptr(),
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
                             k,
                             m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
+                            bc,
                             stream);
   } else {
     assert(false && "Unsupported data type");
@@ -247,12 +228,14 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
 }
 
 ArgTopKMeta::ArgTopKMeta(FFHandler handler,
-                          Op const *op,
-                          MemoryAllocator &gpu_mem_allocator)
+                         Op const *op,
+                         MemoryAllocator &gpu_mem_allocator)
     : OpMeta(handler, op) {
   max_input_size = BatchConfig::MAX_NUM_TOKENS * 32000; // TODO: use vocab_size
-  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(half) * max_input_size);
-  half_precision_output = gpu_mem_allocator.allocate_instance_untyped(sizeof(half) * max_input_size);
+  gpu_mem_allocator.create_legion_instance(reserveInst,
+                                           sizeof(half) * max_input_size);
+  half_precision_output = gpu_mem_allocator.allocate_instance_untyped(
+      sizeof(half) * max_input_size);
 }
 
 ArgTopKMeta::~ArgTopKMeta() {

From 8aa38fbef38c600f50ea65aa2f555c516b8f6f1d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 2 Jun 2024 02:31:03 -0400
Subject: [PATCH 307/667] Support renormalize for arg_topk.

---
 include/flexflow/ops/arg_topk.h |  1 +
 src/ops/arg_topk.cu             | 32 ++++++++++++++++++++++++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 721ccd501..7ba5ed945 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -96,6 +96,7 @@ class ArgTopK : public Op {
                              int length,
                              int k,
                              bool sorted,
+                             bool renormalize,
                              BatchConfig const *bc,
                              ffStream_t stream);
   static void forward_kernel_wrapper(ArgTopKMeta const *m,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 50b57b9af..099273da4 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -108,6 +108,24 @@ __global__ void insertion_sort_kernel(DT *topk_values,
   }
 }
 
+template <typename DT>
+__global__ void renormalize_kernel(DT *topk_values,
+                                   int batch_size,
+                                   int k,
+                                   float epsilon = 1e-6) {
+  int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
+  assert(batch_index < batch_size);
+  DT *values = topk_values + batch_index * k;
+  DT sum = 0;
+  for (int i = 0; i < k; i++) {
+    sum += values[i];
+  }
+  sum += epsilon;
+  for (int i = 0; i < k; i++) {
+    values[i] /= sum;
+  }
+}
+
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
@@ -119,6 +137,7 @@ void ArgTopK::forward_kernel(
     int length,
     int k,
     bool sorted,
+    bool renormalize,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
   assert(bc->num_active_requests() >= 0);
@@ -131,6 +150,13 @@ void ArgTopK::forward_kernel(
                             0,
                             stream>>>(output_ptr, indices_ptr, batch_size, k);
   }
+  if (renormalize) {
+    assert(output_ptr != nullptr);
+    renormalize_kernel<<<GET_BLOCKS(batch_size),
+                         min((size_t)CUDA_NUM_THREADS, batch_size),
+                         0,
+                         stream>>>(output_ptr, batch_size, k);
+  }
 }
 
 /*static*/
@@ -190,8 +216,9 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
-                            k,
+                            m->k,
                             m->sorted,
+                            m->renormalize,
                             bc,
                             stream);
     // transfer data from half to float (half_precision_output to output)
@@ -208,8 +235,9 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
-                            k,
+                            m->k,
                             m->sorted,
+                            m->renormalize,
                             bc,
                             stream);
   } else {

From 378b59879a96e1cf9fa7fca00c08dae544cd4aae Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 2 Jun 2024 03:30:46 -0400
Subject: [PATCH 308/667] Modified the InferenceResult class. Added
 constructors for it.

---
 include/flexflow/batch_config.h |  14 ++--
 src/ops/arg_topk.cc             |   2 +
 src/ops/argmax.cc               |   2 +
 src/ops/gumbel_topk.cc          | 126 +++++++++++++++++---------------
 src/ops/sampling.cc             |  20 ++---
 src/runtime/batch_config.cc     |  10 +++
 6 files changed, 102 insertions(+), 72 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index e0f02ff20..d0c2cd222 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -156,11 +156,15 @@ class BatchConfig {
 };
 
 struct InferenceResult {
-  BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS *
-                                 BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-  float probs[BatchConfig::MAX_NUM_TOKENS *
-              BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
-  float topk_logits[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS];
+  int num_token_ids;
+  int num_gumbel_logits;
+  BatchConfig::TokenId
+      token_ids[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS];
+  float probs[BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS];
+  float gumbel_logits[BatchConfig::MAX_NUM_TOKENS *
+                      BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+  InferenceResult() : num_token_ids(0), num_gumbel_logits(0) {}
+  InferenceResult(InferenceResult const &other);
 };
 
 }; // namespace FlexFlow
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 1ebf1257c..fdf10a370 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -368,6 +368,7 @@ InferenceResult
   }
 
   InferenceResult ir;
+  ir.num_token_ids = batch_size * m->k;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   return ir;
@@ -399,6 +400,7 @@ InferenceResult ArgTopK::inference_speculative_task(
   ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
 
   InferenceResult ir;
+  ir.num_token_ids = batch_size * m->k;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
   download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index ad80b665e..fc6fec5b9 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -357,6 +357,7 @@ InferenceResult
       DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime);
   ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size);
   InferenceResult ir;
+  ir.num_token_ids = batch_size;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   download_tensor(m->probs, ir.probs, batch_size);
@@ -394,6 +395,7 @@ InferenceResult
   int batch_size = bc->num_active_tokens();
   ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size);
   InferenceResult ir;
+  ir.num_token_ids = batch_size;
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 7a59131ad..a57d26e10 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -46,13 +46,14 @@ using Legion::TaskLauncher;
 using PCG::Node;
 
 // For an input tensor, computes the top k entries in each row
-// (resp. vector along the last dimension) using Gumbel trick (https://arxiv.org/abs/1903.06059). 
-// Thus, values.shape = indices.shape = input.shape[:-1] + [k]
+// (resp. vector along the last dimension) using Gumbel trick
+// (https://arxiv.org/abs/1903.06059). Thus, values.shape = indices.shape =
+// input.shape[:-1] + [k]
 Tensor FFModel::gumbel_top_k(Tensor const input,
-                          int k,
-                          bool sorted,
-                          bool speculative_decoding,
-                          char const *name) {
+                             int k,
+                             bool sorted,
+                             bool speculative_decoding,
+                             char const *name) {
   Layer *li = new Layer(this,
                         OP_GUMBEL_TOPK,
                         input->data_type,
@@ -100,12 +101,12 @@ Op *GumbelTopK::create_operator_from_layer(
   bool speculative_decoding = (bool)value;
 
   return new GumbelTopK(model,
-                     layer->layer_guid,
-                     inputs[0],
-                     k,
-                     sorted,
-                     speculative_decoding,
-                     layer->name);
+                        layer->layer_guid,
+                        inputs[0],
+                        k,
+                        sorted,
+                        speculative_decoding,
+                        layer->name);
 }
 
 GumbelTopKParams GumbelTopK::get_params() const {
@@ -130,12 +131,12 @@ bool operator==(GumbelTopKParams const &lhs, GumbelTopKParams const &rhs) {
 }
 
 GumbelTopK::GumbelTopK(FFModel &model,
-                 LayerID const &_layer_guid,
-                 ParallelTensor const _input,
-                 int _k,
-                 bool _sorted,
-                 bool _speculative_decoding,
-                 char const *name)
+                       LayerID const &_layer_guid,
+                       ParallelTensor const _input,
+                       int _k,
+                       bool _sorted,
+                       bool _speculative_decoding,
+                       char const *name)
     : Op(model,
          OP_GUMBEL_TOPK,
          _input->data_type,
@@ -168,33 +169,34 @@ GumbelTopK::GumbelTopK(FFModel &model,
 }
 
 GumbelTopK::GumbelTopK(FFModel &model,
-                 LayerID const &layer_guid,
-                 GumbelTopK const &other,
-                 ParallelTensor const input)
+                       LayerID const &layer_guid,
+                       GumbelTopK const &other,
+                       ParallelTensor const input)
     : GumbelTopK(model,
-              layer_guid,
-              input,
-              other.k,
-              other.sorted,
-              other.speculative_decoding,
-              other.name) {}
+                 layer_guid,
+                 input,
+                 other.k,
+                 other.sorted,
+                 other.speculative_decoding,
+                 other.name) {}
 
 GumbelTopK::GumbelTopK(FFModel &model,
-                 GumbelTopKParams const &params,
-                 ParallelTensor const input,
-                 char const *name)
+                       GumbelTopKParams const &params,
+                       ParallelTensor const input,
+                       char const *name)
     : GumbelTopK(model,
-              params.layer_guid,
-              input,
-              params.k,
-              params.sorted,
-              params.speculative_decoding,
-              params.name) {}
-
-void GumbelTopK::init_inference(FFModel const &ff,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+                 params.layer_guid,
+                 input,
+                 params.k,
+                 params.sorted,
+                 params.speculative_decoding,
+                 params.name) {}
+
+void GumbelTopK::init_inference(
+    FFModel const &ff,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   assert(check_output_input_weight_same_parallel_is());
   parallel_is = batch_outputs[0]->parallel_is;
   ArgumentMap argmap;
@@ -267,9 +269,9 @@ void GumbelTopK::init(FFModel const &ff) {
 }
 
 OpMeta *GumbelTopK::init_task(Task const *task,
-                           std::vector<PhysicalRegion> const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
+                              std::vector<PhysicalRegion> const &regions,
+                              Context ctx,
+                              Runtime *runtime) {
   GumbelTopK *gumbel_topk = (GumbelTopK *)task->args;
   FFHandler handle = *((FFHandler *)task->local_args);
   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
@@ -277,7 +279,8 @@ OpMeta *GumbelTopK::init_task(Task const *task,
                        .best_affinity_to(task->target_proc)
                        .first();
   MemoryAllocator gpu_mem_allocator(gpu_mem);
-  GumbelTopKMeta *m = new GumbelTopKMeta(handle, gumbel_topk, gpu_mem_allocator);
+  GumbelTopKMeta *m =
+      new GumbelTopKMeta(handle, gumbel_topk, gpu_mem_allocator);
   m->profiling = gumbel_topk->profiling;
   m->inference_debugging = gumbel_topk->inference_debugging;
   m->sorted = gumbel_topk->sorted;
@@ -380,9 +383,9 @@ FutureMap GumbelTopK::inference(
 
 InferenceResult
     GumbelTopK::inference_task(Task const *task,
-                            std::vector<PhysicalRegion> const &regions,
-                            Context ctx,
-                            Runtime *runtime) {
+                               std::vector<PhysicalRegion> const &regions,
+                               Context ctx,
+                               Runtime *runtime) {
   assert(regions.size() == 2);
   assert(task->regions.size() == 2);
   // const GumbelTopK* topk = (const GumbelTopK*) task->args;
@@ -413,6 +416,8 @@ InferenceResult
   }
 
   InferenceResult ir;
+  ir.num_token_ids = batch_size * m->k;
+  ir.num_gumbel_logits = batch_size * m->k;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   return ir;
@@ -443,13 +448,18 @@ InferenceResult GumbelTopK::inference_speculative_task(
       DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime);
 
   int batch_size = bc.num_active_tokens();
-  GumbelTopK::forward_kernel_wrapper(m, input, log_probs, perturbed_log_probs, indices, batch_size, &bc);
+  GumbelTopK::forward_kernel_wrapper(
+      m, input, log_probs, perturbed_log_probs, indices, batch_size, &bc);
 
   InferenceResult ir;
+  ir.num_token_ids = batch_size * m->k;
+  ir.num_gumbel_logits = batch_size * m->k;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
-  download_tensor<float>(log_probs.get_float_ptr(), ir.probs, batch_size * m->k);
-  download_tensor<float>(perturbed_log_probs.get_float_ptr(), ir.topk_logits, batch_size * m->k);
+  download_tensor<float>(
+      log_probs.get_float_ptr(), ir.probs, batch_size * m->k);
+  download_tensor<float>(
+      perturbed_log_probs.get_float_ptr(), ir.gumbel_logits, batch_size * m->k);
   return ir;
 }
 
@@ -470,9 +480,9 @@ void GumbelTopK::serialize(Legion::Serializer &sez) const {
 }
 
 Node GumbelTopK::deserialize(FFModel &ff,
-                          Legion::Deserializer &dez,
-                          ParallelTensor inputs[],
-                          int num_inputs) {
+                             Legion::Deserializer &dez,
+                             ParallelTensor inputs[],
+                             int num_inputs) {
   assert(num_inputs == 1);
   size_t id, transformer_layer_id, deserialized_model_id;
   dez.deserialize(id);
@@ -499,15 +509,15 @@ Node GumbelTopK::deserialize(FFModel &ff,
 }
 
 Op *GumbelTopK::materialize(FFModel &ff,
-                         ParallelTensor inputs[],
-                         int num_inputs) const {
+                            ParallelTensor inputs[],
+                            int num_inputs) const {
   GumbelTopKParams params = get_params();
   return new GumbelTopK(ff, params, inputs[0], this->name);
 }
 
 bool GumbelTopK::measure_operator_cost(Simulator *sim,
-                                    MachineView const &mv,
-                                    CostMetrics &cost_metrics) const {
+                                       MachineView const &mv,
+                                       CostMetrics &cost_metrics) const {
   return false;
 }
 
diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc
index 3c67fc6ec..d273780c2 100644
--- a/src/ops/sampling.cc
+++ b/src/ops/sampling.cc
@@ -47,7 +47,7 @@ using PCG::Node;
 // For an input tensor, computes the top k entries in each row
 // (resp. vector along the last dimension). Thus,
 // values.shape = indices.shape = input.shape[:-1] + [k]
-Tensor FFModel::sampling(const Tensor input, float top_p, char const *name) {
+Tensor FFModel::sampling(Tensor const input, float top_p, char const *name) {
   Layer *li = new Layer(this,
                         OP_SAMPLING,
                         input->data_type,
@@ -103,7 +103,7 @@ bool operator==(SamplingParams const &lhs, SamplingParams const &rhs) {
 }
 
 Sampling::Sampling(FFModel &model,
-                   const ParallelTensor _input,
+                   ParallelTensor const _input,
                    float _top_p,
                    char const *name)
     : Op(model,
@@ -132,12 +132,12 @@ Sampling::Sampling(FFModel &model,
 
 Sampling::Sampling(FFModel &model,
                    Sampling const &other,
-                   const ParallelTensor input)
+                   ParallelTensor const input)
     : Sampling(model, input, other.top_p, other.name) {}
 
 Sampling::Sampling(FFModel &model,
                    SamplingParams const &params,
-                   const ParallelTensor input,
+                   ParallelTensor const input,
                    char const *name)
     : Sampling(model, input, params.top_p, params.name) {}
 
@@ -246,11 +246,12 @@ void Sampling::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap Sampling::inference(FFModel const &ff,
-                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                              std::vector<ParallelTensor> const &batch_inputs,
-                              std::vector<ParallelTensor> const &batch_outputs,
-                              MachineView const *mv) {
+FutureMap Sampling::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
@@ -316,6 +317,7 @@ InferenceResult
   }
 
   InferenceResult ir;
+  ir.num_token_ids = batch_size;
   download_tensor<BatchConfig::TokenId>(
       indices.get_int32_ptr(), ir.token_ids, batch_size);
   return ir;
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 15b14e547..97bddafb7 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -222,4 +222,14 @@ void BatchConfig::save_to_file(std::string const &filename) const {
   }
 }
 
+InferenceResult::InferenceResult(InferenceResult const &other) {
+  num_token_ids = other.num_token_ids;
+  num_gumbel_logits = other.num_gumbel_logits;
+  std::copy(other.token_ids, other.token_ids + num_token_ids, token_ids);
+  std::copy(other.probs, other.probs + num_token_ids, probs);
+  std::copy(other.gumbel_logits,
+            other.gumbel_logits + num_gumbel_logits,
+            gumbel_logits);
+}
+
 }; // namespace FlexFlow

From 38f7920c7bf00e3598645b20e53261d1e93a7fe6 Mon Sep 17 00:00:00 2001
From: Remi <54138269+Flechman@users.noreply.github.com>
Date: Sun, 2 Jun 2024 10:25:33 -0400
Subject: [PATCH 309/667] Change ssh to https

---
 .gitmodules | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 115712b36..5cf5dc3c6 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,11 +25,11 @@
 	fetchRecurseSubmodules = true
 [submodule "deps/spdlog"]
 	path = deps/spdlog
-	url = git@github.com:gabime/spdlog.git
+	url = https://github.com/gabime/spdlog.git
 [submodule "deps/rmm"]
 	path = deps/rmm
-	url = git@github.com:rapidsai/rmm.git
+	url = https://github.com/rapidsai/rmm.git
 [submodule "deps/raft"]
 	path = deps/raft
-	url = git@github.com:rapidsai/raft.git
+	url = https://github.com/rapidsai/raft.git
 	ignore = dirty

From 6881f4080ee634f41a5178b3bb9898b33e3fec97 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 02:07:26 -0700
Subject: [PATCH 310/667] feat: addadd submodule deps/flashinfer

---
 .gitmodules     | 3 +++
 deps/flashinfer | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 deps/flashinfer

diff --git a/.gitmodules b/.gitmodules
index 115712b36..63bcbc2ca 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -33,3 +33,6 @@
 	path = deps/raft
 	url = git@github.com:rapidsai/raft.git
 	ignore = dirty
+[submodule "deps/flashinfer"]
+	path = deps/flashinfer
+	url = https://github.com/flashinfer-ai/flashinfer.git
diff --git a/deps/flashinfer b/deps/flashinfer
new file mode 160000
index 000000000..7def34e31
--- /dev/null
+++ b/deps/flashinfer
@@ -0,0 +1 @@
+Subproject commit 7def34e316a731cd069f7fd30a9a2ffc70fad02a

From e3eefeb79187c3f694454b141d671a9f195e16b0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 04:45:33 -0700
Subject: [PATCH 311/667] feat: add flashinfer into build process

---
 CMakeLists.txt | 2 ++
 FlexFlow.mk    | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d90465b4e..5618f315a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,6 +210,8 @@ if(NOT BUILD_LEGION_ONLY)
 
   list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/spdlog/include)
 
+  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)
+
   if (FF_GPU_BACKEND STREQUAL "cuda")
     list(APPEND FF_CC_FLAGS
       -DFF_USE_CUDA)
diff --git a/FlexFlow.mk b/FlexFlow.mk
index cf92d8270..fadcf4de3 100644
--- a/FlexFlow.mk
+++ b/FlexFlow.mk
@@ -96,7 +96,8 @@ endif
 
 
 INC_FLAGS	+= -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src \
-				-I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include
+				-I${FF_HOME}/deps/raft/cpp/include -I${FF_HOME}/deps/rmm/include -I${FF_HOME}/deps/spdlog/include \
+				-I${FF_HOME}/deps/flashinfer/include
 CC_FLAGS	+= -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
 NVCC_FLAGS	+= -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768 \
 			    --expt-relaxed-constexpr --extended-lambda

From 919a4645e4ee326f1e773eb676b4b43509e49d89 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 08:26:09 -0700
Subject: [PATCH 312/667] chore: backup tree_inc_multihead_self_attention

---
 ...tree_inc_multihead_self_attention.h.backup |  155 +++
 ...ree_inc_multihead_self_attention.cu.backup | 1119 +++++++++++++++++
 2 files changed, 1274 insertions(+)
 create mode 100644 include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
 create mode 100644 src/ops/tree_inc_multihead_self_attention.cu.backup

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup b/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
new file mode 100644
index 000000000..45a7a6b56
--- /dev/null
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
@@ -0,0 +1,155 @@
+#ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
+#define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
+
+#include "flexflow/accessor.h"
+#include "flexflow/device.h"
+#include "flexflow/fftype.h"
+#include "flexflow/inference.h"
+#include "flexflow/layer.h"
+#include "flexflow/node.h"
+#include "flexflow/op_meta.h"
+#include "flexflow/operator.h"
+#include "flexflow/ops/inc_multihead_self_attention.h"
+#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
+#include "math.h"
+#include <cfloat>
+#include <complex>
+
+namespace FlexFlow {
+
+class TreeIncMultiHeadSelfAttentionMeta;
+
+class TreeIncMultiHeadSelfAttention : public Op {
+public:
+  using Params = TreeIncMultiHeadSelfAttentionParams;
+  using Input = ParallelTensor;
+
+  TreeIncMultiHeadSelfAttention(FFModel &model,
+                                LayerID const &layer_guid,
+                                ParallelTensor const _input,
+                                int _embed_dim,
+                                int _num_q_heads,
+                                int _num_kv_heads,
+                                int _kdim,
+                                int _vdim,
+                                float _dropout,
+                                bool _qkv_bias,
+                                bool _final_bias,
+                                bool _add_zero_attn,
+                                bool _apply_rotary_embedding,
+                                bool _scaling_query,
+                                float _scaling_factor,
+                                bool _qk_prod_scaling,
+                                bool _position_bias,
+                                bool allocate_weights,
+                                DataType _quantization_type,
+                                bool _offload,
+                                int _tensor_parallelism_degree,
+                                char const *name);
+  TreeIncMultiHeadSelfAttention(FFModel &model,
+                                ParallelTensor const _input,
+                                ParallelTensor const _weight,
+                                int _embed_dim,
+                                int _num_q_heads,
+                                int _num_kv_heads,
+                                int _kdim,
+                                int _vdim,
+                                float _dropout,
+                                bool _qkv_bias,
+                                bool _final_bias,
+                                bool _add_zero_attn,
+                                bool _apply_rotary_embedding,
+                                bool _scaling_query,
+                                float _scaling_factor,
+                                bool _qk_prod_scaling,
+                                bool _position_bias,
+                                bool allocate_weights,
+                                DataType _quantization_type,
+                                bool _offload,
+                                int _tensor_parallelism_degree,
+                                char const *name);
+  TreeIncMultiHeadSelfAttention(FFModel &model,
+                                TreeIncMultiHeadSelfAttention const &other,
+                                ParallelTensor const input,
+                                bool allocate_weights);
+  TreeIncMultiHeadSelfAttention(FFModel &model,
+                                Params const &params,
+                                Input const &inputs,
+                                bool allocate_weights = false,
+                                char const *name = nullptr);
+  static Op *
+      create_operator_from_layer(FFModel &model,
+                                 Layer const *layer,
+                                 std::vector<ParallelTensor> const &inputs);
+  void init(FFModel const &) override;
+  void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+                      MachineView const *mv = nullptr) override;
+  void forward(FFModel const &) override;
+  void backward(FFModel const &) override;
+  Legion::FutureMap inference(FFModel const &,
+                              BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+                              MachineView const *mv = nullptr) override;
+  void print_layer(FFModel const &model) override {
+    assert(0);
+  }
+  bool get_int_parameter(PMParameter, int *) const override;
+
+  static OpMeta *init_task(Legion::Task const *task,
+                           std::vector<Legion::PhysicalRegion> const &regions,
+                           Legion::Context ctx,
+                           Legion::Runtime *runtime);
+  static void inference_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+                             Legion::Context ctx,
+                             Legion::Runtime *runtime);
+  bool measure_operator_cost(Simulator *sim,
+                             MachineView const &mv,
+                             CostMetrics &cost_metrics) const override;
+
+  static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m,
+                                       BatchConfig const *bc,
+                                       int shard_id,
+                                       GenericTensorAccessorR const &input,
+                                       GenericTensorAccessorR const &weight,
+                                       GenericTensorAccessorW const &output,
+                                       GenericTensorAccessorR const &bias);
+
+  Params get_params() const;
+
+public:
+  int num_q_heads, num_kv_heads, tensor_parallelism_degree;
+  float dropout, scaling_factor;
+  bool qkv_bias;
+  bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
+      qk_prod_scaling, position_bias;
+  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
+  int qoSeqLength, kvSeqLength;
+  DataType quantization_type;
+  bool offload;
+};
+
+class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
+public:
+  TreeIncMultiHeadSelfAttentionMeta(FFHandler handler,
+                                    TreeIncMultiHeadSelfAttention const *attn,
+                                    GenericTensorAccessorR const &weight,
+                                    MemoryAllocator &gpu_mem_allocator,
+                                    int num_samples,
+                                    int _num_q_heads,
+                                    int _num_kv_heads);
+  ~TreeIncMultiHeadSelfAttentionMeta(void);
+
+public:
+  int num_active_tokens;
+  Realm::RegionInstance committed_token_reserve_inst;
+  BatchConfig::CommittedTokensInfo *committed_token_infos;
+  BatchConfig::BitMask *causalMask;
+};
+
+}; // namespace FlexFlow
+
+#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
diff --git a/src/ops/tree_inc_multihead_self_attention.cu.backup b/src/ops/tree_inc_multihead_self_attention.cu.backup
new file mode 100644
index 000000000..c022fabcf
--- /dev/null
+++ b/src/ops/tree_inc_multihead_self_attention.cu.backup
@@ -0,0 +1,1119 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "cuComplex.h"
+#endif
+#include "flexflow/ffconst_utils.h"
+#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
+#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
+#include "flexflow/ops/tree_inc_multihead_self_attention.h"
+#include "flexflow/utils/cuda_helper.h"
+
+namespace FlexFlow {
+
+// declare Legion names
+using Legion::coord_t;
+using Legion::Memory;
+
+#define WARP_SIZE 32
+
+using namespace Kernels::IncMultiHeadAttention;
+
+namespace Kernels {
+namespace TreeIncMultiHeadAttention {
+
+template <typename DT,
+          int THREADS_PER_BLOCK,
+          int Dh,
+          int Dh_MAX,
+          int THREADS_PER_KEY,
+          int THREADS_PER_VALUE>
+__global__ void compute_attention_kernel_fused_kernel(
+    DT const *query,
+    DT const *key_cache,
+    DT const *value_cache,
+    DT *output_ptr,
+    float const scale,
+    int const max_seq_length,
+    int const max_token_per_batch,
+    int per_head_size,
+    int hidden_size,
+    /* Reserved: BatchConfig Updated */
+    BatchConfig::PerRequestInfo *request_infos,
+    int num_heads,
+    int num_requests,
+    BatchConfig::BitMask *causalMask,
+    bool *request_available,
+    int qk_smem_sz,
+    bool prompt_phase) {
+
+  // q, k
+  using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
+  using K_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
+  using V_vec = typename VEC_V<DT>::Type;
+  using Out_sum = typename Vec_fp32_<V_vec>::Type;
+
+  constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
+
+  constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT);
+  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;
+  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE;
+  // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT);
+
+  // thread id
+  int const tidx = threadIdx.x;
+  // head id
+  int const head_idx = blockIdx.x;
+  // request idx
+  int const request_idx = blockIdx.y;
+
+  // request id in batch config
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+    }
+  }
+
+  // threads converge
+  //   __syncthreads();
+
+  int const first_step = 0;
+
+  int const tlength =
+      request_infos[requext_idx_in_batch].first_token_index_in_request +
+      request_infos[requext_idx_in_batch].num_tokens_in_batch;
+  int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
+
+  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
+                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
+  for (int i = tidx; i < qlength; i += THREADS_PER_BLOCK) {
+    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
+      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
+    }
+  }
+
+  int non_tree_cache_size =
+      causalMask[requext_idx_in_batch].non_tree_cache_size;
+
+  int const first_token_idx =
+      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
+
+  int q_start =
+      request_infos[requext_idx_in_batch].first_token_index_in_request;
+
+  // shared memory objects
+  extern __shared__ char smem_[];
+
+  float *qk_smem = reinterpret_cast<float *>(smem_);
+  float *out_smem = reinterpret_cast<float *>(smem_);
+
+  float qk_max = -FLT_MAX;
+
+  // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
+  __shared__ float red_smem[WARPS_PER_BLOCK * 2];
+
+  const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM +
+                    head_idx * per_head_size;
+  __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
+
+  // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE
+  int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE;
+  int ki_o = tidx % THREADS_PER_KEY;
+  // the first key's offset for this thread
+  // ko = 0, 0, 0, 0, 1, 1, 1, 1, ....
+  int ko = tidx / THREADS_PER_KEY;
+  // load q tensor
+  Q_vec q_vec[K_VECS_PER_THREAD];
+
+  constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY;
+  // The number of keys per warp.
+  constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
+
+  DT const *k_cache_batch =
+      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
+
+  int ti_end =
+      div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
+
+  for (int qi = 0; qi < qlength; qi += 1) {
+#pragma unroll
+    for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+      q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
+          q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki +
+          ii * THREADS_PER_KEY * K_VEC_SIZE);
+
+      // if (head_idx == 0 && request_idx == 1 && tidx == 0) {
+      //     printf("laod q %d,  %d %.10f\n",
+      //     request_idx,
+      //            qi,q_vecs[ki_o][ii].x);
+      //   }
+    }
+
+    __syncthreads();
+    for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
+      K_vec k[K_VECS_PER_THREAD];
+      int const ti_circ = ti % max_seq_length;
+#pragma unroll
+      for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+        int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
+        if (ti < tlength) {
+          k[ii] = *reinterpret_cast<K_vec const *>(
+              k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size +
+              jj);
+        }
+      }
+      float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
+
+      if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
+        bool const mask =
+            prompt_phase
+                ? (qi + q_start < ti)
+                : (ti >= non_tree_cache_size &&
+                   (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
+
+        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+
+        // if (head_idx == 0 && !mask) {
+        //   printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f,
+        //   %.10f, %d\n",
+        //          request_idx,
+        //          qi,
+        //          ti,
+        //          qk,
+        //          q_vecs[ki_o][0].x,
+        //          k[0].x,
+        //          bitmask->non_tree_cache_size);
+        // }
+        qk_smem[ti - first_step] = mask ? 0.0f : qk;
+      }
+    }
+
+    __syncthreads();
+
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) {
+      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
+    }
+
+    // Decompose the thread index into warp and lane.
+    int const warp = tidx / WARP_SIZE;
+    int const lane = tidx % WARP_SIZE;
+
+    // The warp leader writes the max to shared memory.
+    if (lane == 0) {
+      red_smem[warp] = qk_max;
+    }
+
+    // Make sure the products are in shared memory.
+    __syncthreads();
+
+    // The warps finalize the reduction.
+    qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX;
+#pragma unroll
+    for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) {
+      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
+    }
+
+    // Broadcast to all the threads in the warp.
+    qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
+
+    // if (head_idx == 0 && qi == 9 && tidx == 0) {
+    //   printf("tree attn first token qk_max %f\n", qk_max);
+    // }
+
+    float exp_sum = 0.f;
+    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
+      bool const mask =
+          prompt_phase ? (q_start + qi < ti)
+                       : (ti >= non_tree_cache_size &&
+                          (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
+      float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
+      exp_sum += logit;
+      qk_smem[ti - first_step] = mask ? 0.0f : logit;
+    }
+
+    // Compute the sum.
+    exp_sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], exp_sum);
+
+    // softmax
+    float inv_sum = __fdividef(1.f, exp_sum + 1.e-6);
+    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
+      qk_smem[ti - first_step] *= inv_sum;
+    }
+
+    __syncthreads();
+
+    // value projection
+    constexpr int V_VEC_SIZE = 16 / sizeof(DT);
+    // A vector of V elements for the current timestep.
+    // using V_vec_k = typename V_vec_k_<DT, V_VEC_SIZE>::Type;
+    // using V_vec_acum = typename V_vec_acum_fp32_<V_vec_k>::Type;
+
+    // The value computed by this thread.
+    int vo = tidx / THREADS_PER_VALUE;
+    // The hidden dimensions computed by this particular thread.
+    int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE;
+    constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
+
+    Out_sum out;
+    zero(out);
+
+    // The base pointer for the value in the cache buffer.
+    DT const *v_cache_batch =
+        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
+
+    if (Dh == Dh_MAX || vi < Dh) {
+      for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
+        // Load the values from the cache.
+        int const ti_circ = ti % max_seq_length;
+        V_vec v = *reinterpret_cast<V_vec const *>(
+            v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
+        float logit = qk_smem[ti - first_step];
+        out = FlexFlow::fma(logit, cast_to_float(v), out);
+      }
+    }
+
+    //   // Make sure we can start writing to shared memory.
+    __syncthreads();
+
+    // Run the final reduction amongst the different groups computing different
+    // partial outputs.
+    if (Dh == Dh_MAX || vi < Dh) {
+#pragma unroll
+      for (int active_groups = V_PER_ITER; active_groups >= 2;
+           active_groups /= 2) {
+
+        // The midpoint in the number of active groups.
+        int midpoint = active_groups / 2;
+
+        // The upper part of active threads store to shared memory.
+        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
+          *reinterpret_cast<Out_sum *>(out_smem + (vo - midpoint) * Dh + vi) =
+              out;
+        }
+        __syncthreads();
+
+        // The bottom warps update their values.
+        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
+          out = add(*reinterpret_cast<Out_sum const *>(out_smem + vo * Dh + vi),
+                    out);
+        }
+        __syncthreads();
+      }
+    }
+
+    // Output the final values.
+    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
+      convert_from_float(*reinterpret_cast<V_vec *>(
+                             output_ptr + (first_token_idx + qi) * hidden_size +
+                             head_idx * per_head_size + vi),
+                         out);
+      // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) {
+      //   printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n",
+      //          out.x,
+      //          out.y,
+      //          out.z,
+      //          out.w,
+      //          vi,
+      //          (first_token_idx + qi) * hidden_size + head_idx *
+      //          per_head_size +
+      //              vi);
+      // }
+    }
+  }
+}
+
+template <typename DT>
+__global__ void commit_tokens_kernel(
+    DT const *devQKVProjArray,
+    DT *kCache_ptr,
+    DT *vCache_ptr,
+    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    int qProjSize,
+    int kProjSize,
+    int vProjSize,
+    int num_tokens_to_commit,
+    int num_active_tokens_in_last_batch,
+    int max_seq_len,
+    int hidden_size) {
+
+  CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) {
+
+    int token_pos = i / (hidden_size);
+    int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index;
+    int offset = i % hidden_size;
+    assert(token_idx_in_last_batch < num_active_tokens_in_last_batch);
+
+    size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size +
+                     hidden_size + offset;
+
+    DT kVal = devQKVProjArray[val_idx];
+    DT vVal = devQKVProjArray[val_idx + hidden_size];
+
+    int const req_id = committedTokenInfos[token_pos].request_index;
+    int const tok_id = committedTokenInfos[token_pos].token_depth;
+
+    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
+               offset] = kVal;
+    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
+               offset] = vVal;
+  }
+}
+
+template <typename DT>
+void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
+                   BatchConfig const *bc,
+                   cudaStream_t stream) {
+  int num_tokens_to_commit = bc->num_tokens_to_commit;
+  if (num_tokens_to_commit > 0) {
+    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
+    commit_tokens_kernel<<<GET_BLOCKS(parallelism),
+                           min(CUDA_NUM_THREADS, parallelism),
+                           0,
+                           stream>>>(
+        static_cast<DT *>(m->devQKVProjArray),
+        static_cast<DT *>(m->keyCache),
+        static_cast<DT *>(m->valueCache),
+        m->committed_token_infos,
+        m->qProjSize,
+        m->kProjSize,
+        m->vProjSize,
+        num_tokens_to_commit,
+        m->num_active_tokens, // number of active tokens in previous batch
+        BatchConfig::max_sequence_length() +
+            BatchConfig::max_spec_tree_token_num(),
+        m->hidden_size);
+  }
+}
+
+template <typename DT>
+__global__ void
+    update_tree_branch_kv_cache(DT const *devQKVProjArray,
+                                DT *kCache_ptr,
+                                DT *vCache_ptr,
+                                BatchConfig::PerTokenInfo const *tokenInfos,
+                                int qProjSize,
+                                int kProjSize,
+                                int vProjSize,
+                                int num_tokens_in_branch,
+                                int processed_tokens_in_batch,
+                                int total_tokens_in_batch,
+                                int max_seq_len,
+                                int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) {
+
+    int token_idx = i / (hidden_size);
+    int offset = i % hidden_size;
+
+    token_idx += processed_tokens_in_batch; // get index in the whole batch
+    size_t val_idx =
+        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
+
+    DT kVal = devQKVProjArray[val_idx];
+    DT vVal = devQKVProjArray[val_idx + hidden_size];
+
+    int const req_id = tokenInfos[token_idx].request_index;
+    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
+    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
+               offset] = kVal;
+    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
+               offset] = vVal;
+  }
+}
+
+template <typename DT>
+__global__ void update_tree_branch_kv_cache_fused(
+    DT const *devQKVProjArray,
+    DT *kCache_ptr,
+    DT *vCache_ptr,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    BatchConfig::PerRequestInfo *request_infos,
+    int qProjSize,
+    int kProjSize,
+    int vProjSize,
+    int num_new_tokens,
+    int max_seq_len,
+    int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) {
+
+    int token_idx = i / hidden_size;
+    int offset = i % hidden_size;
+    size_t val_idx =
+        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
+
+    DT kVal = devQKVProjArray[val_idx];
+    DT vVal = devQKVProjArray[val_idx + hidden_size];
+
+    int const req_idx = tokenInfos[token_idx].request_index;
+    int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+    // int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
+
+    kCache_ptr[req_idx * (hidden_size * max_seq_len) +
+               token_abs_idx * hidden_size + offset] = kVal;
+    vCache_ptr[req_idx * (hidden_size * max_seq_len) +
+               token_abs_idx * hidden_size + offset] = vVal;
+  }
+}
+
+template <typename DT>
+__global__ void tree_fill_entries_above_diagonal(DT *matrix,
+                                                 size_t new_tokens,
+                                                 size_t total_tokens_in_request,
+                                                 size_t num_q_heads,
+                                                 DT value) {
+  CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) {
+    // size_t head_idx = i / (new_tokens * total_tokens_in_request);
+    size_t src_idx = (i / new_tokens) % total_tokens_in_request;
+    size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens;
+    // Casual Mask
+    if (src_idx > dst_idx) {
+      matrix[i] = value;
+    }
+  }
+}
+
+template <typename DT>
+void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
+                              BatchConfig const *bc,
+                              int shard_id,
+                              DT *output_ptr,
+                              DT const *bias_ptr,
+                              DT const *weight_ptr,
+                              cudaStream_t stream) {
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
+  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+  assert(data_type_size(m->output_type[0]) == sizeof(DT));
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
+#endif
+  // int num_requests = bc->num_active_requests();
+  int processed_tokens_in_batch = 0;
+  // int qkv_block_size =
+  //     (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens();
+  int q_block_size = m->qProjSize;
+  int kt_block_size = m->kProjSize;
+  int kt_req_block_size = kt_block_size * m->num_q_heads *
+                          (BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
+  int vt_block_size = m->vProjSize;
+  int vt_req_block_size = vt_block_size * m->num_q_heads *
+                          (BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
+  assert(m->qProjSize == m->kProjSize);
+
+  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
+    if (!bc->request_available[i]) {
+      continue;
+    }
+    assert(processed_tokens_in_batch ==
+           bc->requestsInfo[i].first_token_offset_in_batch);
+    int last_token_idx_of_the_request =
+        processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1;
+    while (processed_tokens_in_batch <= last_token_idx_of_the_request) {
+      int num_new_tokens = 1;
+      int j = processed_tokens_in_batch;
+      while ((j + 1 <= last_token_idx_of_the_request) &&
+             (bc->tokensInfo[j].abs_index_in_request + 1 ==
+              bc->tokensInfo[j + 1].abs_index_in_request)) {
+        j++;
+        num_new_tokens++;
+      }
+
+      int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1;
+      assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
+      {
+        // update K-V cache
+        int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens;
+        update_tree_branch_kv_cache<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+            static_cast<DT *>(m->devQKVProjArray),
+            static_cast<DT *>(m->keyCache),
+            static_cast<DT *>(m->valueCache),
+            m->token_infos,
+            m->qProjSize,
+            m->kProjSize,
+            m->vProjSize,
+            num_new_tokens,            // num_tokens_in_branch
+            processed_tokens_in_batch, // num_processed_tokens_in_batch
+            m->num_active_tokens,      // total_tokens_in_batch
+            BatchConfig::max_sequence_length() +
+                BatchConfig::max_spec_tree_token_num(),
+            m->hidden_size);
+      }
+
+      // bc->token_last_available_idx[i] + 1;
+      // Compute (QK^T/sqrt(d_k))
+      int m_ = num_new_tokens;
+      int n = total_tokens_in_request;
+      int k = m->qProjSize;
+      int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads,
+          ldc = m_;
+      int strideA = q_block_size;
+      int strideB = kt_block_size;
+      int strideC = num_new_tokens * total_tokens_in_request;
+
+      // a flag of using this scaling alpha
+      DT alpha = 1.0f, beta = 0.0f;
+      if (*m->qk_prod_scaling) {
+        alpha = static_cast<DT>(1.0f / sqrt(m->kProjSize));
+      }
+      // To get A, skip over Q entries from previous requests (same head)
+      DT const *A = static_cast<DT *>(m->devQKVProjArray) +
+                    processed_tokens_in_batch * m->qProjSize * m->num_q_heads *
+                        QKV_WEIGHT_NUM;
+      // To get B, skip over K entries from previous requests (all heads +
+      // padding)
+      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
+      // To get C, skip over QK^T products from previous requests
+      DT *C = static_cast<DT *>(m->qk_prods);
+
+      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
+                                           CUBLAS_OP_T,
+                                           CUBLAS_OP_N,
+                                           m_,
+                                           n,
+                                           k,
+                                           &alpha,
+                                           A,
+                                           cublas_data_type,
+                                           lda,
+                                           strideA,
+                                           B,
+                                           cublas_data_type,
+                                           ldb,
+                                           strideB,
+                                           &beta,
+                                           C,
+                                           cublas_data_type,
+                                           ldc,
+                                           strideC,
+                                           m->num_q_heads,
+                                           compute_type,
+                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+      // add alibi position bias to qk production
+      // add alibi position bias to qk production
+      if (*m->position_bias) {
+        size_t parallelism =
+            m->num_q_heads * total_tokens_in_request * num_new_tokens;
+        apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
+                                    min((size_t)CUDA_NUM_THREADS, parallelism),
+                                    0,
+                                    stream>>>(C,
+                                              num_new_tokens,
+                                              total_tokens_in_request,
+                                              m->num_q_heads,
+                                              m->global_num_q_heads,
+                                              shard_id);
+      }
+
+      // Fill all elements above diagonal in qk prods with -inf to force
+      // causal attention.
+      assert(num_new_tokens <= total_tokens_in_request);
+      if (num_new_tokens > 1) {
+        size_t parallelism =
+            m->num_q_heads * num_new_tokens * total_tokens_in_request;
+        tree_fill_entries_above_diagonal<<<GET_BLOCKS(parallelism),
+                                           min((size_t)CUDA_NUM_THREADS,
+                                               parallelism),
+                                           0,
+                                           stream>>>(
+            C,
+            num_new_tokens,
+            total_tokens_in_request,
+            m->num_q_heads,
+            static_cast<DT>(-INFINITY));
+      }
+      // Compute Softmax(QK^T/sqrt(d_k))
+      // Before modifying the parameters below, make sure to read the following
+      // description of the CUDNN_TENSOR_NCHW tensor layout, from
+      // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
+      // This tensor format specifies that the data is laid out in the following
+      // order: batch size, feature maps, rows, columns. The strides are
+      // implicitly defined in such a way that the data are contiguous in memory
+      // with no padding between images, feature maps, rows, and columns; the
+      // columns are the inner dimension and the images are the outermost
+      // dimension.
+      int n_param = m->num_q_heads;
+      int c_param = total_tokens_in_request;
+      int h_param = 1;
+      int w_param = num_new_tokens;
+      checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
+                                            CUDNN_TENSOR_NCHW,
+                                            cudnn_data_type,
+                                            n_param,
+                                            c_param,
+                                            h_param,
+                                            w_param));
+      float softmax_alpha = 1.0f, softmax_beta = 0.0f;
+      DT *C_softmax = static_cast<DT *>(m->qk_prods_softmax);
+      // The softmax operation below is executed according to the
+      // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
+      // softmax operation is computed per spatial location (H,W) per image (N)
+      // across dimension C.
+      checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                     CUDNN_SOFTMAX_ACCURATE,
+                                     CUDNN_SOFTMAX_MODE_CHANNEL,
+                                     &softmax_alpha,
+                                     m->qk_tensor,
+                                     C,
+                                     &softmax_beta,
+                                     m->qk_tensor,
+                                     C_softmax));
+      // Matmul softmax(QK^T/sqrt(d_k)) by V
+      alpha = 1.0f, beta = 0.0f;
+      m_ = m->vProjSize;
+      n = num_new_tokens;
+      k = total_tokens_in_request;
+      lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
+      strideA = vt_block_size;
+      strideB = num_new_tokens * total_tokens_in_request;
+      strideC = m->vProjSize;
+      // To get A, skip over V^T entries from previous requests (all heads +
+      // padding)
+      A = static_cast<DT *>(m->valueCache) + i * vt_req_block_size;
+      // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
+      // requests (all heads)
+      B = C_softmax;
+      // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
+      // requests
+      C = static_cast<DT *>(m->attn_heads) +
+          processed_tokens_in_batch * m->num_q_heads * m->vProjSize;
+      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_T,
+                                           m_,
+                                           n,
+                                           k,
+                                           &alpha,
+                                           A,
+                                           cublas_data_type,
+                                           lda,
+                                           strideA,
+                                           B,
+                                           cublas_data_type,
+                                           ldb,
+                                           strideB,
+                                           &beta,
+                                           C,
+                                           cublas_data_type,
+                                           ldc,
+                                           strideC,
+                                           m->num_q_heads,
+                                           compute_type,
+                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+      processed_tokens_in_batch += num_new_tokens;
+    }
+    // Before moving to the next request
+    // check that we have finished all tokens of the request
+    assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch);
+  }
+  // Project to output, save result directly on output tensor
+  DT alpha = 1.0f, beta = 0.0f;
+  int m_ = m->oProjSize;
+  int k = m->vProjSize * m->num_q_heads;
+  int n = processed_tokens_in_batch;
+  int lda = k, ldb = k, ldc = m_;
+  DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
+                                         m->kProjSize * m->num_q_heads +
+                                         m->vProjSize * m->num_q_heads);
+  DT const *B = static_cast<DT *>(m->attn_heads);
+  DT *C = static_cast<DT *>(output_ptr);
+
+  checkCUDA(cublasGemmEx(m->handle.blas,
+                         CUBLAS_OP_T,
+                         CUBLAS_OP_N,
+                         m_,
+                         n,
+                         k,
+                         &alpha,
+                         A,
+                         cublas_data_type,
+                         lda,
+                         B,
+                         cublas_data_type,
+                         ldb,
+                         &beta,
+                         C,
+                         cublas_data_type,
+                         ldc,
+                         compute_type,
+                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  if (*m->final_bias && shard_id == 0) {
+    int parallelism = m->oProjSize * processed_tokens_in_batch;
+    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
+                          m->kProjSize * m->global_num_q_heads +
+                          m->vProjSize * m->global_num_q_heads;
+    apply_proj_bias_w<<<GET_BLOCKS(parallelism),
+                        min(CUDA_NUM_THREADS, parallelism),
+                        0,
+                        stream>>>(output_ptr,
+                                  bias_ptr,
+                                  processed_tokens_in_batch,
+                                  qkv_weight_size,
+                                  m->oProjSize);
+  }
+
+  assert(processed_tokens_in_batch == bc->num_active_tokens());
+}
+
+#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
+                                                  Dh,                          \
+                                                  Dh_MAX,                      \
+                                                  THDS_PER_KEY,                \
+                                                  THDS_PER_VALUE,              \
+                                                  THDS_PER_BLOCK,              \
+                                                  stream,                      \
+                                                  prompt_phase)                \
+  smem_size_in_bytes_tree<DT>(m->qProjSize,                                    \
+                              BatchConfig::max_sequence_length() +             \
+                                  BatchConfig::max_spec_tree_token_num(),      \
+                              THDS_PER_VALUE,                                  \
+                              THDS_PER_BLOCK,                                  \
+                              bc,                                              \
+                              smem_sz);                                        \
+  compute_attention_kernel_fused_kernel<DT,                                    \
+                                        THDS_PER_BLOCK,                        \
+                                        Dh,                                    \
+                                        Dh_MAX,                                \
+                                        THDS_PER_KEY,                          \
+                                        THDS_PER_VALUE>                        \
+      <<<grid,                                                                 \
+         THDS_PER_BLOCK,                                                       \
+         smem_sz[1],                                                           \
+         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
+                   static_cast<DT *>(m->keyCache),                             \
+                   static_cast<DT *>(m->valueCache),                           \
+                   output_ptr,                                                 \
+                   scale,                                                      \
+                   BatchConfig::max_sequence_length() +                        \
+                       BatchConfig::max_spec_tree_token_num(),                 \
+                   BatchConfig::max_tokens_per_batch(),                        \
+                   m->qProjSize,                                               \
+                   m->hidden_size,                                             \
+                   m->request_infos,                                           \
+                   m->num_q_heads,                                             \
+                   bc->num_active_requests(),                                  \
+                   m->causalMask,                                              \
+                   m->request_available,                                       \
+                   smem_sz[0],                                                 \
+                   prompt_phase)
+
+template <typename DT>
+void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
+                                    BatchConfig const *bc,
+                                    DT *output_ptr,
+                                    cudaStream_t stream) {
+
+  // update the kv cache
+  //  update K-V cache
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  update_tree_branch_kv_cache_fused<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<DT *>(m->keyCache),
+      static_cast<DT *>(m->valueCache),
+      m->token_infos,
+      m->request_infos,
+      m->qProjSize,
+      m->kProjSize,
+      m->vProjSize,
+      num_new_tokens,
+      BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num(),
+      m->hidden_size);
+
+  // cudaEvent_t t_start, t_end;
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
+
+  dim3 grid(m->num_q_heads, bc->num_active_requests());
+  int const per_head_size = m->qProjSize;
+  float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+  // 0->qk production size, 1->total shared size
+  // per_head_size: 128, thd_per_v:32, prompt_phase: 0
+  int smem_sz[2];
+  if (per_head_size == 64) {
+    constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
+    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
+        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, bc->prompt_phase);
+  } else if (per_head_size == 128) {
+    constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
+    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
+        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, bc->prompt_phase);
+  } else {
+    assert(false && "a unsupported head size");
+  }
+
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // float elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+
+}
+
+template <typename DT>
+void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
+                      BatchConfig const *bc,
+                      int shard_id,
+                      DT const *input_ptr,
+                      DT const *weight_ptr,
+                      DT *output_ptr,
+                      DT const *bias_ptr,
+                      cudaStream_t stream) {
+  // additional processing for weight uploading
+  if (m->handle.offload_reserve_space != nullptr) {
+    // Note that we update weight_ptr and bias_ptr when uploading weight and
+    // bias
+    cudaMemcpyAsync(m->weight_ptr,
+                    weight_ptr,
+                    m->weightSize,
+                    cudaMemcpyHostToDevice,
+                    stream);
+    weight_ptr = static_cast<DT *>(m->weight_ptr);
+    if (m->biasSize > 0) {
+      cudaMemcpyAsync(
+          m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
+      bias_ptr = static_cast<DT *>(m->bias_ptr);
+    }
+  }
+
+  // copy committed tokens info to GPU for the commit_tokens kernel
+  // Note that m->num_active_tokens stores the number of active
+  // tokens in the previous batch, which is needed for committing
+  // keys/values to the key-value cache
+  // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit <<
+  // "\n";
+
+  commit_tokens<DT>(m, bc, stream);
+
+  // After commit we update m->num_active_tokens to be the number of active
+  // tokens for the current batch
+  m->num_active_tokens = bc->num_active_tokens();
+
+  // here because we need postion info in infernece 1
+  if (m->offload && m->biasSize > 0) {
+    cudaMemcpyAsync(
+        m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
+    bias_ptr = static_cast<DT *>(m->bias_ptr);
+  }
+  // phase 1: Implement kernel to compute KQV for input tokens
+  compute_qkv_kernel(m,
+                     bc,
+                     shard_id,
+                     input_ptr,
+                     weight_ptr,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     bias_ptr,
+                     stream);
+
+  // phase 2: No need to update key/val cache
+  // IncMultiHeadSelfAttention::update_kv_cache_kernel(
+  //    m, bc, stream);
+  // use the new kernel
+  compute_attention_kernel_fused<DT>(
+      m, bc, static_cast<DT *>(m->attn_heads), stream);
+
+  // Debug output:
+  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, size * sizeof(float),
+  //       cudaMemcpyDeviceToHost);
+  //   printf("Output: ");
+  //   for (int i = 0; i < 1; ++i) {
+  //     float temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%.6f ", temp);
+  //   }
+  //   printf("\n");
+
+  //   delete[] temp_output;
+
+  int processed_tokens_in_batch = bc->num_active_tokens();
+
+  compute_o_prod_bias(m,
+                      bc,
+                      shard_id,
+                      output_ptr,
+                      weight_ptr,
+                      bias_ptr,
+                      processed_tokens_in_batch,
+                      stream);
+}
+
+} // namespace TreeIncMultiHeadAttention
+} // namespace Kernels
+
+/*static*/
+void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
+    TreeIncMultiHeadSelfAttentionMeta *m,
+    BatchConfig const *bc,
+    int shard_id,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorR const &weight,
+    GenericTensorAccessorW const &output,
+    GenericTensorAccessorR const &bias) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  bool use_bias = *m->qkv_bias || *m->final_bias;
+
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+
+  // assert(input.data_type == weight.data_type);
+  assert(input.data_type == output.data_type);
+  if (use_bias) {
+    assert(input.data_type == bias.data_type);
+  }
+
+  if (input.data_type == DT_HALF) {
+    if (m->offload) {
+      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
+    }
+
+    half const *bias_ptr =
+        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
+    Kernels::TreeIncMultiHeadAttention::inference_kernel(
+        m,
+        bc,
+        shard_id,
+        input.get_half_ptr(),
+        m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
+        output.get_half_ptr(),
+        bias_ptr,
+        stream);
+  } else if (input.data_type == DT_FLOAT) {
+    if (m->offload) {
+      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+    }
+    float const *bias_ptr =
+        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
+    Kernels::TreeIncMultiHeadAttention::inference_kernel(
+        m,
+        bc,
+        shard_id,
+        input.get_float_ptr(),
+        m->offload ? static_cast<float *>(m->weight_ptr)
+                   : weight.get_float_ptr(),
+        output.get_float_ptr(),
+        bias_ptr,
+        stream);
+  } else {
+    assert(false && "Unspported data type");
+  }
+
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+  }
+}
+
+TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
+    FFHandler handler,
+    TreeIncMultiHeadSelfAttention const *attn,
+    GenericTensorAccessorR const &weight,
+    MemoryAllocator &gpu_mem_allocator,
+    int num_samples,
+    int _num_q_heads,
+    int _num_kv_heads)
+    : IncMultiHeadSelfAttentionMeta(handler,
+                                    TREE_VERIFY_MODE,
+                                    attn,
+                                    attn->qSize,
+                                    attn->kSize,
+                                    attn->vSize,
+                                    attn->qProjSize,
+                                    attn->kProjSize,
+                                    attn->vProjSize,
+                                    attn->oProjSize,
+                                    attn->apply_rotary_embedding,
+                                    attn->qkv_bias,
+                                    attn->scaling_query,
+                                    attn->qk_prod_scaling,
+                                    attn->position_bias,
+                                    attn->final_bias,
+                                    attn->scaling_factor,
+                                    weight,
+                                    gpu_mem_allocator,
+                                    num_samples,
+                                    attn->num_q_heads,
+                                    attn->num_kv_heads,
+                                    _num_q_heads,
+                                    _num_kv_heads,
+                                    attn->quantization_type,
+                                    attn->offload),
+      num_active_tokens(0) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  checkCUDNN(cudnnSetStream(handler.dnn, stream));
+
+  // allocate memory for the seqArray and reserve space
+  {
+    causalMask = reinterpret_cast<BatchConfig::BitMask *>(
+        reinterpret_cast<char *>(handler.batch_config_metadata) +
+        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
+        sizeof(BatchConfig::request_available));
+    committed_token_infos =
+        reinterpret_cast<BatchConfig::CommittedTokensInfo *>(
+            reinterpret_cast<char *>(handler.batch_config_metadata) +
+            sizeof(BatchConfig::tokensInfo) +
+            sizeof(BatchConfig::requestsInfo) +
+            sizeof(BatchConfig::request_available) +
+            sizeof(BatchConfig::causalMask));
+  }
+
+  cudaStreamSynchronize(stream);
+}
+
+TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
+  if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) {
+    committed_token_reserve_inst.destroy();
+  }
+}
+
+}; // namespace FlexFlow

From a54b42fbb3fbd8cdc5b33c752ff3827f901aafc7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 10:55:49 -0700
Subject: [PATCH 313/667] chore: remove unused

---
 src/ops/tree_inc_multihead_self_attention.cu | 348 -------------------
 1 file changed, 348 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c022fabcf..784cfd6e8 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -401,41 +401,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-template <typename DT>
-__global__ void
-    update_tree_branch_kv_cache(DT const *devQKVProjArray,
-                                DT *kCache_ptr,
-                                DT *vCache_ptr,
-                                BatchConfig::PerTokenInfo const *tokenInfos,
-                                int qProjSize,
-                                int kProjSize,
-                                int vProjSize,
-                                int num_tokens_in_branch,
-                                int processed_tokens_in_batch,
-                                int total_tokens_in_batch,
-                                int max_seq_len,
-                                int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) {
-
-    int token_idx = i / (hidden_size);
-    int offset = i % hidden_size;
-
-    token_idx += processed_tokens_in_batch; // get index in the whole batch
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = vVal;
-  }
-}
-
 template <typename DT>
 __global__ void update_tree_branch_kv_cache_fused(
     DT const *devQKVProjArray,
@@ -470,319 +435,6 @@ __global__ void update_tree_branch_kv_cache_fused(
   }
 }
 
-template <typename DT>
-__global__ void tree_fill_entries_above_diagonal(DT *matrix,
-                                                 size_t new_tokens,
-                                                 size_t total_tokens_in_request,
-                                                 size_t num_q_heads,
-                                                 DT value) {
-  CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) {
-    // size_t head_idx = i / (new_tokens * total_tokens_in_request);
-    size_t src_idx = (i / new_tokens) % total_tokens_in_request;
-    size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens;
-    // Casual Mask
-    if (src_idx > dst_idx) {
-      matrix[i] = value;
-    }
-  }
-}
-
-template <typename DT>
-void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
-                              BatchConfig const *bc,
-                              int shard_id,
-                              DT *output_ptr,
-                              DT const *bias_ptr,
-                              DT const *weight_ptr,
-                              cudaStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-  // int num_requests = bc->num_active_requests();
-  int processed_tokens_in_batch = 0;
-  // int qkv_block_size =
-  //     (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens();
-  int q_block_size = m->qProjSize;
-  int kt_block_size = m->kProjSize;
-  int kt_req_block_size = kt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  int vt_block_size = m->vProjSize;
-  int vt_req_block_size = vt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  assert(m->qProjSize == m->kProjSize);
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i]) {
-      continue;
-    }
-    assert(processed_tokens_in_batch ==
-           bc->requestsInfo[i].first_token_offset_in_batch);
-    int last_token_idx_of_the_request =
-        processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1;
-    while (processed_tokens_in_batch <= last_token_idx_of_the_request) {
-      int num_new_tokens = 1;
-      int j = processed_tokens_in_batch;
-      while ((j + 1 <= last_token_idx_of_the_request) &&
-             (bc->tokensInfo[j].abs_index_in_request + 1 ==
-              bc->tokensInfo[j + 1].abs_index_in_request)) {
-        j++;
-        num_new_tokens++;
-      }
-
-      int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1;
-      assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
-      {
-        // update K-V cache
-        int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens;
-        update_tree_branch_kv_cache<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-            static_cast<DT *>(m->devQKVProjArray),
-            static_cast<DT *>(m->keyCache),
-            static_cast<DT *>(m->valueCache),
-            m->token_infos,
-            m->qProjSize,
-            m->kProjSize,
-            m->vProjSize,
-            num_new_tokens,            // num_tokens_in_branch
-            processed_tokens_in_batch, // num_processed_tokens_in_batch
-            m->num_active_tokens,      // total_tokens_in_batch
-            BatchConfig::max_sequence_length() +
-                BatchConfig::max_spec_tree_token_num(),
-            m->hidden_size);
-      }
-
-      // bc->token_last_available_idx[i] + 1;
-      // Compute (QK^T/sqrt(d_k))
-      int m_ = num_new_tokens;
-      int n = total_tokens_in_request;
-      int k = m->qProjSize;
-      int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads,
-          ldc = m_;
-      int strideA = q_block_size;
-      int strideB = kt_block_size;
-      int strideC = num_new_tokens * total_tokens_in_request;
-
-      // a flag of using this scaling alpha
-      DT alpha = 1.0f, beta = 0.0f;
-      if (*m->qk_prod_scaling) {
-        alpha = static_cast<DT>(1.0f / sqrt(m->kProjSize));
-      }
-      // To get A, skip over Q entries from previous requests (same head)
-      DT const *A = static_cast<DT *>(m->devQKVProjArray) +
-                    processed_tokens_in_batch * m->qProjSize * m->num_q_heads *
-                        QKV_WEIGHT_NUM;
-      // To get B, skip over K entries from previous requests (all heads +
-      // padding)
-      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-      // To get C, skip over QK^T products from previous requests
-      DT *C = static_cast<DT *>(m->qk_prods);
-
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-      // add alibi position bias to qk production
-      // add alibi position bias to qk production
-      if (*m->position_bias) {
-        size_t parallelism =
-            m->num_q_heads * total_tokens_in_request * num_new_tokens;
-        apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
-                                    min((size_t)CUDA_NUM_THREADS, parallelism),
-                                    0,
-                                    stream>>>(C,
-                                              num_new_tokens,
-                                              total_tokens_in_request,
-                                              m->num_q_heads,
-                                              m->global_num_q_heads,
-                                              shard_id);
-      }
-
-      // Fill all elements above diagonal in qk prods with -inf to force
-      // causal attention.
-      assert(num_new_tokens <= total_tokens_in_request);
-      if (num_new_tokens > 1) {
-        size_t parallelism =
-            m->num_q_heads * num_new_tokens * total_tokens_in_request;
-        tree_fill_entries_above_diagonal<<<GET_BLOCKS(parallelism),
-                                           min((size_t)CUDA_NUM_THREADS,
-                                               parallelism),
-                                           0,
-                                           stream>>>(
-            C,
-            num_new_tokens,
-            total_tokens_in_request,
-            m->num_q_heads,
-            static_cast<DT>(-INFINITY));
-      }
-      // Compute Softmax(QK^T/sqrt(d_k))
-      // Before modifying the parameters below, make sure to read the following
-      // description of the CUDNN_TENSOR_NCHW tensor layout, from
-      // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
-      // This tensor format specifies that the data is laid out in the following
-      // order: batch size, feature maps, rows, columns. The strides are
-      // implicitly defined in such a way that the data are contiguous in memory
-      // with no padding between images, feature maps, rows, and columns; the
-      // columns are the inner dimension and the images are the outermost
-      // dimension.
-      int n_param = m->num_q_heads;
-      int c_param = total_tokens_in_request;
-      int h_param = 1;
-      int w_param = num_new_tokens;
-      checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
-                                            CUDNN_TENSOR_NCHW,
-                                            cudnn_data_type,
-                                            n_param,
-                                            c_param,
-                                            h_param,
-                                            w_param));
-      float softmax_alpha = 1.0f, softmax_beta = 0.0f;
-      DT *C_softmax = static_cast<DT *>(m->qk_prods_softmax);
-      // The softmax operation below is executed according to the
-      // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
-      // softmax operation is computed per spatial location (H,W) per image (N)
-      // across dimension C.
-      checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
-                                     CUDNN_SOFTMAX_ACCURATE,
-                                     CUDNN_SOFTMAX_MODE_CHANNEL,
-                                     &softmax_alpha,
-                                     m->qk_tensor,
-                                     C,
-                                     &softmax_beta,
-                                     m->qk_tensor,
-                                     C_softmax));
-      // Matmul softmax(QK^T/sqrt(d_k)) by V
-      alpha = 1.0f, beta = 0.0f;
-      m_ = m->vProjSize;
-      n = num_new_tokens;
-      k = total_tokens_in_request;
-      lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
-      strideA = vt_block_size;
-      strideB = num_new_tokens * total_tokens_in_request;
-      strideC = m->vProjSize;
-      // To get A, skip over V^T entries from previous requests (all heads +
-      // padding)
-      A = static_cast<DT *>(m->valueCache) + i * vt_req_block_size;
-      // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      B = C_softmax;
-      // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-      // requests
-      C = static_cast<DT *>(m->attn_heads) +
-          processed_tokens_in_batch * m->num_q_heads * m->vProjSize;
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_T,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-      processed_tokens_in_batch += num_new_tokens;
-    }
-    // Before moving to the next request
-    // check that we have finished all tokens of the request
-    assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch);
-  }
-  // Project to output, save result directly on output tensor
-  DT alpha = 1.0f, beta = 0.0f;
-  int m_ = m->oProjSize;
-  int k = m->vProjSize * m->num_q_heads;
-  int n = processed_tokens_in_batch;
-  int lda = k, ldb = k, ldc = m_;
-  DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
-                                         m->kProjSize * m->num_q_heads +
-                                         m->vProjSize * m->num_q_heads);
-  DT const *B = static_cast<DT *>(m->attn_heads);
-  DT *C = static_cast<DT *>(output_ptr);
-
-  checkCUDA(cublasGemmEx(m->handle.blas,
-                         CUBLAS_OP_T,
-                         CUBLAS_OP_N,
-                         m_,
-                         n,
-                         k,
-                         &alpha,
-                         A,
-                         cublas_data_type,
-                         lda,
-                         B,
-                         cublas_data_type,
-                         ldb,
-                         &beta,
-                         C,
-                         cublas_data_type,
-                         ldc,
-                         compute_type,
-                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-  if (*m->final_bias && shard_id == 0) {
-    int parallelism = m->oProjSize * processed_tokens_in_batch;
-    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
-                          m->kProjSize * m->global_num_q_heads +
-                          m->vProjSize * m->global_num_q_heads;
-    apply_proj_bias_w<<<GET_BLOCKS(parallelism),
-                        min(CUDA_NUM_THREADS, parallelism),
-                        0,
-                        stream>>>(output_ptr,
-                                  bias_ptr,
-                                  processed_tokens_in_batch,
-                                  qkv_weight_size,
-                                  m->oProjSize);
-  }
-
-  assert(processed_tokens_in_batch == bc->num_active_tokens());
-}
-
 #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
                                                   Dh,                          \
                                                   Dh_MAX,                      \

From 6370a0d570527c527a2a531a4ab777bafba16a7d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 13:19:33 -0700
Subject: [PATCH 314/667] chore: change file output to check alignment

---
 src/runtime/request_manager.cc | 46 +++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5d9920081..285c8f068 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -352,7 +352,8 @@ RequestManager::RequestGuid
       output = output + " " + std::to_string(request.tokens[i]);
     }
     log_req_mgr.print("%s", output.c_str());
-    write_to_output_file(output_filepath, output);
+    std::cout << output << std::endl;
+    // write_to_output_file(output_filepath, output);
   }
 
   GenerationResult gr;
@@ -504,6 +505,43 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     std::cout << "<eos>";
   }
   std::cout << std::endl << std::endl;
+  {
+    RequestProfileInfo profile_info = profiling_requests[guid];
+
+    std::ostream *os = &std::cout;
+    std::ofstream output_file;
+    if (!output_filepath.empty()) {
+      output_file.open(output_filepath, std::ios::app);
+      if (output_file.is_open()) {
+        os = &output_file;
+      } else {
+        std::cout << "Unable to open the output file: " << output_filepath
+                  << std::endl;
+        assert(false);
+      }
+    }
+    *os << "Request " << guid << " profiling: " << std::endl;
+    if (profile_info.start_decoding_time != 0) {
+      *os << "Decoding time: "
+          << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
+          << " ms" << std::endl;
+    } else {
+      *os << "Decoding time: 0 ms" << std::endl;
+    }
+    *os << "Total time: "
+        << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
+        << std::endl;
+    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
+      if (decoding_mode == SPECULATIVE_DECODING) {
+        *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
+            << std::endl;
+      }
+      *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+
+      if (!output_filepath.empty()) {
+        output_file.close();
+      }
+  }
   RequestProfileInfo profile_info = profiling_requests[guid];
   std::string str = "[" + std::to_string(guid) + "] Request completed:" + 
                       " decoding_time_ms(" + std::to_string(
@@ -522,7 +560,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       profile_info.ssm_decoding_steps) 
       + ")";
   }
-  write_to_output_file(output_filepath, str);
+  std::cout << str << std::endl;
+  // write_to_output_file(output_filepath, str);
 
   trigger_request_completion_future(guid);
 }
@@ -2028,7 +2067,8 @@ void RequestManager::terminate_background_server() {
     }
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
-    write_to_output_file(output_filepath, str);
+    std::cout << str << std::endl;
+    // write_to_output_file(output_filepath, str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate
     Runtime *runtime = Runtime::get_runtime();

From 516f2901159f25ca1458db1ca26ab6c4276eabc1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 13:22:10 -0700
Subject: [PATCH 315/667] chore: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 3 +++
 src/runtime/request_manager.cc               | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 784cfd6e8..6ff29b592 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -356,6 +356,9 @@ __global__ void commit_tokens_kernel(
 
     int token_pos = i / (hidden_size);
     int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index;
+    if (token_idx_in_last_batch == -1) {
+      return;
+    }
     int offset = i % hidden_size;
     assert(token_idx_in_last_batch < num_active_tokens_in_last_batch);
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 285c8f068..e3ab8d544 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -712,7 +712,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
     for (int i = 0; i < prefill_request->num_tokens_in_batch; i++) {
       prefill_request->committed_tokens.push_back(Request::CommittedToken{
-          i,
+          -1,
           committed_token_offset + i,
           prefill_request->tokens[i + committed_token_offset]});
     }

From 5e39ca1bf6b9f8a191c08886eb34eef4c3c1f1a3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 3 Jun 2024 13:35:15 -0700
Subject: [PATCH 316/667] chore: minor

---
 src/runtime/request_manager.cc | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e3ab8d544..128fd4671 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -352,8 +352,7 @@ RequestManager::RequestGuid
       output = output + " " + std::to_string(request.tokens[i]);
     }
     log_req_mgr.print("%s", output.c_str());
-    std::cout << output << std::endl;
-    // write_to_output_file(output_filepath, output);
+    write_to_output_file("", output);
   }
 
   GenerationResult gr;
@@ -560,8 +559,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       profile_info.ssm_decoding_steps) 
       + ")";
   }
-  std::cout << str << std::endl;
-  // write_to_output_file(output_filepath, str);
+  write_to_output_file("", str);
 
   trigger_request_completion_future(guid);
 }
@@ -2067,8 +2065,7 @@ void RequestManager::terminate_background_server() {
     }
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
-    std::cout << str << std::endl;
-    // write_to_output_file(output_filepath, str);
+    write_to_output_file("", str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate
     Runtime *runtime = Runtime::get_runtime();

From 0d85f589fea2ab315164ae625376ab342a06a15b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 00:19:20 -0700
Subject: [PATCH 317/667] feat: change commit tokens behavior

---
 include/flexflow/batch_config.h              | 6 +++---
 include/flexflow/request_manager.h           | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 src/runtime/batch_config.cc                  | 2 +-
 src/runtime/request_manager.cc               | 5 +++--
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index e0f02ff20..0f36e1b23 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -83,9 +83,9 @@ class BatchConfig {
   };
 
   struct CommittedTokensInfo {
-    int token_index = -1;   // the index of the token in the previous batch
-    int request_index = -1; // request index in the batch
-    int token_depth = -1;   // position of the token in the request's sequence
+    int index_in_kv_cache = -1; // the index in the temporary key-value cache
+    int request_index = -1;     // request index in the batch
+    int token_depth = -1; // position of the token in the request's sequence
   };
 
   class BitMask {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 46604da42..52e01b53b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -100,7 +100,7 @@ struct Request {
   // cache: prompt_length + generated_sequence_length +
   // index_in_committed_tokens.
   //
-  // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_index
+  // from_index -> TreeVerifyBatchConfig::CommittedTokensInfo.index_in_kv_cache
   // to_index -> TreeVerifyBatchConfig::CommittedTokensInfo.token_depth
   //
   // Actually, for a committed token, the `to_index` for the LLM KV cache and
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6ff29b592..d905de525 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -355,7 +355,7 @@ __global__ void commit_tokens_kernel(
   CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) {
 
     int token_pos = i / (hidden_size);
-    int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index;
+    int token_idx_in_last_batch = committedTokenInfos[token_pos].index_in_kv_cache;
     if (token_idx_in_last_batch == -1) {
       return;
     }
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 15b14e547..b177511ee 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -171,7 +171,7 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
     os << "Committed tokens info:\n";
     for (int i = 0; i < bc.num_tokens_to_commit; i++) {
       os << "  Token " << i << ":\n";
-      os << "    Token index: " << bc.committed_tokens[i].token_index
+      os << "    Index in kv cache: " << bc.committed_tokens[i].index_in_kv_cache
          << std::endl;
       os << "    Request index: " << bc.committed_tokens[i].request_index
          << std::endl;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 128fd4671..8708c8a1e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -933,7 +933,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       if (!request.llm_committed) {
         // Committed tokens
         for (int i = 0; i < request.committed_tokens.size() - 1; i++) {
-          bc.committed_tokens[bc.num_tokens_to_commit].token_index =
+          bc.committed_tokens[bc.num_tokens_to_commit].index_in_kv_cache =
               request.committed_tokens[i].from_index;
           bc.committed_tokens[bc.num_tokens_to_commit].request_index =
               request_index;
@@ -1292,7 +1292,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
             committed_tokens.at(committed_token_index);
         new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
             request_index;
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_index =
+        new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
             committed_token.from_index;
         new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
             committed_token.to_index;
@@ -1662,6 +1662,7 @@ void RequestManager::get_verify_results_greedy(
     assert(request.status == Request::RUNNING);
 
     int llm_result_offset = request.first_token_offset_in_batch;
+    int llm_cache_size = request.llm_cache_size;
     int committed_token_index = request.tokens.size() - 1;
 
     TokenTree &token_tree = request.speculative_token_trees[0];

From 146fa2efd30fbc93126cee6ed20013a2fe9f4d8d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 02:27:38 -0700
Subject: [PATCH 318/667] feat: eliminate usage of devQKVProjArray in
 commit_tokens_kernel

---
 src/ops/tree_inc_multihead_self_attention.cu | 43 ++++++++------------
 src/runtime/request_manager.cc               |  6 +--
 2 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d905de525..a04a0789d 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -347,34 +347,27 @@ __global__ void commit_tokens_kernel(
     int qProjSize,
     int kProjSize,
     int vProjSize,
-    int num_tokens_to_commit,
+    int token_pos,
     int num_active_tokens_in_last_batch,
     int max_seq_len,
     int hidden_size) {
+  int const index_in_kv_cache = committedTokenInfos[token_pos].index_in_kv_cache;
+  if (index_in_kv_cache == -1) {
+    return;
+  }
 
-  CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) {
-
-    int token_pos = i / (hidden_size);
-    int token_idx_in_last_batch = committedTokenInfos[token_pos].index_in_kv_cache;
-    if (token_idx_in_last_batch == -1) {
-      return;
-    }
-    int offset = i % hidden_size;
-    assert(token_idx_in_last_batch < num_active_tokens_in_last_batch);
-
-    size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size +
-                     hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
+  int const req_id = committedTokenInfos[token_pos].request_index;
+  int const tok_id = committedTokenInfos[token_pos].token_depth;
 
-    int const req_id = committedTokenInfos[token_pos].request_index;
-    int const tok_id = committedTokenInfos[token_pos].token_depth;
+  size_t from_idx = req_id * (hidden_size * max_seq_len) +
+                    index_in_kv_cache * hidden_size;
+  size_t to_idx = req_id * (hidden_size * max_seq_len) +
+                  tok_id * hidden_size;
+  assert(to_idx < from_idx);
 
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = vVal;
+  CUDA_KERNEL_LOOP(offset, hidden_size) {
+    kCache_ptr[to_idx + offset] = kCache_ptr[from_idx + offset];
+    vCache_ptr[to_idx + offset] = vCache_ptr[from_idx + offset];
   }
 }
 
@@ -383,8 +376,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
-  if (num_tokens_to_commit > 0) {
-    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
+  for (int i = 0; i < num_tokens_to_commit; i++) {
+    int parallelism = m->hidden_size;
     commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                            min(CUDA_NUM_THREADS, parallelism),
                            0,
@@ -396,7 +389,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
         m->qProjSize,
         m->kProjSize,
         m->vProjSize,
-        num_tokens_to_commit,
+        i,
         m->num_active_tokens, // number of active tokens in previous batch
         BatchConfig::max_sequence_length() +
             BatchConfig::max_spec_tree_token_num(),
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8708c8a1e..0e0d7e448 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1662,13 +1662,13 @@ void RequestManager::get_verify_results_greedy(
     assert(request.status == Request::RUNNING);
 
     int llm_result_offset = request.first_token_offset_in_batch;
-    int llm_cache_size = request.llm_cache_size;
+    int llm_cache_size = request.tokens.size() - 1;
     int committed_token_index = request.tokens.size() - 1;
 
     TokenTree &token_tree = request.speculative_token_trees[0];
     // First add the root to the committed tokens
     request.committed_tokens.push_back(Request::CommittedToken(
-        llm_result_offset, committed_token_index, request.tokens.back()));
+        llm_cache_size, committed_token_index, request.tokens.back()));
     committed_token_index++;
     // Don't add it to request.tokens because it has already been added.
 
@@ -1714,7 +1714,7 @@ void RequestManager::get_verify_results_greedy(
             // pruned tokens)
             // to_index: the committed token index in the request
             request.committed_tokens.push_back(
-                Request::CommittedToken(llm_result_offset + current_token_index,
+                Request::CommittedToken(llm_cache_size + current_token_index,
                                         committed_token_index,
                                         node_ptr->id));
             request.tokens.push_back(node_ptr->id);

From 94f7abf666bc328573b63d7a1cbfbc293e134460 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 02:28:01 -0700
Subject: [PATCH 319/667] chore: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a04a0789d..5d8b878c4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -340,7 +340,6 @@ __global__ void compute_attention_kernel_fused_kernel(
 
 template <typename DT>
 __global__ void commit_tokens_kernel(
-    DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
@@ -363,7 +362,7 @@ __global__ void commit_tokens_kernel(
                     index_in_kv_cache * hidden_size;
   size_t to_idx = req_id * (hidden_size * max_seq_len) +
                   tok_id * hidden_size;
-  assert(to_idx < from_idx);
+  assert(to_idx <= from_idx);
 
   CUDA_KERNEL_LOOP(offset, hidden_size) {
     kCache_ptr[to_idx + offset] = kCache_ptr[from_idx + offset];
@@ -382,7 +381,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                            min(CUDA_NUM_THREADS, parallelism),
                            0,
                            stream>>>(
-        static_cast<DT *>(m->devQKVProjArray),
         static_cast<DT *>(m->keyCache),
         static_cast<DT *>(m->valueCache),
         m->committed_token_infos,

From c6f4b0a5f546a6ab7959d617285f4bb1e904e45e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 02:34:05 -0700
Subject: [PATCH 320/667] chore: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5d8b878c4..03108eb63 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -396,7 +396,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void update_tree_branch_kv_cache_fused(
+__global__ void update_tree_branch_kv_cache_kernel(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
@@ -481,7 +481,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   //  update K-V cache
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->hidden_size * num_new_tokens;
-  update_tree_branch_kv_cache_fused<<<GET_BLOCKS(parallelism),
+  update_tree_branch_kv_cache_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
                                       stream>>>(

From e63c0422bcc3810118aab5da55779f83008f97d4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 03:43:10 -0700
Subject: [PATCH 321/667] feat: compact q_vec into continuous one

---
 src/ops/tree_inc_multihead_self_attention.cu | 73 ++++++++++----------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 03108eb63..41a8a57f0 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -127,7 +127,7 @@ __global__ void compute_attention_kernel_fused_kernel(
   // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
   __shared__ float red_smem[WARPS_PER_BLOCK * 2];
 
-  const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM +
+  const DT *q_ptr = query + first_token_idx * hidden_size +
                     head_idx * per_head_size;
   __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
 
@@ -154,7 +154,7 @@ __global__ void compute_attention_kernel_fused_kernel(
 #pragma unroll
     for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
       q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
-          q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki +
+          q_ptr + (hidden_size * qi) + ki +
           ii * THREADS_PER_KEY * K_VEC_SIZE);
 
       // if (head_idx == 0 && request_idx == 1 && tidx == 0) {
@@ -397,7 +397,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 __global__ void update_tree_branch_kv_cache_kernel(
-    DT const *devQKVProjArray,
+    DT *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
@@ -405,27 +405,24 @@ __global__ void update_tree_branch_kv_cache_kernel(
     int qProjSize,
     int kProjSize,
     int vProjSize,
-    int num_new_tokens,
+    int token_idx,
     int max_seq_len,
     int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) {
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-    int token_idx = i / hidden_size;
-    int offset = i % hidden_size;
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
+  size_t from_idx =
+        token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_idx = req_idx * (hidden_size * max_seq_len) +
+                  token_abs_idx * hidden_size;
 
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_idx = tokenInfos[token_idx].request_index;
-    int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-    // int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
-
-    kCache_ptr[req_idx * (hidden_size * max_seq_len) +
-               token_abs_idx * hidden_size + offset] = kVal;
-    vCache_ptr[req_idx * (hidden_size * max_seq_len) +
-               token_abs_idx * hidden_size + offset] = vVal;
+  CUDA_KERNEL_LOOP(offset, hidden_size) {
+    kCache_ptr[to_idx + offset] = 
+               devQKVProjArray[from_idx + hidden_size + offset];
+    vCache_ptr[to_idx + offset] = 
+               devQKVProjArray[from_idx + hidden_size * 2 + offset];
+    devQKVProjArray[token_idx * hidden_size + offset] = 
+               devQKVProjArray[from_idx + offset];
   }
 }
 
@@ -480,23 +477,25 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   // update the kv cache
   //  update K-V cache
   int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  update_tree_branch_kv_cache_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<DT *>(m->keyCache),
-      static_cast<DT *>(m->valueCache),
-      m->token_infos,
-      m->request_infos,
-      m->qProjSize,
-      m->kProjSize,
-      m->vProjSize,
-      num_new_tokens,
-      BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num(),
-      m->hidden_size);
+  int parallelism = m->hidden_size;
+  for (int i = 0; i < num_new_tokens; i++) {
+    update_tree_branch_kv_cache_kernel<<<GET_BLOCKS(parallelism),
+                                        min(CUDA_NUM_THREADS, parallelism),
+                                        0,
+                                        stream>>>(
+        static_cast<DT *>(m->devQKVProjArray),
+        static_cast<DT *>(m->keyCache),
+        static_cast<DT *>(m->valueCache),
+        m->token_infos,
+        m->request_infos,
+        m->qProjSize,
+        m->kProjSize,
+        m->vProjSize,
+        i,
+        BatchConfig::max_sequence_length() +
+            BatchConfig::max_spec_tree_token_num(),
+        m->hidden_size);
+  }
 
   // cudaEvent_t t_start, t_end;
   // cudaEventCreate(&t_start);

From ed4fbc067c0543972f8d76b2006e0fdaacce996e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 04:15:45 -0700
Subject: [PATCH 322/667] chore: add TODOs

---
 src/ops/tree_inc_multihead_self_attention.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 41a8a57f0..295f68edf 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -375,6 +375,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
+  // TODO: parallel across queries
   for (int i = 0; i < num_tokens_to_commit; i++) {
     int parallelism = m->hidden_size;
     commit_tokens_kernel<<<GET_BLOCKS(parallelism),
@@ -478,6 +479,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   //  update K-V cache
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->hidden_size;
+  // TODO: parallel across queries
   for (int i = 0; i < num_new_tokens; i++) {
     update_tree_branch_kv_cache_kernel<<<GET_BLOCKS(parallelism),
                                         min(CUDA_NUM_THREADS, parallelism),

From cc515996011651ccc12fdf0703eca1094b695d03 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 07:02:02 -0700
Subject: [PATCH 323/667] chore: add CustomMask to BatchConfig

---
 include/flexflow/batch_config.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 0f36e1b23..62ec76ffe 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -148,6 +148,29 @@ class BatchConfig {
     }
   };
 
+  class CustomMask {
+  public:
+    CustomMask() {
+      float neg_inf = -std::numeric_limits<float>::infinity();
+      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
+        for (int j = 0; j < MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS; j++) {
+          mask[i][j] = neg_inf;
+        }
+      }
+    }
+
+    CustomMask(CustomMask const &other) {
+      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
+        for (int j = 0; j < MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS; j++) {
+          mask[i][j] = other.mask[i][j];
+        }
+      }
+    }
+
+    float mask[MAX_SPEC_TREE_TOKEN_NUM][MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS];
+  };
+
+  // CustomMask custom_mask[MAX_NUM_REQUESTS];
   BitMask causalMask[MAX_NUM_REQUESTS];
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];

From 433ae1c9c501015117033bd92b4cf436438c151a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 08:30:50 -0700
Subject: [PATCH 324/667] feat: add custom_mask in tree_verify_attn op (GPU
 mem)

---
 include/flexflow/batch_config.h               | 23 -------------------
 .../ops/tree_inc_multihead_self_attention.h   |  3 ++-
 src/ops/tree_inc_multihead_self_attention.cu  | 13 +++++++++--
 3 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 62ec76ffe..0f36e1b23 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -148,29 +148,6 @@ class BatchConfig {
     }
   };
 
-  class CustomMask {
-  public:
-    CustomMask() {
-      float neg_inf = -std::numeric_limits<float>::infinity();
-      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
-        for (int j = 0; j < MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS; j++) {
-          mask[i][j] = neg_inf;
-        }
-      }
-    }
-
-    CustomMask(CustomMask const &other) {
-      for (int i = 0; i < MAX_SPEC_TREE_TOKEN_NUM; i++) {
-        for (int j = 0; j < MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS; j++) {
-          mask[i][j] = other.mask[i][j];
-        }
-      }
-    }
-
-    float mask[MAX_SPEC_TREE_TOKEN_NUM][MAX_SPEC_TREE_TOKEN_NUM + MAX_NUM_TOKENS];
-  };
-
-  // CustomMask custom_mask[MAX_NUM_REQUESTS];
   BitMask causalMask[MAX_NUM_REQUESTS];
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 45a7a6b56..a9806ce1f 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -145,7 +145,8 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 
 public:
   int num_active_tokens;
-  Realm::RegionInstance committed_token_reserve_inst;
+  Realm::RegionInstance custom_mask_reserve_inst;
+  float *custom_mask;
   BatchConfig::CommittedTokensInfo *committed_token_infos;
   BatchConfig::BitMask *causalMask;
 };
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 295f68edf..da16943c2 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -739,6 +739,15 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
   checkCUDA(get_legion_stream(&stream));
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
 
+  {
+    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
+                              BatchConfig::max_spec_tree_token_num() *
+                              (BatchConfig::max_spec_tree_token_num() +
+                                BatchConfig::max_sequence_length());
+    gpu_mem_allocator.create_legion_instance(custom_mask_reserve_inst, sizeof(float) * custom_mask_size);
+    custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
+  }
+
   // allocate memory for the seqArray and reserve space
   {
     causalMask = reinterpret_cast<BatchConfig::BitMask *>(
@@ -758,8 +767,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 }
 
 TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
-  if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) {
-    committed_token_reserve_inst.destroy();
+  if (custom_mask_reserve_inst != Realm::RegionInstance::NO_INST) {
+    custom_mask_reserve_inst.destroy();
   }
 }
 

From ff7aa52a5877b14a203a14106d19291e006926ba Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 09:26:01 -0700
Subject: [PATCH 325/667] feat: update custom_mask on gpu side

---
 .../inc_multihead_self_attention_utils.cuh    |  3 +
 src/ops/tree_inc_multihead_self_attention.cu  | 71 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index 481243867..fbe0c5547 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -523,5 +523,8 @@ struct threads_per_value_t {
 #define test_bit(bit_mask, idx, pos)                                           \
   (((bit_mask)[idx][(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
+#define test_bit_orig(bit_mask, idx, pos)                                           \
+  (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
+
 } // namespace FlexFlow
 #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index da16943c2..1d3d77b8e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -396,6 +396,74 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
+__global__ void update_custom_mask_kernel(
+    float *custom_mask,
+    BatchConfig::BitMask *causalMask,
+    BatchConfig::PerRequestInfo *request_infos,
+    bool *request_available,
+    int const num_requests,
+    int const max_q_length,
+    int const max_kv_length,
+    float mask_value) {
+  // get thread idx in [0, num_requests * max_q_length)
+  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // get (request_idx, q_idx) from thread idx
+  int const request_idx = idx / max_q_length;
+  int const q_idx = idx % max_q_length;
+  
+  // request id in batch config
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+    }
+  }
+
+  int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch;
+  int const q_start =
+      request_infos[requext_idx_in_batch].first_token_index_in_request;
+  if (q_idx >= q_length) {
+    return;
+  }
+  assert(q_start + q_length <= max_kv_length);
+
+  float *mask = custom_mask + request_idx * max_q_length * max_kv_length +
+                q_idx * (q_start + q_length);
+  // update custom mask
+  for (int i = 0; i < q_start; i++) {
+    mask[i] = 0.0f;
+  }
+  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
+  for (int i = 0; i < q_length; i++) {
+    mask[q_start + i] = test_bit_orig(bitmask->bit_mask, q_idx, i)
+                  ? 0.0f : mask_value;
+  }
+}
+
+void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
+                        BatchConfig const *bc,
+                        cudaStream_t stream) {
+  int const num_requests = bc->num_active_requests();
+  int const max_q_length = BatchConfig::max_spec_tree_token_num();
+  int const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+                            BatchConfig::max_sequence_length();
+  int parallelism = num_requests * max_q_length;
+  update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
+                              min(CUDA_NUM_THREADS, parallelism),
+                              0,
+                              stream>>>(
+      m->custom_mask,
+      m->causalMask,
+      m->request_infos,
+      m->request_available,
+      num_requests,
+      max_q_length,
+      max_kv_length,
+      -std::numeric_limits<float>::infinity());
+}
+
 template <typename DT>
 __global__ void update_tree_branch_kv_cache_kernel(
     DT *devQKVProjArray,
@@ -587,6 +655,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                      bias_ptr,
                      stream);
 
+  // update gpu-side custom mask referring from CaualMask
+  update_custom_mask(m, bc, stream);
+
   // phase 2: No need to update key/val cache
   // IncMultiHeadSelfAttention::update_kv_cache_kernel(
   //    m, bc, stream);

From 55c748010ef7332eb30bfcb648e6fcbc5bce8ab1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 10:56:34 -0700
Subject: [PATCH 326/667] feat: add scratch_space

---
 include/flexflow/ops/tree_inc_multihead_self_attention.h | 3 ++-
 src/ops/tree_inc_multihead_self_attention.cu             | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index a9806ce1f..1a5ff7a9a 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -145,8 +145,9 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 
 public:
   int num_active_tokens;
-  Realm::RegionInstance custom_mask_reserve_inst;
+  Realm::RegionInstance flashinfer_reserve_inst;
   float *custom_mask;
+  float *scratch_space;
   BatchConfig::CommittedTokensInfo *committed_token_infos;
   BatchConfig::BitMask *causalMask;
 };
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1d3d77b8e..a4999c00e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -815,8 +815,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
                                 BatchConfig::max_sequence_length());
-    gpu_mem_allocator.create_legion_instance(custom_mask_reserve_inst, sizeof(float) * custom_mask_size);
+    size_t scratch_space_size = 8 * 1024 * 1024; // 32 MB float
+    gpu_mem_allocator.create_legion_instance(flashinfer_reserve_inst, 
+                sizeof(float) * (custom_mask_size + scratch_space_size));
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
+    scratch_space = gpu_mem_allocator.allocate_instance<float>(scratch_space_size);
   }
 
   // allocate memory for the seqArray and reserve space
@@ -838,8 +841,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 }
 
 TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
-  if (custom_mask_reserve_inst != Realm::RegionInstance::NO_INST) {
-    custom_mask_reserve_inst.destroy();
+  if (flashinfer_reserve_inst != Realm::RegionInstance::NO_INST) {
+    flashinfer_reserve_inst.destroy();
   }
 }
 

From b30dcce455bf266fb3edbd87bcb9223a939b4beb Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 10:57:19 -0700
Subject: [PATCH 327/667] chore: split the attention kernel

---
 src/ops/tree_inc_multihead_self_attention.cu | 58 +++++++++++---------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a4999c00e..b00d16dd2 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -465,7 +465,7 @@ void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void update_tree_branch_kv_cache_kernel(
+__global__ void update_qkv_cache_kernel(
     DT *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
@@ -495,6 +495,34 @@ __global__ void update_tree_branch_kv_cache_kernel(
   }
 }
 
+template <typename DT>
+void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
+                                 BatchConfig const *bc,
+                                 cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size;
+  // TODO: parallel across queries
+  for (int i = 0; i < num_new_tokens; i++) {
+    update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                              min(CUDA_NUM_THREADS, parallelism),
+                              0,
+                              stream>>>(
+        static_cast<DT *>(m->devQKVProjArray),
+        static_cast<DT *>(m->keyCache),
+        static_cast<DT *>(m->valueCache),
+        m->token_infos,
+        m->request_infos,
+        m->qProjSize,
+        m->kProjSize,
+        m->vProjSize,
+        i,
+        BatchConfig::max_sequence_length() +
+            BatchConfig::max_spec_tree_token_num(),
+        m->hidden_size);
+  }
+}
+
 #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
                                                   Dh,                          \
                                                   Dh_MAX,                      \
@@ -542,31 +570,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
                                     BatchConfig const *bc,
                                     DT *output_ptr,
                                     cudaStream_t stream) {
-
-  // update the kv cache
-  //  update K-V cache
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size;
-  // TODO: parallel across queries
-  for (int i = 0; i < num_new_tokens; i++) {
-    update_tree_branch_kv_cache_kernel<<<GET_BLOCKS(parallelism),
-                                        min(CUDA_NUM_THREADS, parallelism),
-                                        0,
-                                        stream>>>(
-        static_cast<DT *>(m->devQKVProjArray),
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>(m->valueCache),
-        m->token_infos,
-        m->request_infos,
-        m->qProjSize,
-        m->kProjSize,
-        m->vProjSize,
-        i,
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
-        m->hidden_size);
-  }
-
   // cudaEvent_t t_start, t_end;
   // cudaEventCreate(&t_start);
   // cudaEventCreate(&t_end);
@@ -658,6 +661,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // update gpu-side custom mask referring from CaualMask
   update_custom_mask(m, bc, stream);
 
+  // update key-val cache, compact q array
+  update_qkv_cache<DT>(m, bc, stream);
+
   // phase 2: No need to update key/val cache
   // IncMultiHeadSelfAttention::update_kv_cache_kernel(
   //    m, bc, stream);

From 638df76c9b3983823bca3af8142750c65abe5147 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 4 Jun 2024 13:47:09 -0700
Subject: [PATCH 328/667] feat: add tree_verify_attention based on flashinfer

---
 src/ops/tree_inc_multihead_self_attention.cu | 130 +++++++++++++++++--
 1 file changed, 119 insertions(+), 11 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index b00d16dd2..9bcfb19e8 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -15,6 +15,7 @@
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
+#include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
@@ -34,6 +35,12 @@ using namespace Kernels::IncMultiHeadAttention;
 namespace Kernels {
 namespace TreeIncMultiHeadAttention {
 
+using namespace flashinfer;
+// using flashinfer::QKVLayout;
+// using flashinfer::PosEncodingMode;
+// using flashinfer::MaskMode;
+// using flashinfer::SinglePrefillWithKVCacheDispatched;
+
 template <typename DT,
           int THREADS_PER_BLOCK,
           int Dh,
@@ -603,6 +610,111 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 }
 
+#define DISPATCH_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
+  if (group_size == 1) {                                     \
+    constexpr size_t GROUP_SIZE = 1;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 4) {                              \
+    constexpr size_t GROUP_SIZE = 4;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 8) {                              \
+    constexpr size_t GROUP_SIZE = 8;                         \
+    __VA_ARGS__                                              \
+  } else {                                                   \
+    std::ostringstream err_msg;                              \
+    err_msg << "Unsupported group_size: " << group_size;     \
+    throw std::invalid_argument(err_msg.str());              \
+  }
+
+#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)     \
+  switch (head_dim) {                                  \
+    case 64: {                                         \
+      constexpr size_t HEAD_DIM = 64;                  \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 128: {                                        \
+      constexpr size_t HEAD_DIM = 128;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 256: {                                        \
+      constexpr size_t HEAD_DIM = 256;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    default: {                                         \
+      std::ostringstream err_msg;                      \
+      err_msg << "Unsupported head_dim: " << head_dim; \
+      throw std::invalid_argument(err_msg.str());      \
+    }                                                  \
+  }
+
+template <typename DT>
+void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
+                          BatchConfig const *bc,
+                          DT *output_ptr,
+                          cudaStream_t stream) {
+  // cudaEvent_t t_start, t_end;
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
+
+  // global constant parameters
+  uint32_t const num_q_heads = m->num_q_heads;
+  uint32_t const num_kv_heads = m->num_kv_heads;
+  uint32_t const group_size = num_q_heads / num_kv_heads;
+  uint32_t const head_dim = m->qProjSize;
+  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+
+  // for finding q, k, v, custom_mask pointers
+  uint32_t const hidden_size = m->hidden_size;
+  uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
+                          BatchConfig::max_spec_tree_token_num();
+  uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
+  uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+                            BatchConfig::max_sequence_length();
+  
+  // flashinfer parameters
+
+  for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
+    if (!bc->request_available[req_idx]) {
+      continue;
+    }
+    BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
+    uint32_t q_len = req->num_tokens_in_batch,
+             q_start = req->first_token_index_in_request,
+             kv_len = q_len + q_start;
+
+    DT* q = static_cast<DT *>(m->devQKVProjArray) + req->first_token_offset_in_batch * hidden_size,
+      * k = static_cast<DT *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
+      * v = static_cast<DT *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
+      * o = output_ptr + req->first_token_offset_in_batch * hidden_size;
+    float* tmp = m->scratch_space;
+    float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
+
+    // DISPATCH_GROUP_SIZE(
+    //   group_size, GROUP_SIZE,
+    //     {DISPATCH_HEAD_DIM(
+    //       head_dim, HEAD_DIM, {
+    SinglePrefillWithKVCacheDispatched<
+        1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+        false, MaskMode::kNone, DT, DT>(
+          q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
+          num_kv_heads, q_len, kv_len, sm_scale,
+          /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+    // })});
+  }
+
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // float elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+}
+
 template <typename DT>
 void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       BatchConfig const *bc,
@@ -648,7 +760,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
         m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
-  // phase 1: Implement kernel to compute KQV for input tokens
+  // Implement kernel to compute KQV for input tokens
   compute_qkv_kernel(m,
                      bc,
                      shard_id,
@@ -658,18 +770,14 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                      bias_ptr,
                      stream);
 
-  // update gpu-side custom mask referring from CaualMask
+  // Update gpu-side custom mask referring from CaualMask
   update_custom_mask(m, bc, stream);
 
-  // update key-val cache, compact q array
+  // Update key-val cache, compact q array
   update_qkv_cache<DT>(m, bc, stream);
 
-  // phase 2: No need to update key/val cache
-  // IncMultiHeadSelfAttention::update_kv_cache_kernel(
-  //    m, bc, stream);
-  // use the new kernel
-  compute_attention_kernel_fused<DT>(
-      m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // Compute attention
+  tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
@@ -738,7 +846,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
 
     half const *bias_ptr =
         use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
-    Kernels::TreeIncMultiHeadAttention::inference_kernel(
+    Kernels::TreeIncMultiHeadAttention::inference_kernel<half>(
         m,
         bc,
         shard_id,
@@ -753,7 +861,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-    Kernels::TreeIncMultiHeadAttention::inference_kernel(
+    Kernels::TreeIncMultiHeadAttention::inference_kernel<float>(
         m,
         bc,
         shard_id,

From 4da083935509db742bf85abd28b11473c04bf3d4 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 5 Jun 2024 04:11:38 -0400
Subject: [PATCH 329/667] Support sampling and speculative sampling.

---
 include/flexflow/inference.h             |  18 +-
 include/flexflow/request_manager.h       |   8 +
 inference/incr_decoding/incr_decoding.cc |   6 +-
 inference/models/llama.cc                |  16 +-
 inference/spec_infer/spec_infer.cc       |  29 ++-
 src/runtime/request_manager.cc           | 214 ++++++++++++++++++++++-
 6 files changed, 270 insertions(+), 21 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index f24a797ff..7f85294aa 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -22,14 +22,22 @@ namespace FlexFlow {
 
 struct GenerationConfig {
   bool do_sample = false;
+  bool spec_sample = false;
   float temperature = 0.8;
+  // top-p renormalization
   float topp = 0.6;
-  GenerationConfig(bool _do_sample, float _temperature, float _topp) {
-    temperature = _temperature > 0 ? _temperature : temperature;
-    topp = _topp > 0 ? _topp : topp;
-    do_sample = _do_sample;
+  // top-k renormalization
+  int topk = 16;
+  GenerationConfig(bool _do_sample = false,
+                   float _temperature = 0.8,
+                   float _topp = 0.6,
+                   bool _spec_sample = false,
+                   int topk = 16)
+      : do_sample(_do_sample), temperature(_temperature), topp(_topp),
+        spec_sample(_spec_sample), topk(topk) {
+    assert(temperature > 0.0);
+    assert(topk <= BatchConfig::MAX_K_LOGITS);
   }
-  GenerationConfig() {}
 };
 
 struct GenerationResult {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 9dc30f9f6..050a13c77 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -383,6 +383,7 @@ class RequestManager {
 
   // LLM result verification
   void get_verify_results_greedy(InferenceResult const &llm_verify_result);
+  void get_verify_results_sample(InferenceResult const &llm_verify_result);
 
   // Bitmask related
   void init_bitmask_prompt(RequestGuid guid, int prompt_length);
@@ -398,6 +399,13 @@ class RequestManager {
   bool add_tokens_to_spec_token_tree(
       InferenceResult const &ssm_inference_result);
   /* ---------- Spec Decoding Helper Functions ---------- */
+  void renormalize(std::vector<std::pair<TokenId, float>> &D,
+                   std::unordered_map<TokenId, float> &R,
+                   TokenId token_id);
+  std::tuple<int, BatchConfig::TokenId, bool>
+      reject_sampling(std::vector<std::pair<TokenId, float>> &D,
+                      std::unordered_map<TokenId, float> &R,
+                      int k);
 };
 
 }; // namespace FlexFlow
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 6a3667d70..99abf1f78 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -131,9 +131,9 @@ void FlexFlow::top_level_task(Task const *task,
   bool use_full_precision = false;
   bool verbose = false;
   bool do_sample = false;
-  float temperature = 0.0f;
-  float topp = 0.0f;
-  int max_requests_per_batch = 8;
+  float temperature = 0.8f;
+  float topp = 0.6f;
+  int max_requests_per_batch = 1;
   int max_tokens_per_batch = 128;
   int max_sequence_length = 256;
   RequestManager::DecodingMode decoding_mode =
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 19bdd85fd..0fa846bc0 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -250,15 +250,23 @@ void LLAMA::create_llama_model(FFModel &ff,
   if (mode == TREE_SEARCH_MODE) {
     Tensor softmax = ff.softmax(dense, -1);
     output = ff.arg_top_k(softmax, llama_config.k_of_arg_topk, false, false);
-    // output = ff.top_k(softmax, )
+  } else if (mode == INC_DECODING_MODE) {
+    if (generation_config.do_sample) {
+      Tensor softmax = ff.softmax(dense, -1);
+      output = ff.sampling(softmax, generation_config.topp);
+    } else {
+      output = ff.argmax(dense, /*beam_Search*/ false);
+    }
   } else {
-    // Tensor softmax = ff.softmax(dense, -1);
     if (generation_config.do_sample) {
       dense = ff.scalar_truediv(dense, generation_config.temperature, false);
       Tensor softmax = ff.softmax(dense, -1);
-      output = ff.sampling(softmax, generation_config.topp);
+      if (generation_config.spec_sample) {
+        output = ff.arg_top_k(softmax, generation_config.topk, false, true);
+      } else {
+        output = ff.sampling(softmax, generation_config.topp);
+      }
     } else {
-      // output = ff.arg_top_k(dense, /*k=*/1, false);
       output = ff.argmax(dense, /*beam_Search*/ false);
     }
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 141538c75..f43b0f197 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -63,7 +63,9 @@ void parse_input_args(char **argv,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_sequence_length,
-                      int &expansion_degree) {
+                      int &expansion_degree,
+                      bool &spec_sampling,
+                      bool &do_sample) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -122,6 +124,15 @@ void parse_input_args(char **argv,
       expansion_degree = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--spec_sampling")) {
+      spec_sampling = true;
+      do_sample = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--do_sample")) {
+      do_sample = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -274,15 +285,17 @@ void FlexFlow::top_level_task(Task const *task,
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
-  int max_requests_per_batch = 8;
-  int max_tokens_per_batch = 256;
-  int max_sequence_length = 512;
+  int max_requests_per_batch = 1;
+  int max_tokens_per_batch = 40;
+  int max_sequence_length = 256;
   int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
-  int max_tree_depth = 16;
+  int max_tree_depth = 8;
   int max_tree_width = 16;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::SPECULATIVE_DECODING;
+  bool spec_sampling = false;
+  bool do_sample = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -296,7 +309,9 @@ void FlexFlow::top_level_task(Task const *task,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_sequence_length,
-                   expansion_degree);
+                   expansion_degree,
+                   spec_sampling,
+                   do_sample);
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -305,7 +320,7 @@ void FlexFlow::top_level_task(Task const *task,
          ffconfig.numNodes * ffconfig.workersPerNode);
 
   // Create SentencePiece tokenizer or OPT tokenizer
-  GenerationConfig generationConfig;
+  GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16);
   InferenceManager *im = InferenceManager::get_inference_manager();
   RequestManager *rm = RequestManager::get_request_manager();
   rm->set_max_requests_per_batch(max_requests_per_batch);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index abe196f2b..0eb93e5fe 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -22,6 +22,7 @@
 #include <future>
 #include <iomanip>
 #include <new>
+#include <random>
 #include <stack>
 #include <stdexcept>
 
@@ -658,11 +659,11 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
-  int committed_token_offset = prefill_request->llm_cache_size;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-  prefill_request->committed_tokens.clear();
 
   if (decoding_mode == SPECULATIVE_DECODING) {
+    int committed_token_offset = prefill_request->llm_cache_size;
+    prefill_request->committed_tokens.clear();
     // Modified the state because the last commitment completes
     prefill_request->llm_committed = true;
     assert(prefill_request->ssm_committed and prefill_request->llm_committed);
@@ -1580,6 +1581,215 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 /* --------- Bitmask Related Functions --------- */
 
+void RequestManager::renormalize(std::vector<std::pair<TokenId, float>> &D,
+                                 std::unordered_map<TokenId, float> &R,
+                                 TokenId token_id) {
+  float token_prob;
+  for (auto &kv : D) {
+    TokenId d_token_id = kv.first;
+    float d_prob = kv.second;
+    if (R.find(d_token_id) != R.end()) {
+      float r_prob = R[d_token_id];
+      R[d_token_id] = max(0.0f, r_prob - d_prob);
+    }
+    if (d_token_id == token_id) {
+      token_prob = d_prob;
+      kv.second = 0.0f;
+    }
+  }
+  // Normalize R
+  float sum_r = 0.0f;
+  for (auto &kv : R) {
+    sum_r += kv.second;
+  }
+  for (auto &kv : R) {
+    kv.second /= (sum_r + 1e-6);
+  }
+  // Normalize D
+  for (auto &kv : D) {
+    kv.second /= (1.0f - token_prob - 1e-6);
+  }
+}
+
+std::tuple<int, BatchConfig::TokenId, bool>
+    RequestManager::reject_sampling(std::vector<std::pair<TokenId, float>> &D,
+                                    std::unordered_map<TokenId, float> &R,
+                                    int k) {
+  assert(D.size() == k);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  double r;
+  for (int i = 0; i < k; ++i) {
+    // Generate a random number in the range [0, 1)
+    r = dis(gen);
+    double d_prob = (double)D[i].second;
+    if (R.find(D[i].first) != R.end()) {
+      double r_prob = (double)R[D[i].first];
+      if (r < d_prob / d_prob + 1e-6) {
+        return {i, D[i].first, true};
+      }
+    }
+    // else, r_prob = 0.0, reject the token
+    renormalize(D, R, D[i].first);
+  }
+  std::vector<double> r_probs;
+  std::vector<BatchConfig::TokenId> r_tokens;
+  for (auto &kv : R) {
+    r_probs.push_back(kv.second);
+    r_tokens.push_back(kv.first);
+  }
+  std::discrete_distribution<> r_dist(r_probs.begin(), r_probs.end());
+  int sampled_index = r_dist(gen);
+  return {-1, r_tokens[sampled_index], false};
+}
+
+void RequestManager::get_verify_results_sample(
+    InferenceResult const &llm_verify_result) {
+  // This function maintain the generated token list of the request and the
+  // committed tokens.
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+
+    int llm_result_offset =
+        request.first_token_offset_in_batch * BatchConfig::MAX_K_LOGITS;
+    int llm_input_offset = request.first_token_offset_in_batch;
+    int committed_token_index = request.tokens.size() - 1;
+
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    // First add the root to the committed tokens
+    request.committed_tokens.push_back(Request::CommittedToken(
+        llm_input_offset, committed_token_index, request.tokens.back()));
+    committed_token_index++;
+    // Don't add it to request.tokens because it has already been added.
+
+    // The position of the last accepted token in its tree layer (includeing
+    // the pruned tokens)
+    int last_accepted_token_index_in_layer = 0;
+    // The index of the last accepted token in the entire tree (excluding the
+    // pruned tokens)
+    int last_accepted_token_index = 0;
+    float last_accepted_token_accumulated_log_prob = 0.0f;
+    int current_token_index = 1; // Because we skip the root
+    bool rejected = false;
+
+    auto layer_it = token_tree.tree_layers.begin();
+    ++layer_it;
+    for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
+      // We skip the first layer
+      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
+      std::vector<std::pair<TokenId, float>> D;
+      std::unordered_map<TokenId, float> R;
+      // Data format: <current_token_index, current_token_index_in_layer,
+      // acc_log_prob>
+      std::unordered_map<TokenId, std::tuple<int, int, float>> d_token_info;
+
+      int current_token_index_in_layer = 0;
+
+      // Iterate through the tokens in the current layer to find the candidate
+      // tokens whose parent is the last accepted token
+      for (auto const &node_ptr : tree_layer) {
+        if (node_ptr->pruned) {
+          // Don't increase current_token_index here
+          current_token_index_in_layer++;
+          continue;
+        }
+        if (node_ptr->parent_pos != last_accepted_token_index_in_layer) {
+          // The token's parent is not accepted
+          current_token_index++;
+          current_token_index_in_layer++;
+          continue;
+        } else {
+          // The token's parent is accepted
+          float prob = std::exp(node_ptr->log_accumulated_prob -
+                                last_accepted_token_accumulated_log_prob);
+          D.push_back({node_ptr->id, prob});
+          d_token_info[node_ptr->id] = {current_token_index,
+                                        current_token_index_in_layer,
+                                        node_ptr->log_accumulated_prob};
+          current_token_index++;
+          current_token_index_in_layer++;
+        }
+      }
+
+      int result_offset = llm_result_offset +
+                          last_accepted_token_index * BatchConfig::MAX_K_LOGITS;
+      for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
+        TokenId token_id = llm_verify_result.token_ids[result_offset + i];
+        R[token_id] = llm_verify_result.probs[result_offset + i];
+      }
+
+      auto [sampled_index, token_id, accepted] =
+          reject_sampling(D, R, D.size());
+      if (accepted) {
+        // The token's parent is accepted, and this token's id equals the
+        // llm's sample at its parent's position. We accept this token.
+        // from_index: the index of the token in the tree (excluding the
+        // pruned tokens)
+        // to_index: the committed token index in the request
+        request.committed_tokens.push_back(Request::CommittedToken(
+            llm_input_offset + std::get<0>(d_token_info[token_id]),
+            committed_token_index,
+            token_id));
+        request.tokens.push_back(token_id);
+
+        last_accepted_token_index = std::get<0>(d_token_info[token_id]);
+        last_accepted_token_index_in_layer =
+            std::get<1>(d_token_info[token_id]);
+        last_accepted_token_accumulated_log_prob =
+            std::get<2>(d_token_info[token_id]);
+        committed_token_index++;
+      } else {
+        request.committed_tokens.push_back(
+            Request::CommittedToken(-1, committed_token_index, token_id));
+        rejected = true;
+        break;
+      }
+    }
+
+    // Add the last token (that is not in the cache of the LLM) if the sampling
+    // procedure succeed in the last layer
+    // from_index: since this token is not in the token tree, the llm doesn't
+    // have its KV cache, so the from_index should be a place holder, which is
+    // -1
+    if (!rejected) {
+      std::unordered_map<TokenId, float> R;
+      std::vector<std::pair<TokenId, float>> D;
+      int result_offset = llm_result_offset +
+                          last_accepted_token_index * BatchConfig::MAX_K_LOGITS;
+      for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
+        TokenId token_id = llm_verify_result.token_ids[result_offset + i];
+        R[token_id] = llm_verify_result.probs[result_offset + i];
+      }
+      auto [sampled_index, token_id, accepted] =
+          reject_sampling(D, R, D.size());
+      request.committed_tokens.push_back(
+          Request::CommittedToken(-1, committed_token_index, token_id));
+      request.tokens.push_back(token_id);
+    }
+
+    request.llm_committed = false;
+    request.ssm_committed = false;
+
+    if (verbose) {
+      std::cout << "Request " << request.guid << " committed tokens: ";
+      for (auto const &committed_token : request.committed_tokens) {
+        std::cout << committed_token.token_id << " ("
+                  << tokenizer_->Decode({committed_token.token_id}) << ") ";
+      }
+      std::cout << std::endl;
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Output sequence: " << output << std::endl;
+    }
+  }
+}
+
 void RequestManager::get_verify_results_greedy(
     InferenceResult const &llm_verify_result) {
   // This function maintain the generated token list of the request and the

From 9cf91b9b35a25dfd36bacd89ed4f89d30d8122c8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 03:06:14 -0700
Subject: [PATCH 330/667] feat: add flashinfer attention kernel instantiate

---
 .../tree_inc_multihead_self_attention_impl.cu | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 src/ops/tree_inc_multihead_self_attention_impl.cu

diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
new file mode 100644
index 000000000..5ce821ad5
--- /dev/null
+++ b/src/ops/tree_inc_multihead_self_attention_impl.cu
@@ -0,0 +1,105 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "cuComplex.h"
+#endif
+#include "flashinfer/attention_impl.cuh"
+
+// This is for instantiating the template attention kernels
+namespace flashinfer {
+
+// group_size[] = {1, 4, 8};
+// head_dim[] = {64, 128, 256};
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kNone, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+  // template cudaError_t SinglePrefillWithKVCacheDispatched<
+  //     1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  //     false, MaskMode::kNone, float, float>(
+  //       float* q, float* k, float* v, float* custom_mask, float* o,
+  //       float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+  //       float sm_scale, float rope_scale,
+  //       float rope_theta, cudaStream_t stream);
+} // namespace flashinfer

From 26fea0c5658c0c4d96c261454bfbeab534b3103d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 03:06:59 -0700
Subject: [PATCH 331/667] feat: use kernel dispatch

---
 src/ops/tree_inc_multihead_self_attention.cu | 139 ++++++++++---------
 1 file changed, 71 insertions(+), 68 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 9bcfb19e8..98b191afb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -22,6 +22,50 @@
 #include "flexflow/ops/tree_inc_multihead_self_attention.h"
 #include "flexflow/utils/cuda_helper.h"
 
+#include <sstream>
+#include <stdexcept>
+
+#define DISPATCH_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
+  if (group_size == 1) {                                     \
+    constexpr size_t GROUP_SIZE = 1;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 4) {                              \
+    constexpr size_t GROUP_SIZE = 4;                         \
+    __VA_ARGS__                                              \
+  } else if (group_size == 8) {                              \
+    constexpr size_t GROUP_SIZE = 8;                         \
+    __VA_ARGS__                                              \
+  } else {                                                   \
+    std::ostringstream err_msg;                              \
+    err_msg << "Unsupported group_size: " << group_size;     \
+    throw std::invalid_argument(err_msg.str());              \
+  }
+
+#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)     \
+  switch (head_dim) {                                  \
+    case 64: {                                         \
+      constexpr size_t HEAD_DIM = 64;                  \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 128: {                                        \
+      constexpr size_t HEAD_DIM = 128;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    case 256: {                                        \
+      constexpr size_t HEAD_DIM = 256;                 \
+      __VA_ARGS__                                      \
+      break;                                           \
+    }                                                  \
+    default: {                                         \
+      std::ostringstream err_msg;                      \
+      err_msg << "Unsupported head_dim: " << head_dim; \
+      throw std::invalid_argument(err_msg.str());      \
+    }                                                  \
+  }
+
+
 namespace FlexFlow {
 
 // declare Legion names
@@ -35,11 +79,10 @@ using namespace Kernels::IncMultiHeadAttention;
 namespace Kernels {
 namespace TreeIncMultiHeadAttention {
 
-using namespace flashinfer;
-// using flashinfer::QKVLayout;
-// using flashinfer::PosEncodingMode;
-// using flashinfer::MaskMode;
-// using flashinfer::SinglePrefillWithKVCacheDispatched;
+using flashinfer::QKVLayout;
+using flashinfer::PosEncodingMode;
+using flashinfer::MaskMode;
+using flashinfer::SinglePrefillWithKVCacheDispatched;
 
 template <typename DT,
           int THREADS_PER_BLOCK,
@@ -610,46 +653,6 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
 
 }
 
-#define DISPATCH_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
-  if (group_size == 1) {                                     \
-    constexpr size_t GROUP_SIZE = 1;                         \
-    __VA_ARGS__                                              \
-  } else if (group_size == 4) {                              \
-    constexpr size_t GROUP_SIZE = 4;                         \
-    __VA_ARGS__                                              \
-  } else if (group_size == 8) {                              \
-    constexpr size_t GROUP_SIZE = 8;                         \
-    __VA_ARGS__                                              \
-  } else {                                                   \
-    std::ostringstream err_msg;                              \
-    err_msg << "Unsupported group_size: " << group_size;     \
-    throw std::invalid_argument(err_msg.str());              \
-  }
-
-#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)     \
-  switch (head_dim) {                                  \
-    case 64: {                                         \
-      constexpr size_t HEAD_DIM = 64;                  \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    case 128: {                                        \
-      constexpr size_t HEAD_DIM = 128;                 \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    case 256: {                                        \
-      constexpr size_t HEAD_DIM = 256;                 \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    default: {                                         \
-      std::ostringstream err_msg;                      \
-      err_msg << "Unsupported head_dim: " << head_dim; \
-      throw std::invalid_argument(err_msg.str());      \
-    }                                                  \
-  }
-
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           BatchConfig const *bc,
@@ -693,17 +696,17 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     float* tmp = m->scratch_space;
     float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
 
-    // DISPATCH_GROUP_SIZE(
-    //   group_size, GROUP_SIZE,
-    //     {DISPATCH_HEAD_DIM(
-    //       head_dim, HEAD_DIM, {
-    SinglePrefillWithKVCacheDispatched<
-        1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+    DISPATCH_GROUP_SIZE(
+      group_size, GROUP_SIZE,
+        {DISPATCH_HEAD_DIM(
+          head_dim, HEAD_DIM, {
+    flashinfer::SinglePrefillWithKVCacheDispatched<
+        GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
         false, MaskMode::kNone, DT, DT>(
           q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
           num_kv_heads, q_len, kv_len, sm_scale,
           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-    // })});
+    })});
   }
 
   // cudaEventRecord(t_end, stream);
@@ -855,22 +858,22 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
         output.get_half_ptr(),
         bias_ptr,
         stream);
-  } else if (input.data_type == DT_FLOAT) {
-    if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
-    }
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-    Kernels::TreeIncMultiHeadAttention::inference_kernel<float>(
-        m,
-        bc,
-        shard_id,
-        input.get_float_ptr(),
-        m->offload ? static_cast<float *>(m->weight_ptr)
-                   : weight.get_float_ptr(),
-        output.get_float_ptr(),
-        bias_ptr,
-        stream);
+  // } else if (input.data_type == DT_FLOAT) {
+  //   if (m->offload) {
+  //     pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+  //   }
+  //   float const *bias_ptr =
+  //       use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
+  //   Kernels::TreeIncMultiHeadAttention::inference_kernel<float>(
+  //       m,
+  //       bc,
+  //       shard_id,
+  //       input.get_float_ptr(),
+  //       m->offload ? static_cast<float *>(m->weight_ptr)
+  //                  : weight.get_float_ptr(),
+  //       output.get_float_ptr(),
+  //       bias_ptr,
+  //       stream);
   } else {
     assert(false && "Unspported data type");
   }

From 0137964c8cd894385e7a7ce35c1cbcfa0eca5968 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 08:07:43 -0700
Subject: [PATCH 332/667] feat: orig attention kernel use custom_mask

---
 src/ops/tree_inc_multihead_self_attention.cu | 51 +++++++++++---------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 98b191afb..5bfc95d59 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -105,6 +105,9 @@ __global__ void compute_attention_kernel_fused_kernel(
     int num_heads,
     int num_requests,
     BatchConfig::BitMask *causalMask,
+    float* custom_mask,
+    int max_q_length,
+    int max_kv_length,
     bool *request_available,
     int qk_smem_sz,
     bool prompt_phase) {
@@ -149,13 +152,7 @@ __global__ void compute_attention_kernel_fused_kernel(
       request_infos[requext_idx_in_batch].num_tokens_in_batch;
   int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
 
-  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
-                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
-  for (int i = tidx; i < qlength; i += THREADS_PER_BLOCK) {
-    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
-      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
-    }
-  }
+  custom_mask = custom_mask + request_idx * max_q_length * max_kv_length;
 
   int non_tree_cache_size =
       causalMask[requext_idx_in_batch].non_tree_cache_size;
@@ -234,7 +231,7 @@ __global__ void compute_attention_kernel_fused_kernel(
             prompt_phase
                 ? (qi + q_start < ti)
                 : (ti >= non_tree_cache_size &&
-                   (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
+                   (custom_mask[qi * tlength + ti - non_tree_cache_size] < -1.0f));
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -291,7 +288,7 @@ __global__ void compute_attention_kernel_fused_kernel(
       bool const mask =
           prompt_phase ? (q_start + qi < ti)
                        : (ti >= non_tree_cache_size &&
-                          (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
+                          (custom_mask[qi * tlength + ti - non_tree_cache_size] < -1.0f));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -611,6 +608,9 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
                    m->num_q_heads,                                             \
                    bc->num_active_requests(),                                  \
                    m->causalMask,                                              \
+                   m->custom_mask,                                             \
+                   max_q_length,                                               \
+                   max_kv_length,                                              \
                    m->request_available,                                       \
                    smem_sz[0],                                                 \
                    prompt_phase)
@@ -628,6 +628,9 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
   dim3 grid(m->num_q_heads, bc->num_active_requests());
   int const per_head_size = m->qProjSize;
   float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+  int max_q_length = BatchConfig::max_spec_tree_token_num();
+  int max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+                      BatchConfig::max_sequence_length();
   // 0->qk production size, 1->total shared size
   // per_head_size: 128, thd_per_v:32, prompt_phase: 0
   int smem_sz[2];
@@ -783,23 +786,23 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
-  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
-  //   float *temp_output = new float[size];
-  //   cudaDeviceSynchronize();
-  //   cudaMemcpy(
-  //       temp_output, m->attn_heads, size * sizeof(float),
-  //       cudaMemcpyDeviceToHost);
-  //   printf("Output: ");
-  //   for (int i = 0; i < 1; ++i) {
-  //     float temp = 0;
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
-  //     }
-  //     printf("%.6f ", temp);
+  // int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  // float *temp_output = new float[size];
+  // cudaDeviceSynchronize();
+  // cudaMemcpy(
+  //     temp_output, m->attn_heads, size * sizeof(float),
+  //     cudaMemcpyDeviceToHost);
+  // printf("Output: ");
+  // for (int i = 0; i < 1; ++i) {
+  //   float temp = 0;
+  //   for (int j = 0; j < m->hidden_size; ++j) {
+  //     temp += temp_output[i * m->hidden_size + j];
   //   }
-  //   printf("\n");
+  //   printf("%.6f ", temp);
+  // }
+  // printf("\n");
 
-  //   delete[] temp_output;
+  // delete[] temp_output;
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 

From 27a830a2eda0f067d1a84e7a368d185fd23b4493 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 08:18:45 -0700
Subject: [PATCH 333/667] fix: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5bfc95d59..e4c694377 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -231,7 +231,7 @@ __global__ void compute_attention_kernel_fused_kernel(
             prompt_phase
                 ? (qi + q_start < ti)
                 : (ti >= non_tree_cache_size &&
-                   (custom_mask[qi * tlength + ti - non_tree_cache_size] < -1.0f));
+                   (custom_mask[qi * tlength + ti] < -1.0f));
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -288,7 +288,7 @@ __global__ void compute_attention_kernel_fused_kernel(
       bool const mask =
           prompt_phase ? (q_start + qi < ti)
                        : (ti >= non_tree_cache_size &&
-                          (custom_mask[qi * tlength + ti - non_tree_cache_size] < -1.0f));
+                          (custom_mask[qi * tlength + ti] < -1.0f));
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -785,6 +785,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // Compute attention
   tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
+  // compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+
   // Debug output:
   // int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
   // float *temp_output = new float[size];

From 44ce741d5069eef92a53affb905457dc1188efa4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 08:29:58 -0700
Subject: [PATCH 334/667] chore: original kernel eliminate CausalMask
 dependency

---
 src/ops/tree_inc_multihead_self_attention.cu | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index e4c694377..b4c3830f3 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -104,7 +104,6 @@ __global__ void compute_attention_kernel_fused_kernel(
     BatchConfig::PerRequestInfo *request_infos,
     int num_heads,
     int num_requests,
-    BatchConfig::BitMask *causalMask,
     float* custom_mask,
     int max_q_length,
     int max_kv_length,
@@ -154,9 +153,6 @@ __global__ void compute_attention_kernel_fused_kernel(
 
   custom_mask = custom_mask + request_idx * max_q_length * max_kv_length;
 
-  int non_tree_cache_size =
-      causalMask[requext_idx_in_batch].non_tree_cache_size;
-
   int const first_token_idx =
       request_infos[requext_idx_in_batch].first_token_offset_in_batch;
 
@@ -230,8 +226,7 @@ __global__ void compute_attention_kernel_fused_kernel(
         bool const mask =
             prompt_phase
                 ? (qi + q_start < ti)
-                : (ti >= non_tree_cache_size &&
-                   (custom_mask[qi * tlength + ti] < -1.0f));
+                : (custom_mask[qi * tlength + ti] < -1.0f);
 
         qk_max = mask ? qk_max : fmaxf(qk_max, qk);
 
@@ -287,8 +282,7 @@ __global__ void compute_attention_kernel_fused_kernel(
     for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
       bool const mask =
           prompt_phase ? (q_start + qi < ti)
-                       : (ti >= non_tree_cache_size &&
-                          (custom_mask[qi * tlength + ti] < -1.0f));
+                       : (custom_mask[qi * tlength + ti] < -1.0f);
       float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
       exp_sum += logit;
       qk_smem[ti - first_step] = mask ? 0.0f : logit;
@@ -607,7 +601,6 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
                    m->request_infos,                                           \
                    m->num_q_heads,                                             \
                    bc->num_active_requests(),                                  \
-                   m->causalMask,                                              \
                    m->custom_mask,                                             \
                    max_q_length,                                               \
                    max_kv_length,                                              \
@@ -783,9 +776,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
-  tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
-  // compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   // int size = m->hidden_size * BatchConfig::max_tokens_per_batch();

From f0b86365d1d2042310689d0ed9e54e8d8cc48bd4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 17:44:43 -0700
Subject: [PATCH 335/667] fix: MaskMode::kCustom

---
 src/ops/tree_inc_multihead_self_attention.cu  |  2 +-
 .../tree_inc_multihead_self_attention_impl.cu | 20 +++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index b4c3830f3..2c1803b86 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -698,7 +698,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
           head_dim, HEAD_DIM, {
     flashinfer::SinglePrefillWithKVCacheDispatched<
         GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-        false, MaskMode::kNone, DT, DT>(
+        false, MaskMode::kCustom, DT, DT>(
           q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
           num_kv_heads, q_len, kv_len, sm_scale,
           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
index 5ce821ad5..95fd1ce7b 100644
--- a/src/ops/tree_inc_multihead_self_attention_impl.cu
+++ b/src/ops/tree_inc_multihead_self_attention_impl.cu
@@ -25,7 +25,7 @@ namespace flashinfer {
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -33,7 +33,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -41,7 +41,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -49,7 +49,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   4, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -57,7 +57,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   4, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -65,7 +65,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   4, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -73,7 +73,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   8, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -81,7 +81,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   8, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -89,7 +89,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   8, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kNone, half, half>(
+  false, MaskMode::kCustom, half, half>(
     half* q, half* k, half* v, float* custom_mask, half* o,
     float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
     float sm_scale, float rope_scale,
@@ -97,7 +97,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
 
   // template cudaError_t SinglePrefillWithKVCacheDispatched<
   //     1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  //     false, MaskMode::kNone, float, float>(
+  //     false, MaskMode::kCustom, float, float>(
   //       float* q, float* k, float* v, float* custom_mask, float* o,
   //       float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
   //       float sm_scale, float rope_scale,

From 10f7dea11cc10af3ba597111182f2967e680daaa Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 18:11:52 -0700
Subject: [PATCH 336/667] chore: add debug output

---
 src/ops/tree_inc_multihead_self_attention.cu | 53 ++++++++++++++------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 2c1803b86..10899bb8c 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -778,26 +778,49 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // Compute attention
   // tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
+  // Debug output:
+  // {
+  //   int size = m->hidden_size * bc->num_active_tokens();
+  //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, size * sizeof(float),
+  //       cudaMemcpyDeviceToHost);
+  //   printf("Output (flashinfer attention) :");
+  //   for (int i = 0; i < 1; ++i) {
+  //     float temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%.6f ", temp);
+  //   }
+  //   printf("\n");
+
+  //   delete[] temp_output;
+  // }
+
   compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
-  // int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
-  // float *temp_output = new float[size];
-  // cudaDeviceSynchronize();
-  // cudaMemcpy(
-  //     temp_output, m->attn_heads, size * sizeof(float),
-  //     cudaMemcpyDeviceToHost);
-  // printf("Output: ");
-  // for (int i = 0; i < 1; ++i) {
-  //   float temp = 0;
-  //   for (int j = 0; j < m->hidden_size; ++j) {
-  //     temp += temp_output[i * m->hidden_size + j];
+  // {
+  //   int size = m->hidden_size * bc->num_active_tokens();
+  //   float *temp_output = new float[size];
+  //   cudaDeviceSynchronize();
+  //   cudaMemcpy(
+  //       temp_output, m->attn_heads, size * sizeof(float),
+  //       cudaMemcpyDeviceToHost);
+  //   printf("Output (original attention) :");
+  //   for (int i = 0; i < 1; ++i) {
+  //     float temp = 0;
+  //     for (int j = 0; j < m->hidden_size; ++j) {
+  //       temp += temp_output[i * m->hidden_size + j];
+  //     }
+  //     printf("%.6f ", temp);
   //   }
-  //   printf("%.6f ", temp);
-  // }
-  // printf("\n");
+  //   printf("\n");
 
-  // delete[] temp_output;
+  //   delete[] temp_output;
+  // }
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 

From fa61220ebfadec7c68da78131c14a6a58b0b9003 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 18:21:16 -0700
Subject: [PATCH 337/667] fix: neg_inf for half

---
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 10899bb8c..170d4fcab 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -502,7 +502,7 @@ void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_requests,
       max_q_length,
       max_kv_length,
-      -std::numeric_limits<float>::infinity());
+      -5e4);
 }
 
 template <typename DT>

From edfa0730ba1cd0409c761a32849f65c5565a9306 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 18:25:56 -0700
Subject: [PATCH 338/667] fix: flashinfer support non-prompt phase

---
 src/ops/tree_inc_multihead_self_attention.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 170d4fcab..4e83de3ff 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -776,7 +776,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
-  // tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  if (!bc->prompt_phase) {
+    tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   // {
@@ -799,7 +800,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
-  compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  } else {
+    compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  }
 
   // Debug output:
   // {

From b1e4d4969eac171e8a8923edd05dbcb05e14272e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 19:22:09 -0700
Subject: [PATCH 339/667] fix: align w/ original attention kernel

---
 src/ops/tree_inc_multihead_self_attention.cu  | 49 ++++--------
 .../tree_inc_multihead_self_attention_impl.cu | 77 +++++++++++++++++++
 2 files changed, 94 insertions(+), 32 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 4e83de3ff..39dc8df1b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -696,12 +696,21 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       group_size, GROUP_SIZE,
         {DISPATCH_HEAD_DIM(
           head_dim, HEAD_DIM, {
-    flashinfer::SinglePrefillWithKVCacheDispatched<
+    if (bc->prompt_phase) {
+      flashinfer::SinglePrefillWithKVCacheDispatched<
         GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-        false, MaskMode::kCustom, DT, DT>(
-          q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
-          num_kv_heads, q_len, kv_len, sm_scale,
+        false, MaskMode::kCausal, DT, DT>(
+          q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
+          /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+    } else {
+      flashinfer::SinglePrefillWithKVCacheDispatched<
+          GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, DT, DT>(
+            q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
+            num_kv_heads, q_len, kv_len, sm_scale,
+            /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+    }
     })});
   }
 
@@ -770,14 +779,15 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                      stream);
 
   // Update gpu-side custom mask referring from CaualMask
-  update_custom_mask(m, bc, stream);
+  if (!bc->prompt_phase) {
+    update_custom_mask(m, bc, stream);
+  }
 
   // Update key-val cache, compact q array
   update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
-  if (!bc->prompt_phase) {
-    tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   // {
@@ -800,31 +810,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
-  } else {
-    compute_attention_kernel_fused<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
-  }
-
-  // Debug output:
-  // {
-  //   int size = m->hidden_size * bc->num_active_tokens();
-  //   float *temp_output = new float[size];
-  //   cudaDeviceSynchronize();
-  //   cudaMemcpy(
-  //       temp_output, m->attn_heads, size * sizeof(float),
-  //       cudaMemcpyDeviceToHost);
-  //   printf("Output (original attention) :");
-  //   for (int i = 0; i < 1; ++i) {
-  //     float temp = 0;
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
-  //     }
-  //     printf("%.6f ", temp);
-  //   }
-  //   printf("\n");
-
-  //   delete[] temp_output;
-  // }
-
   int processed_tokens_in_batch = bc->num_active_tokens();
 
   compute_o_prod_bias(m,
diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
index 95fd1ce7b..07ae82453 100644
--- a/src/ops/tree_inc_multihead_self_attention_impl.cu
+++ b/src/ops/tree_inc_multihead_self_attention_impl.cu
@@ -23,6 +23,83 @@ namespace flashinfer {
 // group_size[] = {1, 4, 8};
 // head_dim[] = {64, 128, 256};
 
+/********** prefill instantiations for half precision **********/
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  4, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+template cudaError_t SinglePrefillWithKVCacheDispatched<
+  8, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
+  false, MaskMode::kCausal, half, half>(
+    half* q, half* k, half* v, float* custom_mask, half* o,
+    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
+    float sm_scale, float rope_scale,
+    float rope_theta, cudaStream_t stream);
+
+
+/********** append instantiations for half precision **********/
+
 template cudaError_t SinglePrefillWithKVCacheDispatched<
   1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
   false, MaskMode::kCustom, half, half>(

From da544333c54463bddcfe5929727226ac16bd5d67 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 19:28:12 -0700
Subject: [PATCH 340/667] chore: remove unused

---
 src/ops/tree_inc_multihead_self_attention.cu | 380 -------------------
 1 file changed, 380 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 39dc8df1b..c3d3e5bb6 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -84,301 +84,6 @@ using flashinfer::PosEncodingMode;
 using flashinfer::MaskMode;
 using flashinfer::SinglePrefillWithKVCacheDispatched;
 
-template <typename DT,
-          int THREADS_PER_BLOCK,
-          int Dh,
-          int Dh_MAX,
-          int THREADS_PER_KEY,
-          int THREADS_PER_VALUE>
-__global__ void compute_attention_kernel_fused_kernel(
-    DT const *query,
-    DT const *key_cache,
-    DT const *value_cache,
-    DT *output_ptr,
-    float const scale,
-    int const max_seq_length,
-    int const max_token_per_batch,
-    int per_head_size,
-    int hidden_size,
-    /* Reserved: BatchConfig Updated */
-    BatchConfig::PerRequestInfo *request_infos,
-    int num_heads,
-    int num_requests,
-    float* custom_mask,
-    int max_q_length,
-    int max_kv_length,
-    bool *request_available,
-    int qk_smem_sz,
-    bool prompt_phase) {
-
-  // q, k
-  using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using K_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using V_vec = typename VEC_V<DT>::Type;
-  using Out_sum = typename Vec_fp32_<V_vec>::Type;
-
-  constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
-
-  constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT);
-  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;
-  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE;
-  // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT);
-
-  // thread id
-  int const tidx = threadIdx.x;
-  // head id
-  int const head_idx = blockIdx.x;
-  // request idx
-  int const request_idx = blockIdx.y;
-
-  // request id in batch config
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-    }
-  }
-
-  // threads converge
-  //   __syncthreads();
-
-  int const first_step = 0;
-
-  int const tlength =
-      request_infos[requext_idx_in_batch].first_token_index_in_request +
-      request_infos[requext_idx_in_batch].num_tokens_in_batch;
-  int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
-
-  custom_mask = custom_mask + request_idx * max_q_length * max_kv_length;
-
-  int const first_token_idx =
-      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
-
-  int q_start =
-      request_infos[requext_idx_in_batch].first_token_index_in_request;
-
-  // shared memory objects
-  extern __shared__ char smem_[];
-
-  float *qk_smem = reinterpret_cast<float *>(smem_);
-  float *out_smem = reinterpret_cast<float *>(smem_);
-
-  float qk_max = -FLT_MAX;
-
-  // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
-  __shared__ float red_smem[WARPS_PER_BLOCK * 2];
-
-  const DT *q_ptr = query + first_token_idx * hidden_size +
-                    head_idx * per_head_size;
-  __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
-
-  // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE
-  int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE;
-  int ki_o = tidx % THREADS_PER_KEY;
-  // the first key's offset for this thread
-  // ko = 0, 0, 0, 0, 1, 1, 1, 1, ....
-  int ko = tidx / THREADS_PER_KEY;
-  // load q tensor
-  Q_vec q_vec[K_VECS_PER_THREAD];
-
-  constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY;
-  // The number of keys per warp.
-  constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
-
-  DT const *k_cache_batch =
-      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
-
-  int ti_end =
-      div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
-
-  for (int qi = 0; qi < qlength; qi += 1) {
-#pragma unroll
-    for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-      q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
-          q_ptr + (hidden_size * qi) + ki +
-          ii * THREADS_PER_KEY * K_VEC_SIZE);
-
-      // if (head_idx == 0 && request_idx == 1 && tidx == 0) {
-      //     printf("laod q %d,  %d %.10f\n",
-      //     request_idx,
-      //            qi,q_vecs[ki_o][ii].x);
-      //   }
-    }
-
-    __syncthreads();
-    for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
-      K_vec k[K_VECS_PER_THREAD];
-      int const ti_circ = ti % max_seq_length;
-#pragma unroll
-      for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-        int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
-        if (ti < tlength) {
-          k[ii] = *reinterpret_cast<K_vec const *>(
-              k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size +
-              jj);
-        }
-      }
-      float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
-
-      if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
-        bool const mask =
-            prompt_phase
-                ? (qi + q_start < ti)
-                : (custom_mask[qi * tlength + ti] < -1.0f);
-
-        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-
-        // if (head_idx == 0 && !mask) {
-        //   printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f,
-        //   %.10f, %d\n",
-        //          request_idx,
-        //          qi,
-        //          ti,
-        //          qk,
-        //          q_vecs[ki_o][0].x,
-        //          k[0].x,
-        //          bitmask->non_tree_cache_size);
-        // }
-        qk_smem[ti - first_step] = mask ? 0.0f : qk;
-      }
-    }
-
-    __syncthreads();
-
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Decompose the thread index into warp and lane.
-    int const warp = tidx / WARP_SIZE;
-    int const lane = tidx % WARP_SIZE;
-
-    // The warp leader writes the max to shared memory.
-    if (lane == 0) {
-      red_smem[warp] = qk_max;
-    }
-
-    // Make sure the products are in shared memory.
-    __syncthreads();
-
-    // The warps finalize the reduction.
-    qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-    for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Broadcast to all the threads in the warp.
-    qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
-
-    // if (head_idx == 0 && qi == 9 && tidx == 0) {
-    //   printf("tree attn first token qk_max %f\n", qk_max);
-    // }
-
-    float exp_sum = 0.f;
-    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      bool const mask =
-          prompt_phase ? (q_start + qi < ti)
-                       : (custom_mask[qi * tlength + ti] < -1.0f);
-      float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
-      exp_sum += logit;
-      qk_smem[ti - first_step] = mask ? 0.0f : logit;
-    }
-
-    // Compute the sum.
-    exp_sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], exp_sum);
-
-    // softmax
-    float inv_sum = __fdividef(1.f, exp_sum + 1.e-6);
-    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      qk_smem[ti - first_step] *= inv_sum;
-    }
-
-    __syncthreads();
-
-    // value projection
-    constexpr int V_VEC_SIZE = 16 / sizeof(DT);
-    // A vector of V elements for the current timestep.
-    // using V_vec_k = typename V_vec_k_<DT, V_VEC_SIZE>::Type;
-    // using V_vec_acum = typename V_vec_acum_fp32_<V_vec_k>::Type;
-
-    // The value computed by this thread.
-    int vo = tidx / THREADS_PER_VALUE;
-    // The hidden dimensions computed by this particular thread.
-    int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE;
-    constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
-
-    Out_sum out;
-    zero(out);
-
-    // The base pointer for the value in the cache buffer.
-    DT const *v_cache_batch =
-        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
-
-    if (Dh == Dh_MAX || vi < Dh) {
-      for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
-        // Load the values from the cache.
-        int const ti_circ = ti % max_seq_length;
-        V_vec v = *reinterpret_cast<V_vec const *>(
-            v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-        float logit = qk_smem[ti - first_step];
-        out = FlexFlow::fma(logit, cast_to_float(v), out);
-      }
-    }
-
-    //   // Make sure we can start writing to shared memory.
-    __syncthreads();
-
-    // Run the final reduction amongst the different groups computing different
-    // partial outputs.
-    if (Dh == Dh_MAX || vi < Dh) {
-#pragma unroll
-      for (int active_groups = V_PER_ITER; active_groups >= 2;
-           active_groups /= 2) {
-
-        // The midpoint in the number of active groups.
-        int midpoint = active_groups / 2;
-
-        // The upper part of active threads store to shared memory.
-        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
-          *reinterpret_cast<Out_sum *>(out_smem + (vo - midpoint) * Dh + vi) =
-              out;
-        }
-        __syncthreads();
-
-        // The bottom warps update their values.
-        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
-          out = add(*reinterpret_cast<Out_sum const *>(out_smem + vo * Dh + vi),
-                    out);
-        }
-        __syncthreads();
-      }
-    }
-
-    // Output the final values.
-    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
-      convert_from_float(*reinterpret_cast<V_vec *>(
-                             output_ptr + (first_token_idx + qi) * hidden_size +
-                             head_idx * per_head_size + vi),
-                         out);
-      // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) {
-      //   printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n",
-      //          out.x,
-      //          out.y,
-      //          out.z,
-      //          out.w,
-      //          vi,
-      //          (first_token_idx + qi) * hidden_size + head_idx *
-      //          per_head_size +
-      //              vi);
-      // }
-    }
-  }
-}
-
 template <typename DT>
 __global__ void commit_tokens_kernel(
     DT *kCache_ptr,
@@ -564,91 +269,6 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
-                                                  Dh,                          \
-                                                  Dh_MAX,                      \
-                                                  THDS_PER_KEY,                \
-                                                  THDS_PER_VALUE,              \
-                                                  THDS_PER_BLOCK,              \
-                                                  stream,                      \
-                                                  prompt_phase)                \
-  smem_size_in_bytes_tree<DT>(m->qProjSize,                                    \
-                              BatchConfig::max_sequence_length() +             \
-                                  BatchConfig::max_spec_tree_token_num(),      \
-                              THDS_PER_VALUE,                                  \
-                              THDS_PER_BLOCK,                                  \
-                              bc,                                              \
-                              smem_sz);                                        \
-  compute_attention_kernel_fused_kernel<DT,                                    \
-                                        THDS_PER_BLOCK,                        \
-                                        Dh,                                    \
-                                        Dh_MAX,                                \
-                                        THDS_PER_KEY,                          \
-                                        THDS_PER_VALUE>                        \
-      <<<grid,                                                                 \
-         THDS_PER_BLOCK,                                                       \
-         smem_sz[1],                                                           \
-         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
-                   static_cast<DT *>(m->keyCache),                             \
-                   static_cast<DT *>(m->valueCache),                           \
-                   output_ptr,                                                 \
-                   scale,                                                      \
-                   BatchConfig::max_sequence_length() +                        \
-                       BatchConfig::max_spec_tree_token_num(),                 \
-                   BatchConfig::max_tokens_per_batch(),                        \
-                   m->qProjSize,                                               \
-                   m->hidden_size,                                             \
-                   m->request_infos,                                           \
-                   m->num_q_heads,                                             \
-                   bc->num_active_requests(),                                  \
-                   m->custom_mask,                                             \
-                   max_q_length,                                               \
-                   max_kv_length,                                              \
-                   m->request_available,                                       \
-                   smem_sz[0],                                                 \
-                   prompt_phase)
-
-template <typename DT>
-void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                    BatchConfig const *bc,
-                                    DT *output_ptr,
-                                    cudaStream_t stream) {
-  // cudaEvent_t t_start, t_end;
-  // cudaEventCreate(&t_start);
-  // cudaEventCreate(&t_end);
-  // cudaEventRecord(t_start, stream);
-
-  dim3 grid(m->num_q_heads, bc->num_active_requests());
-  int const per_head_size = m->qProjSize;
-  float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  int max_q_length = BatchConfig::max_spec_tree_token_num();
-  int max_kv_length = BatchConfig::max_spec_tree_token_num() + 
-                      BatchConfig::max_sequence_length();
-  // 0->qk production size, 1->total shared size
-  // per_head_size: 128, thd_per_v:32, prompt_phase: 0
-  int smem_sz[2];
-  if (per_head_size == 64) {
-    constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
-    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, bc->prompt_phase);
-  } else if (per_head_size == 128) {
-    constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
-    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, bc->prompt_phase);
-  } else {
-    assert(false && "a unsupported head size");
-  }
-
-  // cudaEventRecord(t_end, stream);
-  // checkCUDA(cudaEventSynchronize(t_end));
-  // float elapsed = 0;
-  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
-  // cudaEventDestroy(t_start);
-  // cudaEventDestroy(t_end);
-
-}
-
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           BatchConfig const *bc,

From 6f1d7c932055dc506e3db5d29d5e01273b9e1657 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 5 Jun 2024 20:21:34 -0700
Subject: [PATCH 341/667] chore: finetune max ssm steps

---
 inference/spec_infer/spec_infer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 141538c75..5bb524095 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -279,7 +279,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_sequence_length = 512;
   int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
-  int max_tree_depth = 16;
+  int max_tree_depth = 8;
   int max_tree_width = 16;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::SPECULATIVE_DECODING;

From 792a4b78b0d8e51483bdd9d20ecb856938795819 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 6 Jun 2024 20:01:26 -0400
Subject: [PATCH 342/667] Fixed bugs in the sampling kernel. Fixed bugs in
 request manager profiling. Added seeding option to spec_infer.cc and
 incr_decoding.cc. However for speculative decoding, seeding cannot fully
 control the randomness, because during the verification, tokens in the same
 batch access the random state simultaneously.

---
 include/flexflow/inference.h             |  4 +-
 include/flexflow/ops/sampling.h          |  4 +-
 inference/incr_decoding/incr_decoding.cc | 12 +++-
 inference/spec_infer/spec_infer.cc       | 16 +++--
 src/ops/sampling.cu                      | 24 +++++--
 src/runtime/request_manager.cc           | 86 +++++++++++++-----------
 6 files changed, 89 insertions(+), 57 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 7f85294aa..82da0d879 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -32,9 +32,9 @@ struct GenerationConfig {
                    float _temperature = 0.8,
                    float _topp = 0.6,
                    bool _spec_sample = false,
-                   int topk = 16)
+                   int _topk = 16)
       : do_sample(_do_sample), temperature(_temperature), topp(_topp),
-        spec_sample(_spec_sample), topk(topk) {
+        spec_sample(_spec_sample), topk(_topk) {
     assert(temperature > 0.0);
     assert(topk <= BatchConfig::MAX_K_LOGITS);
   }
diff --git a/include/flexflow/ops/sampling.h b/include/flexflow/ops/sampling.h
index 1696582cc..026be221f 100644
--- a/include/flexflow/ops/sampling.h
+++ b/include/flexflow/ops/sampling.h
@@ -46,10 +46,10 @@ class Sampling : public Op {
   using Params = SamplingParams;
   using Input = ParallelTensor;
   Sampling(FFModel &model,
-           const ParallelTensor input,
+           ParallelTensor const input,
            float top_p,
            char const *name);
-  Sampling(FFModel &model, Sampling const &other, const ParallelTensor input);
+  Sampling(FFModel &model, Sampling const &other, ParallelTensor const input);
   Sampling(FFModel &model,
            Params const &params,
            Input const input,
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 99abf1f78..723441a79 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -47,7 +47,8 @@ void parse_input_args(char **argv,
                       float &topp,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
-                      int &max_sequence_length) {
+                      int &max_sequence_length,
+                      int &sampling_seed) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -105,6 +106,10 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--sampling-seed")) {
+      sampling_seed = std::stoi(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -138,6 +143,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_sequence_length = 256;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
+  int sampling_seed = 0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -153,7 +159,8 @@ void FlexFlow::top_level_task(Task const *task,
                    topp,
                    max_requests_per_batch,
                    max_tokens_per_batch,
-                   max_sequence_length);
+                   max_sequence_length,
+                   sampling_seed);
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -208,6 +215,7 @@ void FlexFlow::top_level_task(Task const *task,
   assert(model_type != ModelType::UNKNOWN &&
          "Invalid LLM model type passed (or no type was passed).");
 
+  srand(sampling_seed);
   GenerationConfig generationConfig(do_sample, temperature, topp);
   RequestManager *rm = RequestManager::get_request_manager();
   rm->set_max_requests_per_batch(max_requests_per_batch);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index f43b0f197..0955020df 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -65,7 +65,8 @@ void parse_input_args(char **argv,
                       int &max_sequence_length,
                       int &expansion_degree,
                       bool &spec_sampling,
-                      bool &do_sample) {
+                      bool &do_sample,
+                      int &sampling_seed) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -124,12 +125,16 @@ void parse_input_args(char **argv,
       expansion_degree = std::stoi(argv[++i]);
       continue;
     }
-    if (!strcmp(argv[i], "--spec_sampling")) {
+    if (!strcmp(argv[i], "--sampling-seed")) {
+      sampling_seed = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--spec-sampling")) {
       spec_sampling = true;
       do_sample = true;
       continue;
     }
-    if (!strcmp(argv[i], "--do_sample")) {
+    if (!strcmp(argv[i], "--do-sample")) {
       do_sample = true;
       continue;
     }
@@ -296,6 +301,7 @@ void FlexFlow::top_level_task(Task const *task,
       RequestManager::SPECULATIVE_DECODING;
   bool spec_sampling = false;
   bool do_sample = false;
+  int sampling_seed = 0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -311,7 +317,8 @@ void FlexFlow::top_level_task(Task const *task,
                    max_sequence_length,
                    expansion_degree,
                    spec_sampling,
-                   do_sample);
+                   do_sample,
+                   sampling_seed);
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -320,6 +327,7 @@ void FlexFlow::top_level_task(Task const *task,
          ffconfig.numNodes * ffconfig.workersPerNode);
 
   // Create SentencePiece tokenizer or OPT tokenizer
+  srand(sampling_seed);
   GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16);
   InferenceManager *im = InferenceManager::get_inference_manager();
   RequestManager *rm = RequestManager::get_request_manager();
diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu
index 461d72ec7..494a5ab3f 100644
--- a/src/ops/sampling.cu
+++ b/src/ops/sampling.cu
@@ -45,8 +45,12 @@ __global__ void init_idxs(int batch_size,
                           int *idx,
                           int *begin_offset,
                           int *end_offset) {
-  CUDA_KERNEL_LOOP(i, total_eles) {
-    idx[i] = i % vocab_size;
+  // +1 to include the upper boundary
+  CUDA_KERNEL_LOOP(i, total_eles + 1) {
+    if (i < total_eles) {
+      // Exclude the last element
+      idx[i] = i % vocab_size;
+    }
     if (i % vocab_size == 0) {
       begin_offset[i / vocab_size] = i;
       end_offset[i / vocab_size] = i;
@@ -55,9 +59,9 @@ __global__ void init_idxs(int batch_size,
 }
 
 __global__ void
-    init_random_kernel(curandState *state, int batch_size, long rand) {
+    init_random_kernel(curandState *state, int batch_size, long seed) {
   CUDA_KERNEL_LOOP(i, batch_size) {
-    curand_init(rand, i, 0, &state[i]);
+    curand_init(seed, i, 0, &state[i]);
   }
 }
 
@@ -74,11 +78,14 @@ __global__ void sampling_topp_kernel(int batch_size,
   int const batch_idx = blockIdx.x;
   __shared__ float random_n;
   __shared__ long long result_idx;
+  __shared__ bool is_end;
 
   // random num
   if (threadIdx.x == 0) {
     // number must < topp
     random_n = curand_uniform(state + batch_idx) * topp;
+    is_end = false;
+    result_idx = vocab_size - 1;
     // printf("batch idx: %d, random num%f\n", batch_idx, random_n);
   }
 
@@ -91,14 +98,19 @@ __global__ void sampling_topp_kernel(int batch_size,
   int offset = batch_idx * vocab_size;
   float prefix_sum = 0.0f;
   BlockPrefixCallbackOp prefix_op(0);
-  result_idx = vocab_size - 1;
 
   for (long long j = threadIdx.x; j < vocab_size; j += blockDim.x) {
     float logit = (float)(sorted_logits[offset + j]);
     BlockScan(temp_storage).InclusiveSum(logit, prefix_sum, prefix_op);
-    prefix_sum /= topp;
+    __syncthreads();
     if (prefix_sum >= random_n) {
       atomicMin(&result_idx, j);
+      is_end = true;
+    }
+    // Synchronize to make sure all threads see the updated flag
+    __syncthreads();
+    if (is_end) {
+      break;
     }
   }
   indices_ptr[batch_idx] = sorted_idx[offset + result_idx];
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index fcf5c5eaf..7b842d98e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -33,7 +33,8 @@ using tokenizers::Tokenizer;
 
 LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
-void write_to_output_file(std::string const &output_filepath, std::string const &str) {
+void write_to_output_file(std::string const &output_filepath,
+                          std::string const &str) {
   std::ostream *os = &std::cout;
   std::ofstream output_file;
   if (!output_filepath.empty()) {
@@ -523,7 +524,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "Request " << guid << " profiling: " << std::endl;
     if (profile_info.start_decoding_time != 0) {
       *os << "Decoding time: "
-          << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
+          << (profile_info.finish_time - profile_info.start_decoding_time) *
+                 1e-3
           << " ms" << std::endl;
     } else {
       *os << "Decoding time: 0 ms" << std::endl;
@@ -531,34 +533,33 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "Total time: "
         << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
         << std::endl;
-    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
-            << std::endl;
-      }
-      *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps
+        << std::endl;
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
+          << std::endl;
+    }
+    *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
 
-      if (!output_filepath.empty()) {
-        output_file.close();
-      }
+    if (!output_filepath.empty()) {
+      output_file.close();
+    }
   }
   RequestProfileInfo profile_info = profiling_requests[guid];
-  std::string str = "[" + std::to_string(guid) + "] Request completed:" + 
-                      " decoding_time_ms(" + std::to_string(
-                        (profile_info.finish_time-
-                          profile_info.start_decoding_time)
-                          *1e-3) + ")" + 
-                      " total_time_ms(" + std::to_string(
-                        (profile_info.finish_time-
-                          profile_info.start_time)
-                          *1e-3) + ")" + 
-                      " LLM_decoding_steps(" + std::to_string(
-                        profile_info.llm_decoding_steps) 
-                        + ")";
+  std::string str =
+      "[" + std::to_string(guid) +
+      "] Request completed:" + " decoding_time_ms(" +
+      std::to_string(
+          (profile_info.finish_time - profile_info.start_decoding_time) *
+          1e-3) +
+      ")" + " total_time_ms(" +
+      std::to_string((profile_info.finish_time - profile_info.start_time) *
+                     1e-3) +
+      ")" + " LLM_decoding_steps(" +
+      std::to_string(profile_info.llm_decoding_steps) + ")";
   if (decoding_mode == SPECULATIVE_DECODING) {
-    str = str + " SSM_decoding_steps(" + std::to_string(
-      profile_info.ssm_decoding_steps) 
-      + ")";
+    str = str + " SSM_decoding_steps(" +
+          std::to_string(profile_info.ssm_decoding_steps) + ")";
   }
   write_to_output_file("", str);
 
@@ -803,9 +804,10 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
                 << output << std::endl;
     }
   }
-  profiling.llm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() - 
-        profiling.llm_step_start) * 1e-3);
+  profiling.llm_step_times.push_back(
+      (Realm::Clock::current_time_in_microseconds() -
+       profiling.llm_step_start) *
+      1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
   profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
   return request_completed;
@@ -1139,8 +1141,7 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       profiling_requests[guid].start_decoding_time =
           Realm::Clock::current_time_in_microseconds();
     }
-    profiling.ssm_step_start = 
-      Realm::Clock::current_time_in_microseconds();
+    profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
   }
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
@@ -1389,9 +1390,10 @@ bool RequestManager::update_llm_verify_results(
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
-  profiling.llm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() - 
-        profiling.llm_step_start) * 1e-3);
+  profiling.llm_step_times.push_back(
+      (Realm::Clock::current_time_in_microseconds() -
+       profiling.llm_step_start) *
+      1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
 
   // Clear the token tree node pool
@@ -1498,10 +1500,10 @@ bool RequestManager::update_ssm_inference_results(
   // Stop conditions
   if (all_request_last_layer_empty) {
     // Update profiling statistics before returning
-    profiling.ssm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() -
-        profiling.ssm_step_start) * 1e-3
-      );
+    profiling.ssm_step_times.push_back(
+        (Realm::Clock::current_time_in_microseconds() -
+         profiling.ssm_step_start) *
+        1e-3);
     return true;
   }
   return false;
@@ -1959,7 +1961,7 @@ void RequestManager::get_verify_results_greedy(
     request.llm_committed = false;
     request.ssm_committed = false;
 
-    total_nb_generated_tokens += request.committed_tokens.size();
+    total_nb_generated_tokens += request.committed_tokens.size() - 1;
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
       for (auto const &committed_token : request.committed_tokens) {
@@ -2242,7 +2244,8 @@ void RequestManager::terminate_background_server_at_exit() {
 
 void RequestManager::terminate_background_server() {
   if (background_server_status == SERVING) {
-    assert(profiling.llm_step_times.size() == profiling.requests_per_step.size());
+    assert(profiling.llm_step_times.size() ==
+           profiling.requests_per_step.size());
     // Write the last profiling statistics to output file
     std::string str = "[Profiling Statistics]\n llm_step_times_ms(";
     std::string llm_step_times_ms = " ";
@@ -2259,7 +2262,8 @@ void RequestManager::terminate_background_server() {
     req_per_step += ")";
     str += req_per_step;
     if (profiling.ssm_step_times.size() > 0) {
-      assert(profiling.ssm_step_times.size() == profiling.llm_step_times.size());
+      //   assert(profiling.ssm_step_times.size() ==
+      //   profiling.llm_step_times.size());
       str += "\n ssm_step_times_ms(";
       std::string ssm_step_times_ms = " ";
       for (double time : profiling.ssm_step_times) {

From b0caa8f029f665325dd629622b5052397f37e095 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 7 Jun 2024 00:17:24 -0700
Subject: [PATCH 343/667] feat: add queryTmp into attentionMeta

---
 .../ops/inc_multihead_self_attention.h        |  2 +-
 src/ops/inc_multihead_self_attention.cu       | 34 +++++++++++++++----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 779f55127..977645367 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -187,7 +187,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   bool *position_bias;
   float scaling_factor;
   void *weight_ptr, *bias_ptr; // for weight offload
-  void *devQKVProjArray, *keyCache, *valueCache;
+  void *devQKVProjArray, *queryTmp, *keyCache, *valueCache;
   void *qk_prods, *qk_prods_softmax;
   void *attn_heads;
   char *quantized_weight_ptr;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index cf42245f5..23548ef56 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1373,7 +1373,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
-    size_t key_cache_size = 0, value_cache_size = 0, qk_prod_size = 0;
+    size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0, qk_prod_size = 0;
     switch (infer_mode) {
       case INC_DECODING_MODE: {
         key_cache_size = num_q_heads * kProjSize *
@@ -1386,8 +1386,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                        BatchConfig::max_sequence_length() * num_q_heads;
         break;
       }
-      case TREE_SEARCH_MODE:
+      case TREE_SEARCH_MODE: {
+        key_cache_size = num_q_heads * kProjSize *
+                         BatchConfig::max_requests_per_batch() *
+                         (BatchConfig::max_sequence_length() +
+                          BatchConfig::max_spec_tree_token_num());
+        value_cache_size = num_q_heads * vProjSize *
+                           BatchConfig::max_requests_per_batch() *
+                           (BatchConfig::max_sequence_length() +
+                            BatchConfig::max_spec_tree_token_num());
+        qk_prod_size = BatchConfig::max_sequence_length() *
+                       (BatchConfig::max_sequence_length() +
+                        BatchConfig::max_spec_tree_token_num()) *
+                       num_q_heads;
+        break;
+      }
       case TREE_VERIFY_MODE: {
+        query_tmp_size = num_q_heads * qProjSize *
+                         BatchConfig::max_tokens_per_batch();
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
                          BatchConfig::max_requests_per_batch() *
@@ -1411,7 +1427,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                                    kProjSize * num_q_heads)) /
                           2;
     size_t totalSize =
-        (qkv_max_proj_size + key_cache_size + value_cache_size +
+        (qkv_max_proj_size + query_tmp_size + key_cache_size + value_cache_size +
          2 * qk_prod_size + attn_heads_size) *
             size_of_dt +
         complex_size * sizeof(cuFloatComplex); // more components will
@@ -1421,15 +1437,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       size_t totalSharedSize =
           infer_mode == TREE_VERIFY_MODE
               ? totalSize -
-                    (key_cache_size + value_cache_size + qkv_max_proj_size) *
+                    (query_tmp_size + key_cache_size + value_cache_size + qkv_max_proj_size) *
                         size_of_dt
-              : totalSize - (key_cache_size + value_cache_size) * size_of_dt;
+              : totalSize - (query_tmp_size + key_cache_size + value_cache_size) * size_of_dt;
 
       size_t instance_size =
           size_of_dt *
           (infer_mode == TREE_VERIFY_MODE
-               ? key_cache_size + value_cache_size + qkv_max_proj_size
-               : key_cache_size + value_cache_size);
+               ? query_tmp_size + key_cache_size + value_cache_size + qkv_max_proj_size
+               : query_tmp_size + key_cache_size + value_cache_size);
 
       if (quantization_type != DT_NONE) {
         totalSharedSize += quantized_weightSize;
@@ -1452,6 +1468,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     }
 
     // use key value cache in all mode.
+    if (query_tmp_size > 0) {
+      queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size *
+                                                             size_of_dt);
+    }
     keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size *
                                                            size_of_dt);
     valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size *

From cbba29b3b15676552a1fc82fa1b41707ecf087b7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 7 Jun 2024 00:18:38 -0700
Subject: [PATCH 344/667] feat: tree attn support parallelly update qkv cache

---
 src/ops/tree_inc_multihead_self_attention.cu | 66 +++++++++++---------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c3d3e5bb6..c8f956e40 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -213,6 +213,7 @@ void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
 template <typename DT>
 __global__ void update_qkv_cache_kernel(
     DT *devQKVProjArray,
+    DT *qTmp_ptr,
     DT *kCache_ptr,
     DT *vCache_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
@@ -220,9 +221,16 @@ __global__ void update_qkv_cache_kernel(
     int qProjSize,
     int kProjSize,
     int vProjSize,
-    int token_idx,
     int max_seq_len,
-    int hidden_size) {
+    int hidden_size,
+    int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
   int const req_idx = tokenInfos[token_idx].request_index;
   int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
@@ -231,14 +239,12 @@ __global__ void update_qkv_cache_kernel(
   size_t to_idx = req_idx * (hidden_size * max_seq_len) +
                   token_abs_idx * hidden_size;
 
-  CUDA_KERNEL_LOOP(offset, hidden_size) {
-    kCache_ptr[to_idx + offset] = 
-               devQKVProjArray[from_idx + hidden_size + offset];
-    vCache_ptr[to_idx + offset] = 
-               devQKVProjArray[from_idx + hidden_size * 2 + offset];
-    devQKVProjArray[token_idx * hidden_size + offset] = 
-               devQKVProjArray[from_idx + offset];
-  }
+  kCache_ptr[to_idx + offset] = 
+              devQKVProjArray[from_idx + hidden_size + offset];
+  vCache_ptr[to_idx + offset] = 
+              devQKVProjArray[from_idx + hidden_size * 2 + offset];
+  qTmp_ptr[token_idx * hidden_size + offset] = 
+              devQKVProjArray[from_idx + offset];
 }
 
 template <typename DT>
@@ -247,26 +253,24 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
                                  cudaStream_t stream) {
   // update the kv cache, compact the q array
   int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size;
-  // TODO: parallel across queries
-  for (int i = 0; i < num_new_tokens; i++) {
-    update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                              min(CUDA_NUM_THREADS, parallelism),
-                              0,
-                              stream>>>(
-        static_cast<DT *>(m->devQKVProjArray),
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>(m->valueCache),
-        m->token_infos,
-        m->request_infos,
-        m->qProjSize,
-        m->kProjSize,
-        m->vProjSize,
-        i,
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
-        m->hidden_size);
-  }
+  int parallelism = m->hidden_size * num_new_tokens;
+  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<DT *>(m->queryTmp),
+      static_cast<DT *>(m->keyCache),
+      static_cast<DT *>(m->valueCache),
+      m->token_infos,
+      m->request_infos,
+      m->qProjSize,
+      m->kProjSize,
+      m->vProjSize,
+      BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num(),
+      m->hidden_size,
+      num_new_tokens);
 }
 
 template <typename DT>
@@ -305,7 +309,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
              q_start = req->first_token_index_in_request,
              kv_len = q_len + q_start;
 
-    DT* q = static_cast<DT *>(m->devQKVProjArray) + req->first_token_offset_in_batch * hidden_size,
+    DT* q = static_cast<DT *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
       * k = static_cast<DT *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
       * v = static_cast<DT *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
       * o = output_ptr + req->first_token_offset_in_batch * hidden_size;

From 7721caaa881ea311bffac63d38a84e9223fdd74d Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 7 Jun 2024 03:55:27 -0400
Subject: [PATCH 345/667] Tried to support gumbel sampling. Left some todos.

---
 include/flexflow/request_manager.h |  33 ++++++-
 inference/models/llama.cc          |   2 +-
 src/runtime/request_manager.cc     | 141 ++++++++++++++++++++++-------
 3 files changed, 140 insertions(+), 36 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5ae430d5e..46ce05658 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -130,28 +130,48 @@ class TokenTreeNode {
   float log_accumulated_prob;
   int parent_pos;
   bool pruned = false;
+  bool gumbel = false;
+  float gumbel_logit = 0.0f;
 
   TokenTreeNode(BatchConfig::TokenId id,
                 float log_accumulated_prob,
-                int parent_pos)
+                int parent_pos,
+                bool gumbel = false,
+                float gumbel_logit = 0.0f)
       : id(id), log_accumulated_prob(log_accumulated_prob),
-        parent_pos(parent_pos) {}
+        parent_pos(parent_pos), gumbel(gumbel), gumbel_logit(gumbel_logit) {}
 };
 
+bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
+               std::shared_ptr<TokenTreeNode> const &rhs);
+
+bool operator<=(std::shared_ptr<TokenTreeNode> const &lhs,
+                std::shared_ptr<TokenTreeNode> const &rhs);
+
 // A comparator for std::shared_ptr<TokenTreeNode>
+// This is used in to sort the token tree nodes in descending order
 struct CompareSharedTokenTreeNodePtr {
   bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
                   std::shared_ptr<TokenTreeNode> const &rhs) const {
+    if (lhs->gumbel) {
+      assert(rhs->gumbel);
+      return lhs->gumbel_logit < rhs->gumbel_logit;
+    }
     return lhs->log_accumulated_prob < rhs->log_accumulated_prob;
   }
 };
 
 // A comparator for std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
+// This is used to sort the token tree nodes in ascending order
 struct CompareSharedTokenTreeNodePtrRequestGuidPair {
   bool operator()(std::pair<std::shared_ptr<TokenTreeNode>,
                             BatchConfig::RequestGuid> const &lhs,
                   std::pair<std::shared_ptr<TokenTreeNode>,
                             BatchConfig::RequestGuid> const &rhs) const {
+    if (lhs.first->gumbel) {
+      assert(rhs.first->gumbel);
+      return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
+    }
     return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
   }
 };
@@ -225,6 +245,7 @@ class RequestManager {
   void set_max_tree_depth(int max_tree_depth);
   int get_max_tree_width();
   void set_max_tree_width(int max_tree_width);
+  void set_speculative_sampling(bool speculative_sampling);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -298,6 +319,7 @@ class RequestManager {
   BackgroundServerStatus background_server_status;
   DecodingMode decoding_mode;
   PrefillModel prefill_model;
+  bool speculative_sampling = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -354,8 +376,9 @@ class RequestManager {
     long long start_time = 0, start_decoding_time = 0, finish_time = 0;
   };
   struct ProfileInfo {
-    // For SpecInfer: One step is comprised of one ssm speculation phase + a single llm verification phase (forward pass + verification)
-    // For Incr Decoding: One step is one LLM decoding phase
+    // For SpecInfer: One step is comprised of one ssm speculation phase + a
+    // single llm verification phase (forward pass + verification) For Incr
+    // Decoding: One step is one LLM decoding phase
     long long llm_step_start = 0, ssm_step_start = 0;
     // Times for each LLM verification phase (in ms)
     std::vector<double> llm_step_times;
@@ -421,6 +444,8 @@ class RequestManager {
       reject_sampling(std::vector<std::pair<TokenId, float>> &D,
                       std::unordered_map<TokenId, float> &R,
                       int k);
+  void gumbel_conditioned_on_max(float target_max,
+                                 std::vector<std::pair<float, int>> &logits);
 };
 
 }; // namespace FlexFlow
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 0fa846bc0..94f58b0ef 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -262,7 +262,7 @@ void LLAMA::create_llama_model(FFModel &ff,
       dense = ff.scalar_truediv(dense, generation_config.temperature, false);
       Tensor softmax = ff.softmax(dense, -1);
       if (generation_config.spec_sample) {
-        output = ff.arg_top_k(softmax, generation_config.topk, false, true);
+        output = ff.gumbel_top_k(softmax, generation_config.topk, false, true);
       } else {
         output = ff.sampling(softmax, generation_config.topp);
       }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7b842d98e..32a539070 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -33,6 +33,24 @@ using tokenizers::Tokenizer;
 
 LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
+bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
+               std::shared_ptr<TokenTreeNode> const &rhs) {
+  if (lhs->gumbel) {
+    assert(rhs->gumbel);
+    return lhs->gumbel_logit < rhs->gumbel_logit;
+  }
+  return lhs->log_accumulated_prob < rhs->log_accumulated_prob;
+}
+
+bool operator<=(std::shared_ptr<TokenTreeNode> const &lhs,
+                std::shared_ptr<TokenTreeNode> const &rhs) {
+  if (lhs->gumbel) {
+    assert(rhs->gumbel);
+    return lhs->gumbel_logit <= rhs->gumbel_logit;
+  }
+  return lhs->log_accumulated_prob <= rhs->log_accumulated_prob;
+}
+
 void write_to_output_file(std::string const &output_filepath,
                           std::string const &str) {
   std::ostream *os = &std::cout;
@@ -184,6 +202,10 @@ void RequestManager::set_max_tree_width(int max_tree_width) {
   this->max_tree_width = max_tree_width;
 }
 
+void RequestManager::set_speculative_sampling(bool speculative_sampling_) {
+  speculative_sampling = speculative_sampling_;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -1388,7 +1410,11 @@ bool RequestManager::update_llm_verify_results(
   }
 
   // Process the LLM results greedily
-  get_verify_results_greedy(llm_verify_result);
+  if (speculative_sampling) {
+    get_verify_results_sample(llm_verify_result);
+  } else {
+    get_verify_results_greedy(llm_verify_result);
+  }
 
   profiling.llm_step_times.push_back(
       (Realm::Clock::current_time_in_microseconds() -
@@ -1649,6 +1675,18 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   return llm_bitmask;
 }
 /* --------- Bitmask Related Functions --------- */
+void RequestManager::gumbel_conditioned_on_max(
+    float target_max, std::vector<std::pair<float, int>> &logits) {
+  // Assume the logits are sorted in descending order
+  if (logits.size() == 0) {
+    return;
+  }
+  float max_logit = logits[0].first;
+  for (auto &logit_n_idx : logits) {
+    logit_n_idx.first =
+        -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first));
+  }
+}
 
 void RequestManager::renormalize(std::vector<std::pair<TokenId, float>> &D,
                                  std::unordered_map<TokenId, float> &R,
@@ -2323,6 +2361,9 @@ void RequestManager::add_root_to_spec_token_tree(
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();
   auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
+  if (speculative_sampling) {
+    node_ptr->gumbel = true;
+  }
   token_tree_node_pool.push(std::make_pair(node_ptr, guid));
   speculative_token_tree.tree_layers.front().push_back(node_ptr);
   speculative_token_tree.tree_size++;
@@ -2375,19 +2416,32 @@ bool RequestManager::add_tokens_to_spec_token_tree(
         int child_start_idx =
             result_offset +
             parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+        // TODO: rename child_probs to child_logits after change the output of
+        // argmax from prob to logprob
         std::vector<std::pair<float, int>> child_probs(
             BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
         for (int child_pos = 0;
              child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
              child_pos++) {
           int result_idx = child_start_idx + child_pos;
-          child_probs[child_pos] = std::make_pair(
-              log(ssm_inference_result.probs[result_idx]), result_idx);
+          if (!speculative_sampling) {
+            // TODO: the argmax will return log prob instead of prob
+            child_probs[child_pos] = std::make_pair(
+                log(ssm_inference_result.probs[result_idx]), result_idx);
+          } else {
+            // Use gumbel perturbed logits here
+            child_probs[child_pos] = std::make_pair(
+                ssm_inference_result.gumbel_logits[result_idx], result_idx);
+          }
         }
         // Sort in descending order
         std::sort(child_probs.begin(),
                   child_probs.end(),
                   std::greater<std::pair<float, int>>());
+        if (speculative_sampling) {
+          // Condition the gumbel perturbed logits on the maximum
+          gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
+        }
 
         // for (int child_pos = 0;
         //      child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
@@ -2399,8 +2453,16 @@ bool RequestManager::add_tokens_to_spec_token_tree(
           //       parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES +
           //       child_pos;
 
-          float log_prob = child_prob.first;
-          float log_accumulated_prob = log_prob + parent_log_prob;
+          float logit = child_prob.first;
+          // The value used to compare between tokens
+          float accumulated_log_prob = logit + parent_log_prob;
+          float gumbel_logit = 0.0f;
+          float cmp_value;
+          if (speculative_sampling) {
+            cmp_value = gumbel_logit = logit;
+          } else {
+            cmp_value = accumulated_log_prob;
+          }
           int result_idx = child_prob.second;
 
           //   std::cout << "Probability at result index " << result_idx << ":
@@ -2409,39 +2471,55 @@ bool RequestManager::add_tokens_to_spec_token_tree(
           //   std::cout << "Token id: "
           //             << ssm_inference_result.token_ids[result_idx] <<
           //             std::endl;
-          assert(log_prob != -std::numeric_limits<float>::infinity() &&
+          assert(logit != -std::numeric_limits<float>::infinity() &&
                  "Child log probability should not be -inf.");
 
           if (tokens.size() == empty_slots_in_layer and
-              log_accumulated_prob <= (*tokens.begin())->log_accumulated_prob) {
-            // The token tree is full, and the new token has a lower joint
-            // probability than the minimum node in the pool, we don't need to
-            // add the new token and the following tokens belong to the same
-            // parent to the tree, because the tokens are sorted by their
-            // probability
+              cmp_value <= (speculative_sampling
+                                ? (*tokens.begin())->gumbel_logit
+                                : (*tokens.begin())->log_accumulated_prob)) {
+            // The token tree is full, and the new token has a lower compare
+            // value than the minimum node in the pool, we don't need to add the
+            // new token and the following tokens belong to the same parent to
+            // the tree, because the tokens are sorted by their compare value
             break;
           } else if (token_pool_full and
-                     log_accumulated_prob <= token_tree_node_pool.top()
-                                                 .first->log_accumulated_prob) {
+                     cmp_value <=
+                         (speculative_sampling
+                              ? token_tree_node_pool.top().first->gumbel_logit
+                              : token_tree_node_pool.top()
+                                    .first->log_accumulated_prob)) {
             // The token tree is not full, but the token pool is full, and the
-            // new token has a lower joint probability than the minimum node
+            // new token has a lower compare value than the minimum node
             // in the pool, we don't need to add the new token and the
             // following tokens belong to the same parent to the tree, because
-            // the tokens are sorted by their probability
+            // the tokens are sorted by their compare value
             break;
           } else {
-            std::shared_ptr<TokenTreeNode> node_ptr =
-                std::make_shared<TokenTreeNode>(
-                    ssm_inference_result.token_ids[result_idx],
-                    log_accumulated_prob,
-                    parent_pos);
+            std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
+            if (speculative_sampling) {
+              node_ptr = std::make_shared<TokenTreeNode>(
+                  ssm_inference_result.token_ids[result_idx],
+                  accumulated_log_prob,
+                  parent_pos,
+                  true,
+                  gumbel_logit);
+            } else {
+              node_ptr = std::make_shared<TokenTreeNode>(
+                  ssm_inference_result.token_ids[result_idx],
+                  accumulated_log_prob,
+                  parent_pos);
+            }
+            // if (tokens.size() == empty_slots_in_layer and
+            //     cmp_value > (speculative_sampling
+            //                      ? (*tokens.begin())->gumbel_logit
+            //                      : (*tokens.begin())->log_accumulated_prob))
+            //                      {
             if (tokens.size() == empty_slots_in_layer and
-                log_accumulated_prob >
-                    (*tokens.begin())->log_accumulated_prob) {
-              // The token tree is full, and the new token has a higher joint
-              // probability than the minimum node in the pool, we need to
-              // remove the minimum node from the pool and add the new token
-              // to the tree
+                *tokens.begin() < node_ptr) {
+              // The token tree is full, and the new token has a higher compare
+              // value than the minimum node in the pool, we need to remove the
+              // minimum node from the pool and add the new token to the tree
               tokens.erase(tokens.begin());
             }
             tokens.insert(node_ptr);
@@ -2452,15 +2530,16 @@ bool RequestManager::add_tokens_to_spec_token_tree(
     }
 
     // Now add all tokens in the set to the token tree, in descending order of
-    // their joint probability
+    // their compare value
     spec_token_tree.add_layer();
     for (auto token_it = tokens.crbegin(); token_it != tokens.crend();
          token_it++) {
       token_pool_full =
           token_tree_node_pool.size() == get_max_tokens_per_batch();
-      if (token_pool_full and
-          token_tree_node_pool.top().first->log_accumulated_prob >=
-              (*token_it)->log_accumulated_prob) {
+      if (token_pool_full and (*token_it) <= token_tree_node_pool.top().first) {
+        //   if (token_pool_full and
+        //       token_tree_node_pool.top().first->log_accumulated_prob >=
+        //           (*token_it)->log_accumulated_prob) {
         break;
       } else if (token_pool_full) {
         token_tree_node_pool.top().first->pruned = true;

From 84d1ba80fe3f920baae585cc49ca55973896c847 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 7 Jun 2024 07:50:10 -0700
Subject: [PATCH 346/667] feat: support full precision attention

---
 .../ops/inc_multihead_self_attention.h        |  1 +
 src/ops/inc_multihead_self_attention.cu       |  6 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 90 +++++++++++--------
 3 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 977645367..6dd43e333 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -188,6 +188,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   float scaling_factor;
   void *weight_ptr, *bias_ptr; // for weight offload
   void *devQKVProjArray, *queryTmp, *keyCache, *valueCache;
+  half *outputTmp;
   void *qk_prods, *qk_prods_softmax;
   void *attn_heads;
   char *quantized_weight_ptr;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 23548ef56..ab20f2963 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1423,13 +1423,14 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         assert(false && "Unkown inference mode");
     }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize;
+    size_t output_tmp_size = max_tokens_per_batch * num_q_heads * vProjSize;
     size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads +
                                                    kProjSize * num_q_heads)) /
                           2;
     size_t totalSize =
         (qkv_max_proj_size + query_tmp_size + key_cache_size + value_cache_size +
-         2 * qk_prod_size + attn_heads_size) *
-            size_of_dt +
+         2 * qk_prod_size + attn_heads_size) * size_of_dt +
+        output_tmp_size * data_type_size(DT_HALF) +
         complex_size * sizeof(cuFloatComplex); // more components will
                                                // be added here later
     if (offload) {
@@ -1476,6 +1477,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                                            size_of_dt);
     valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size *
                                                              size_of_dt);
+    outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
 
     token_infos =
         static_cast<BatchConfig::PerTokenInfo *>(handler.batch_config_metadata);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c8f956e40..9d0ac3541 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -84,10 +84,9 @@ using flashinfer::PosEncodingMode;
 using flashinfer::MaskMode;
 using flashinfer::SinglePrefillWithKVCacheDispatched;
 
-template <typename DT>
 __global__ void commit_tokens_kernel(
-    DT *kCache_ptr,
-    DT *vCache_ptr,
+    half *kCache_ptr,
+    half *vCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int qProjSize,
     int kProjSize,
@@ -116,7 +115,6 @@ __global__ void commit_tokens_kernel(
   }
 }
 
-template <typename DT>
 void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
@@ -128,8 +126,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                            min(CUDA_NUM_THREADS, parallelism),
                            0,
                            stream>>>(
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>(m->valueCache),
+        static_cast<half *>(m->keyCache),
+        static_cast<half *>(m->valueCache),
         m->committed_token_infos,
         m->qProjSize,
         m->kProjSize,
@@ -213,9 +211,9 @@ void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
 template <typename DT>
 __global__ void update_qkv_cache_kernel(
     DT *devQKVProjArray,
-    DT *qTmp_ptr,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
+    half *qTmp_ptr,
+    half *kCache_ptr,
+    half *vCache_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
     BatchConfig::PerRequestInfo *request_infos,
     int qProjSize,
@@ -240,11 +238,11 @@ __global__ void update_qkv_cache_kernel(
                   token_abs_idx * hidden_size;
 
   kCache_ptr[to_idx + offset] = 
-              devQKVProjArray[from_idx + hidden_size + offset];
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
   vCache_ptr[to_idx + offset] = 
-              devQKVProjArray[from_idx + hidden_size * 2 + offset];
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
   qTmp_ptr[token_idx * hidden_size + offset] = 
-              devQKVProjArray[from_idx + offset];
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
 }
 
 template <typename DT>
@@ -259,9 +257,9 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
                             0,
                             stream>>>(
       static_cast<DT *>(m->devQKVProjArray),
-      static_cast<DT *>(m->queryTmp),
-      static_cast<DT *>(m->keyCache),
-      static_cast<DT *>(m->valueCache),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->keyCache),
+      static_cast<half *>(m->valueCache),
       m->token_infos,
       m->request_infos,
       m->qProjSize,
@@ -273,6 +271,15 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
+template <typename DT>
+__global__ void produce_output_kernel(half const *input_ptr,
+                           DT *output_ptr,
+                           int parallelism) {
+  CUDA_KERNEL_LOOP(idx, parallelism) {
+    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
+  }
+}
+
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           BatchConfig const *bc,
@@ -309,10 +316,10 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
              q_start = req->first_token_index_in_request,
              kv_len = q_len + q_start;
 
-    DT* q = static_cast<DT *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
-      * k = static_cast<DT *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
-      * v = static_cast<DT *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
-      * o = output_ptr + req->first_token_offset_in_batch * hidden_size;
+    half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
+        * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
+        * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
+        * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
     float* tmp = m->scratch_space;
     float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
 
@@ -323,14 +330,14 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     if (bc->prompt_phase) {
       flashinfer::SinglePrefillWithKVCacheDispatched<
         GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-        false, MaskMode::kCausal, DT, DT>(
+        false, MaskMode::kCausal, half, half>(
           q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
           /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     } else {
       flashinfer::SinglePrefillWithKVCacheDispatched<
           GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, DT, DT>(
+          false, MaskMode::kCustom, half, half>(
             q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
             num_kv_heads, q_len, kv_len, sm_scale,
             /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
@@ -338,6 +345,13 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     })});
   }
 
+  int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+  produce_output_kernel<<<GET_BLOCKS(parallelism),
+                          min(CUDA_NUM_THREADS, parallelism),
+                          0,
+                          stream>>>(
+      m->outputTmp, output_ptr, parallelism);
+
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
   // float elapsed = 0;
@@ -380,7 +394,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit <<
   // "\n";
 
-  commit_tokens<DT>(m, bc, stream);
+  commit_tokens(m, bc, stream);
 
   // After commit we update m->num_active_tokens to be the number of active
   // tokens for the current batch
@@ -491,22 +505,22 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
         output.get_half_ptr(),
         bias_ptr,
         stream);
-  // } else if (input.data_type == DT_FLOAT) {
-  //   if (m->offload) {
-  //     pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
-  //   }
-  //   float const *bias_ptr =
-  //       use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-  //   Kernels::TreeIncMultiHeadAttention::inference_kernel<float>(
-  //       m,
-  //       bc,
-  //       shard_id,
-  //       input.get_float_ptr(),
-  //       m->offload ? static_cast<float *>(m->weight_ptr)
-  //                  : weight.get_float_ptr(),
-  //       output.get_float_ptr(),
-  //       bias_ptr,
-  //       stream);
+  } else if (input.data_type == DT_FLOAT) {
+    if (m->offload) {
+      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+    }
+    float const *bias_ptr =
+        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
+    Kernels::TreeIncMultiHeadAttention::inference_kernel<float>(
+        m,
+        bc,
+        shard_id,
+        input.get_float_ptr(),
+        m->offload ? static_cast<float *>(m->weight_ptr)
+                   : weight.get_float_ptr(),
+        output.get_float_ptr(),
+        bias_ptr,
+        stream);
   } else {
     assert(false && "Unspported data type");
   }

From 01c4c0c171c1f1d11a5dfec97a819eec697e45f8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 8 Jun 2024 09:14:24 -0700
Subject: [PATCH 347/667] feat: key and value cache allocate together

---
 src/ops/inc_multihead_self_attention.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index ab20f2963..e51eb7b96 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1473,10 +1473,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size *
                                                              size_of_dt);
     }
-    keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size *
+    keyCache = gpu_mem_allocator.allocate_instance_untyped((key_cache_size + value_cache_size) *
                                                            size_of_dt);
-    valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size *
-                                                             size_of_dt);
+    valueCache = static_cast<void *>(static_cast<char *>(keyCache) +
+                                     key_cache_size * size_of_dt);
     outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
 
     token_infos =

From a83b0bab52a266bad4aa5a8aee0568636274d874 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 00:16:41 -0700
Subject: [PATCH 348/667] feat: add data structure for flashinfer batch
 inference

---
 .../ops/tree_inc_multihead_self_attention.h    | 11 ++++++++---
 src/ops/tree_inc_multihead_self_attention.cu   | 18 ++++++++++++++----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 1a5ff7a9a..3a0336d1b 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -145,11 +145,16 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 
 public:
   int num_active_tokens;
-  Realm::RegionInstance flashinfer_reserve_inst;
-  float *custom_mask;
-  float *scratch_space;
   BatchConfig::CommittedTokensInfo *committed_token_infos;
   BatchConfig::BitMask *causalMask;
+  // For flashinfer attention
+  Realm::RegionInstance flashinfer_reserve_inst;
+  int32_t *q_indptr;
+  int32_t *kv_indptr;
+  int32_t *kv_indices;
+  int32_t *kv_last_page_len;
+  float *custom_mask;
+  void *workspace;
 };
 
 }; // namespace FlexFlow
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 9d0ac3541..82d7911cb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -320,7 +320,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
         * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
         * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
-    float* tmp = m->scratch_space;
+    float* tmp = static_cast<float *>(m->workspace);
     float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
 
     DISPATCH_GROUP_SIZE(
@@ -575,15 +575,25 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
 
   {
+    size_t batch_size = BatchConfig::max_requests_per_batch();
+    size_t indices_size = ((batch_size + 1) * 2 + batch_size * 2);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
                                 BatchConfig::max_sequence_length());
-    size_t scratch_space_size = 8 * 1024 * 1024; // 32 MB float
+    size_t workspace_size = 32 * 1024 * 1024; // 32MB
+    
     gpu_mem_allocator.create_legion_instance(flashinfer_reserve_inst, 
-                sizeof(float) * (custom_mask_size + scratch_space_size));
+                sizeof(int32_t) * indices_size +
+                sizeof(float) * custom_mask_size + workspace_size);
+
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
-    scratch_space = gpu_mem_allocator.allocate_instance<float>(scratch_space_size);
+    workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
+    // Why we should allocate these after workspace?? (else we will get index out of bound)
+    q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
+    kv_indptr = q_indptr + batch_size + 1;
+    kv_indices = kv_indptr + batch_size + 1;
+    kv_last_page_len = kv_indices + batch_size;
   }
 
   // allocate memory for the seqArray and reserve space

From 8fb8bfb4b6b642768a402c09dac9975512da63ce Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 00:50:33 -0700
Subject: [PATCH 349/667] feat: add batch template instance

---
 .../tree_inc_multihead_self_attention_impl.cu | 341 +++++++++++++++++-
 1 file changed, 334 insertions(+), 7 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
index 07ae82453..5c1b0777e 100644
--- a/src/ops/tree_inc_multihead_self_attention_impl.cu
+++ b/src/ops/tree_inc_multihead_self_attention_impl.cu
@@ -172,11 +172,338 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
     float sm_scale, float rope_scale,
     float rope_theta, cudaStream_t stream);
 
-  // template cudaError_t SinglePrefillWithKVCacheDispatched<
-  //     1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  //     false, MaskMode::kCustom, float, float>(
-  //       float* q, float* k, float* v, float* custom_mask, float* o,
-  //       float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-  //       float sm_scale, float rope_scale,
-  //       float rope_theta, cudaStream_t stream);
+
+constexpr uint32_t kPagesize = 512 + 64;
+// num_frags_x[] = {1, 2};
+// group_size[] = {1, 4, 8};
+// head_dim[] = {64, 128, 256};
+
+/********** batch append instantiations for half precision **********/
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+
+/********** batch prefill instantiations for half precision **********/
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  1, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  4, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
+  8, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  1, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  4, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
+  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
+  8, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
+  half, half, int32_t>(
+    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
+    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
+    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
+    float rope_scale, float rope_theta, cudaStream_t stream);
 } // namespace flashinfer

From c891fd1fdb06b5159e0f310f6b1fd8da9ef8f16b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 06:59:22 -0700
Subject: [PATCH 350/667] feat: add some aux function for batch inference

---
 src/ops/tree_inc_multihead_self_attention.cu | 117 ++++++++++++++-----
 1 file changed, 91 insertions(+), 26 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 82d7911cb..9eeab312d 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -25,7 +25,7 @@
 #include <sstream>
 #include <stdexcept>
 
-#define DISPATCH_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
+#define DISPATCH_GROUPSIZE(group_size, GROUP_SIZE, ...)      \
   if (group_size == 1) {                                     \
     constexpr size_t GROUP_SIZE = 1;                         \
     __VA_ARGS__                                              \
@@ -41,7 +41,7 @@
     throw std::invalid_argument(err_msg.str());              \
   }
 
-#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...)     \
+#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)      \
   switch (head_dim) {                                  \
     case 64: {                                         \
       constexpr size_t HEAD_DIM = 64;                  \
@@ -65,6 +65,18 @@
     }                                                  \
   }
 
+// kPagesize also defined in tree_inc_multihead_self_attention_impl.cu
+// for template instantiation
+constexpr uint32_t kPagesize = 512 + 64;
+#define DISPATCH_PAGESIZE(page_size, PAGE_SIZE, ...)  \
+  if (page_size == kPagesize) {                        \
+    constexpr size_t PAGE_SIZE = kPagesize;            \
+    __VA_ARGS__                                        \
+  } else {                                             \
+    std::ostringstream err_msg;                        \
+    err_msg << "Unsupported page_size: " << page_size; \
+    throw std::invalid_argument(err_msg.str());        \
+  }
 
 namespace FlexFlow {
 
@@ -79,10 +91,13 @@ using namespace Kernels::IncMultiHeadAttention;
 namespace Kernels {
 namespace TreeIncMultiHeadAttention {
 
-using flashinfer::QKVLayout;
-using flashinfer::PosEncodingMode;
 using flashinfer::MaskMode;
-using flashinfer::SinglePrefillWithKVCacheDispatched;
+using flashinfer::PageStorage;
+using flashinfer::PosEncodingMode;
+using flashinfer::QKVLayout;
+using flashinfer::paged_kv_t;
+using flashinfer::BatchPrefillHandler;
+using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
 
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
@@ -213,12 +228,8 @@ __global__ void update_qkv_cache_kernel(
     DT *devQKVProjArray,
     half *qTmp_ptr,
     half *kCache_ptr,
-    half *vCache_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
     BatchConfig::PerRequestInfo *request_infos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
     int max_seq_len,
     int hidden_size,
     int num_new_tokens) {
@@ -234,12 +245,12 @@ __global__ void update_qkv_cache_kernel(
 
   size_t from_idx =
         token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_idx = req_idx * (hidden_size * max_seq_len) +
-                  token_abs_idx * hidden_size;
+  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size * 2;
 
+  // key and value cache should be stored interleaved
   kCache_ptr[to_idx + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  vCache_ptr[to_idx + offset] = 
+  kCache_ptr[to_idx + hidden_size + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
   qTmp_ptr[token_idx * hidden_size + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + offset]);
@@ -259,12 +270,65 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       static_cast<DT *>(m->devQKVProjArray),
       static_cast<half *>(m->queryTmp),
       static_cast<half *>(m->keyCache),
+      m->token_infos,
+      m->request_infos,
+      BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num(),
+      m->hidden_size,
+      num_new_tokens);
+}
+
+template <typename DT>
+__global__ void orig_update_qkv_cache_kernel(
+    DT *devQKVProjArray,
+    half *qTmp_ptr,
+    half *kCache_ptr,
+    half *vCache_ptr,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    BatchConfig::PerRequestInfo *request_infos,
+    int max_seq_len,
+    int hidden_size,
+    int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx =
+        token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
+
+  // key and value cache should be stored interleaved
+  kCache_ptr[to_idx + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+  vCache_ptr[to_idx + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+  qTmp_ptr[token_idx * hidden_size + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
+}
+
+template <typename DT>
+void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
+                                 BatchConfig const *bc,
+                                 cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->keyCache),
       static_cast<half *>(m->valueCache),
       m->token_infos,
       m->request_infos,
-      m->qProjSize,
-      m->kProjSize,
-      m->vProjSize,
       BatchConfig::max_sequence_length() +
           BatchConfig::max_spec_tree_token_num(),
       m->hidden_size,
@@ -304,8 +368,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
   uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
                             BatchConfig::max_sequence_length();
-  
-  // flashinfer parameters
+
 
   for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
     if (!bc->request_available[req_idx]) {
@@ -323,9 +386,9 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     float* tmp = static_cast<float *>(m->workspace);
     float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
 
-    DISPATCH_GROUP_SIZE(
+    DISPATCH_GROUPSIZE(
       group_size, GROUP_SIZE,
-        {DISPATCH_HEAD_DIM(
+        {DISPATCH_HEADDIM(
           head_dim, HEAD_DIM, {
     if (bc->prompt_phase) {
       flashinfer::SinglePrefillWithKVCacheDispatched<
@@ -345,12 +408,14 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     })});
   }
 
-  int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
-  produce_output_kernel<<<GET_BLOCKS(parallelism),
-                          min(CUDA_NUM_THREADS, parallelism),
-                          0,
-                          stream>>>(
-      m->outputTmp, output_ptr, parallelism);
+  {
+    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+    produce_output_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(
+        m->outputTmp, output_ptr, parallelism);
+  }
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
@@ -422,7 +487,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   }
 
   // Update key-val cache, compact q array
-  update_qkv_cache<DT>(m, bc, stream);
+  orig_update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
   tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);

From 255ad0c5d06d4043d7f5c6b6de4dc059ecddbc00 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 07:40:43 -0700
Subject: [PATCH 351/667] feat: updating function for batch inference

---
 src/ops/tree_inc_multihead_self_attention.cu | 66 ++++++++++++++++++--
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 9eeab312d..d839c181a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -172,13 +172,18 @@ __global__ void update_custom_mask_kernel(
   
   // request id in batch config
   int requext_idx_in_batch = -1;
-  int cnt_1 = 0;
+  int cnt_1 = 0, mask_offset = 0, mask_lens = 0;
   while (cnt_1 < request_idx + 1) {
     requext_idx_in_batch++;
     if (request_available[requext_idx_in_batch]) {
       cnt_1++;
+      mask_offset = mask_lens;
+      int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch,
+          k_len = q_len + request_infos[requext_idx_in_batch].first_token_index_in_request;
+      mask_lens += q_len * k_len;
     }
   }
+  __syncthreads();
 
   int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch;
   int const q_start =
@@ -188,8 +193,7 @@ __global__ void update_custom_mask_kernel(
   }
   assert(q_start + q_length <= max_kv_length);
 
-  float *mask = custom_mask + request_idx * max_q_length * max_kv_length +
-                q_idx * (q_start + q_length);
+  float *mask = custom_mask + mask_offset + q_idx * (q_start + q_length);
   // update custom mask
   for (int i = 0; i < q_start; i++) {
     mask[i] = 0.0f;
@@ -335,6 +339,41 @@ void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
+__global__ void prepare_inference_params_kernel(int const num_requests,
+                          BatchConfig::PerRequestInfo *request_infos,
+                          bool *request_available,
+                          int32_t *q_indptr,
+                          int32_t *kv_indptr,
+                          int32_t *kv_indices,
+                          int32_t *kv_last_page_len) {
+  int const request_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (request_idx >= num_requests) {
+    return;
+  }
+
+  // request id in batch config
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0, q_lens = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+      q_lens += request_infos[requext_idx_in_batch].num_tokens_in_batch;
+    }
+  }
+
+  if (request_idx == 0) {
+    q_indptr[0] = 0;
+    kv_indptr[0] = 0;
+  }
+  __syncthreads();
+  q_indptr[request_idx + 1] = q_lens;
+  kv_indptr[request_idx + 1] = request_idx + 1;
+  kv_indices[request_idx] = requext_idx_in_batch;
+  kv_last_page_len[request_idx] = request_infos[requext_idx_in_batch].num_tokens_in_batch +
+                                  request_infos[requext_idx_in_batch].first_token_index_in_request;
+}
+
 template <typename DT>
 __global__ void produce_output_kernel(half const *input_ptr,
                            DT *output_ptr,
@@ -359,6 +398,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const group_size = num_q_heads / num_kv_heads;
   uint32_t const head_dim = m->qProjSize;
+  uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
   // for finding q, k, v, custom_mask pointers
@@ -369,7 +409,22 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
                             BatchConfig::max_sequence_length();
 
+  {
+    int parallelism = batch_size;
+    prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+        batch_size,
+        m->request_infos,
+        m->request_available,
+        m->q_indptr,
+        m->kv_indptr,
+        m->kv_indices,
+        m->kv_last_page_len);
+  }
 
+  int mask_lens = 0, mask_offset = 0;
   for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
     if (!bc->request_available[req_idx]) {
       continue;
@@ -379,12 +434,15 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
              q_start = req->first_token_index_in_request,
              kv_len = q_len + q_start;
 
+    mask_offset = mask_lens;
+    mask_lens += q_len * kv_len;
+
     half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
         * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
         * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
         * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
     float* tmp = static_cast<float *>(m->workspace);
-    float* custom_mask = m->custom_mask + req_idx * max_q_length * max_kv_length;
+    float* custom_mask = m->custom_mask + mask_offset;
 
     DISPATCH_GROUPSIZE(
       group_size, GROUP_SIZE,

From 17f61a30f44b4f9d77e6564b0d40b528b3068a37 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 07:46:54 -0700
Subject: [PATCH 352/667] feat: add batch inference

---
 src/ops/tree_inc_multihead_self_attention.cu | 130 ++++++++++++-------
 1 file changed, 86 insertions(+), 44 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d839c181a..3a4bf6991 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -402,12 +402,12 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
   // for finding q, k, v, custom_mask pointers
-  uint32_t const hidden_size = m->hidden_size;
-  uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
-                          BatchConfig::max_spec_tree_token_num();
-  uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
-  uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
-                            BatchConfig::max_sequence_length();
+  // uint32_t const hidden_size = m->hidden_size;
+  // uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
+  //                         BatchConfig::max_spec_tree_token_num();
+  // uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
+  // uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+  //                           BatchConfig::max_sequence_length();
 
   {
     int parallelism = batch_size;
@@ -424,47 +424,89 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         m->kv_last_page_len);
   }
 
-  int mask_lens = 0, mask_offset = 0;
-  for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
-    if (!bc->request_available[req_idx]) {
-      continue;
-    }
-    BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
-    uint32_t q_len = req->num_tokens_in_batch,
-             q_start = req->first_token_index_in_request,
-             kv_len = q_len + q_start;
-
-    mask_offset = mask_lens;
-    mask_lens += q_len * kv_len;
-
-    half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
-        * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
-        * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
-        * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
-    float* tmp = static_cast<float *>(m->workspace);
-    float* custom_mask = m->custom_mask + mask_offset;
-
-    DISPATCH_GROUPSIZE(
-      group_size, GROUP_SIZE,
-        {DISPATCH_HEADDIM(
-          head_dim, HEAD_DIM, {
+  half* q = static_cast<half *>(m->queryTmp),
+      * kv = static_cast<half *>(m->keyCache),
+      * o = static_cast<half *>(m->outputTmp);
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+    num_kv_heads, kPagesize, head_dim, batch_size, kv,
+    m->kv_indices, m->kv_indptr, m->kv_last_page_len);
+  BatchPrefillHandler handler;
+  size_t workspace_size = 32 * 1024 * 1024;
+  handler.BeginForward(
+      m->workspace, workspace_size, m->q_indptr, batch_size,
+      num_q_heads, num_kv_heads, head_dim);
+
+  DISPATCH_GROUPSIZE(
+    group_size, GROUP_SIZE, {
+      DISPATCH_HEADDIM(
+        head_dim, HEAD_DIM, {
+          DISPATCH_PAGESIZE(
+            kPagesize, PAGE_SIZE, {
+    cudaError_t result;
     if (bc->prompt_phase) {
-      flashinfer::SinglePrefillWithKVCacheDispatched<
-        GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-        false, MaskMode::kCausal, half, half>(
-          q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
-          /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
-          /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+      result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+        PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
+        HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCausal,
+        half, half, int32_t>(
+          &handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
+          /*custom_mask=*/nullptr, /*qk_indptr=*/nullptr, o, /*lse=*/nullptr,
+          sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     } else {
-      flashinfer::SinglePrefillWithKVCacheDispatched<
-          GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half>(
-            q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
-            num_kv_heads, q_len, kv_len, sm_scale,
-            /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+      result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+        PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
+        HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCustom,
+        half, half, int32_t>(
+          &handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
+          m->custom_mask, /*qk_indptr=*/nullptr, o, /*lse=*/nullptr,
+          sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     }
-    })});
-  }
+    if (result != cudaSuccess) {
+      throw std::runtime_error("Failed to run BatchPrefillWithPagedKVCacheWrapperDispatched"
+        + std::string(cudaGetErrorString(result)));
+    }
+  })})});
+
+  // int mask_lens = 0, mask_offset = 0;
+  // for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
+  //   if (!bc->request_available[req_idx]) {
+  //     continue;
+  //   }
+  //   BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
+  //   uint32_t q_len = req->num_tokens_in_batch,
+  //            q_start = req->first_token_index_in_request,
+  //            kv_len = q_len + q_start;
+
+  //   mask_offset = mask_lens;
+  //   mask_lens += q_len * kv_len;
+
+  //   half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
+  //       * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
+  //       * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
+  //       * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
+  //   float* tmp = static_cast<float *>(m->workspace);
+  //   float* custom_mask = m->custom_mask + mask_offset;
+
+  //   DISPATCH_GROUPSIZE(
+  //     group_size, GROUP_SIZE,
+  //       {DISPATCH_HEADDIM(
+  //         head_dim, HEAD_DIM, {
+  //   if (bc->prompt_phase) {
+  //     flashinfer::SinglePrefillWithKVCacheDispatched<
+  //       GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
+  //       false, MaskMode::kCausal, half, half>(
+  //         q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
+  //         /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
+  //         /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+  //   } else {
+  //     flashinfer::SinglePrefillWithKVCacheDispatched<
+  //         GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
+  //         false, MaskMode::kCustom, half, half>(
+  //           q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
+  //           num_kv_heads, q_len, kv_len, sm_scale,
+  //           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+  //   }
+  //   })});
+  // }
 
   {
     int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();

From 1e3febd56757efa8d893897e40778c7589d096db Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 12:10:43 -0700
Subject: [PATCH 353/667] chore:minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 132 +++++++++----------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 3a4bf6991..d3e459c0b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -282,62 +282,62 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
-template <typename DT>
-__global__ void orig_update_qkv_cache_kernel(
-    DT *devQKVProjArray,
-    half *qTmp_ptr,
-    half *kCache_ptr,
-    half *vCache_ptr,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    BatchConfig::PerRequestInfo *request_infos,
-    int max_seq_len,
-    int hidden_size,
-    int num_new_tokens) {
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  size_t from_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_idx + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  vCache_ptr[to_idx + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
-}
-
-template <typename DT>
-void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                 BatchConfig const *bc,
-                                 cudaStream_t stream) {
-  // update the kv cache, compact the q array
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<half *>(m->queryTmp),
-      static_cast<half *>(m->keyCache),
-      static_cast<half *>(m->valueCache),
-      m->token_infos,
-      m->request_infos,
-      BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num(),
-      m->hidden_size,
-      num_new_tokens);
-}
+// template <typename DT>
+// __global__ void orig_update_qkv_cache_kernel(
+//     DT *devQKVProjArray,
+//     half *qTmp_ptr,
+//     half *kCache_ptr,
+//     half *vCache_ptr,
+//     BatchConfig::PerTokenInfo const *tokenInfos,
+//     BatchConfig::PerRequestInfo *request_infos,
+//     int max_seq_len,
+//     int hidden_size,
+//     int num_new_tokens) {
+//   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+//   int const token_idx = thread_idx / hidden_size;
+//   int const offset = thread_idx % hidden_size;
+//   if (token_idx >= num_new_tokens) {
+//     return;
+//   }
+
+//   int const req_idx = tokenInfos[token_idx].request_index;
+//   int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+//   size_t from_idx =
+//         token_idx * QKV_WEIGHT_NUM * hidden_size;
+//   size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
+
+//   // key and value cache should be stored interleaved
+//   kCache_ptr[to_idx + offset] = 
+//       static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+//   vCache_ptr[to_idx + offset] = 
+//       static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+//   qTmp_ptr[token_idx * hidden_size + offset] = 
+//       static_cast<half>(devQKVProjArray[from_idx + offset]);
+// }
+
+// template <typename DT>
+// void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
+//                                  BatchConfig const *bc,
+//                                  cudaStream_t stream) {
+//   // update the kv cache, compact the q array
+//   int num_new_tokens = bc->num_active_tokens();
+//   int parallelism = m->hidden_size * num_new_tokens;
+//   orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+//                             min(CUDA_NUM_THREADS, parallelism),
+//                             0,
+//                             stream>>>(
+//       static_cast<DT *>(m->devQKVProjArray),
+//       static_cast<half *>(m->queryTmp),
+//       static_cast<half *>(m->keyCache),
+//       static_cast<half *>(m->valueCache),
+//       m->token_infos,
+//       m->request_infos,
+//       BatchConfig::max_sequence_length() +
+//           BatchConfig::max_spec_tree_token_num(),
+//       m->hidden_size,
+//       num_new_tokens);
+// }
 
 __global__ void prepare_inference_params_kernel(int const num_requests,
                           BatchConfig::PerRequestInfo *request_infos,
@@ -401,14 +401,6 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
-  // for finding q, k, v, custom_mask pointers
-  // uint32_t const hidden_size = m->hidden_size;
-  // uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
-  //                         BatchConfig::max_spec_tree_token_num();
-  // uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
-  // uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
-  //                           BatchConfig::max_sequence_length();
-
   {
     int parallelism = batch_size;
     prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
@@ -430,8 +422,8 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
     num_kv_heads, kPagesize, head_dim, batch_size, kv,
     m->kv_indices, m->kv_indptr, m->kv_last_page_len);
-  BatchPrefillHandler handler;
   size_t workspace_size = 32 * 1024 * 1024;
+  BatchPrefillHandler handler(workspace_size);
   handler.BeginForward(
       m->workspace, workspace_size, m->q_indptr, batch_size,
       num_q_heads, num_kv_heads, head_dim);
@@ -466,6 +458,14 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     }
   })})});
 
+  // for finding q, k, v, custom_mask pointers
+  // uint32_t const hidden_size = m->hidden_size;
+  // uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
+  //                         BatchConfig::max_spec_tree_token_num();
+  // uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
+  // uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+  //                           BatchConfig::max_sequence_length();
+
   // int mask_lens = 0, mask_offset = 0;
   // for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
   //   if (!bc->request_available[req_idx]) {
@@ -587,7 +587,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   }
 
   // Update key-val cache, compact q array
-  orig_update_qkv_cache<DT>(m, bc, stream);
+  update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
   tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);

From cade300534883adebd559b73d35159e65d0352e7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 12:58:33 -0700
Subject: [PATCH 354/667] feat: split orig_attn

---
 src/ops/tree_inc_multihead_self_attention.cu | 238 ++++++++++---------
 1 file changed, 132 insertions(+), 106 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d3e459c0b..21f32e481 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -282,62 +282,62 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
-// template <typename DT>
-// __global__ void orig_update_qkv_cache_kernel(
-//     DT *devQKVProjArray,
-//     half *qTmp_ptr,
-//     half *kCache_ptr,
-//     half *vCache_ptr,
-//     BatchConfig::PerTokenInfo const *tokenInfos,
-//     BatchConfig::PerRequestInfo *request_infos,
-//     int max_seq_len,
-//     int hidden_size,
-//     int num_new_tokens) {
-//   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-//   int const token_idx = thread_idx / hidden_size;
-//   int const offset = thread_idx % hidden_size;
-//   if (token_idx >= num_new_tokens) {
-//     return;
-//   }
-
-//   int const req_idx = tokenInfos[token_idx].request_index;
-//   int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-//   size_t from_idx =
-//         token_idx * QKV_WEIGHT_NUM * hidden_size;
-//   size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
-
-//   // key and value cache should be stored interleaved
-//   kCache_ptr[to_idx + offset] = 
-//       static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-//   vCache_ptr[to_idx + offset] = 
-//       static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-//   qTmp_ptr[token_idx * hidden_size + offset] = 
-//       static_cast<half>(devQKVProjArray[from_idx + offset]);
-// }
-
-// template <typename DT>
-// void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
-//                                  BatchConfig const *bc,
-//                                  cudaStream_t stream) {
-//   // update the kv cache, compact the q array
-//   int num_new_tokens = bc->num_active_tokens();
-//   int parallelism = m->hidden_size * num_new_tokens;
-//   orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-//                             min(CUDA_NUM_THREADS, parallelism),
-//                             0,
-//                             stream>>>(
-//       static_cast<DT *>(m->devQKVProjArray),
-//       static_cast<half *>(m->queryTmp),
-//       static_cast<half *>(m->keyCache),
-//       static_cast<half *>(m->valueCache),
-//       m->token_infos,
-//       m->request_infos,
-//       BatchConfig::max_sequence_length() +
-//           BatchConfig::max_spec_tree_token_num(),
-//       m->hidden_size,
-//       num_new_tokens);
-// }
+template <typename DT>
+__global__ void orig_update_qkv_cache_kernel(
+    DT *devQKVProjArray,
+    half *qTmp_ptr,
+    half *kCache_ptr,
+    half *vCache_ptr,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    BatchConfig::PerRequestInfo *request_infos,
+    int max_seq_len,
+    int hidden_size,
+    int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx =
+        token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
+
+  // key and value cache should be stored interleaved
+  kCache_ptr[to_idx + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+  vCache_ptr[to_idx + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+  qTmp_ptr[token_idx * hidden_size + offset] = 
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
+}
+
+template <typename DT>
+void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
+                                 BatchConfig const *bc,
+                                 cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->keyCache),
+      static_cast<half *>(m->valueCache),
+      m->token_infos,
+      m->request_infos,
+      BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num(),
+      m->hidden_size,
+      num_new_tokens);
+}
 
 __global__ void prepare_inference_params_kernel(int const num_requests,
                           BatchConfig::PerRequestInfo *request_infos,
@@ -383,6 +383,79 @@ __global__ void produce_output_kernel(half const *input_ptr,
   }
 }
 
+template <typename DT>
+void orig_tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
+                          BatchConfig const *bc,
+                          DT *output_ptr,
+                          cudaStream_t stream) {
+  // global constant parameters
+  uint32_t const num_q_heads = m->num_q_heads;
+  uint32_t const num_kv_heads = m->num_kv_heads;
+  uint32_t const group_size = num_q_heads / num_kv_heads;
+  uint32_t const head_dim = m->qProjSize;
+  uint32_t const batch_size = bc->num_active_requests();
+  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+
+  // for finding q, k, v, custom_mask pointers
+  uint32_t const hidden_size = m->hidden_size;
+  uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
+                          BatchConfig::max_spec_tree_token_num();
+  uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
+  uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+                            BatchConfig::max_sequence_length();
+
+  int mask_lens = 0, mask_offset = 0;
+  for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
+    if (!bc->request_available[req_idx]) {
+      continue;
+    }
+    BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
+    uint32_t q_len = req->num_tokens_in_batch,
+             q_start = req->first_token_index_in_request,
+             kv_len = q_len + q_start;
+
+    mask_offset = mask_lens;
+    mask_lens += q_len * kv_len;
+
+    half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
+        * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
+        * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
+        * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
+    float* tmp = static_cast<float *>(m->workspace);
+    float* custom_mask = m->custom_mask + mask_offset;
+
+    DISPATCH_GROUPSIZE(
+      group_size, GROUP_SIZE,
+        {DISPATCH_HEADDIM(
+          head_dim, HEAD_DIM, {
+    if (bc->prompt_phase) {
+      flashinfer::SinglePrefillWithKVCacheDispatched<
+        GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
+        false, MaskMode::kCausal, half, half>(
+          q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
+          /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
+          /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+    } else {
+      flashinfer::SinglePrefillWithKVCacheDispatched<
+          GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half>(
+            q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
+            num_kv_heads, q_len, kv_len, sm_scale,
+            /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
+    }
+    })});
+  }
+
+  {
+    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+    produce_output_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(
+        m->outputTmp, output_ptr, parallelism);
+  }
+}
+
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           BatchConfig const *bc,
@@ -458,56 +531,6 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     }
   })})});
 
-  // for finding q, k, v, custom_mask pointers
-  // uint32_t const hidden_size = m->hidden_size;
-  // uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
-  //                         BatchConfig::max_spec_tree_token_num();
-  // uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
-  // uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
-  //                           BatchConfig::max_sequence_length();
-
-  // int mask_lens = 0, mask_offset = 0;
-  // for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
-  //   if (!bc->request_available[req_idx]) {
-  //     continue;
-  //   }
-  //   BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
-  //   uint32_t q_len = req->num_tokens_in_batch,
-  //            q_start = req->first_token_index_in_request,
-  //            kv_len = q_len + q_start;
-
-  //   mask_offset = mask_lens;
-  //   mask_lens += q_len * kv_len;
-
-  //   half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
-  //       * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
-  //       * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
-  //       * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
-  //   float* tmp = static_cast<float *>(m->workspace);
-  //   float* custom_mask = m->custom_mask + mask_offset;
-
-  //   DISPATCH_GROUPSIZE(
-  //     group_size, GROUP_SIZE,
-  //       {DISPATCH_HEADDIM(
-  //         head_dim, HEAD_DIM, {
-  //   if (bc->prompt_phase) {
-  //     flashinfer::SinglePrefillWithKVCacheDispatched<
-  //       GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-  //       false, MaskMode::kCausal, half, half>(
-  //         q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
-  //         /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
-  //         /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-  //   } else {
-  //     flashinfer::SinglePrefillWithKVCacheDispatched<
-  //         GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-  //         false, MaskMode::kCustom, half, half>(
-  //           q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
-  //           num_kv_heads, q_len, kv_len, sm_scale,
-  //           /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-  //   }
-  //   })});
-  // }
-
   {
     int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
     produce_output_kernel<<<GET_BLOCKS(parallelism),
@@ -613,6 +636,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
+  // orig_update_qkv_cache<DT>(m, bc, stream);
+  // orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+
   int processed_tokens_in_batch = bc->num_active_tokens();
 
   compute_o_prod_bias(m,

From 116aada89b77f04892bb8e128204744442c8c4dd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 23:47:16 -0700
Subject: [PATCH 355/667] chore: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 21f32e481..62fe16c27 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -767,7 +767,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 
   {
     size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t indices_size = ((batch_size + 1) * 2 + batch_size * 2);
+    size_t indices_size = std::max((batch_size + 1) * 4, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
@@ -778,13 +778,12 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                 sizeof(int32_t) * indices_size +
                 sizeof(float) * custom_mask_size + workspace_size);
 
+    q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
+    kv_indptr = q_indptr + indices_size / 4;
+    kv_indices = kv_indptr + indices_size / 4;
+    kv_last_page_len = kv_indices + indices_size / 4;
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
     workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
-    // Why we should allocate these after workspace?? (else we will get index out of bound)
-    q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
-    kv_indptr = q_indptr + batch_size + 1;
-    kv_indices = kv_indptr + batch_size + 1;
-    kv_last_page_len = kv_indices + batch_size;
   }
 
   // allocate memory for the seqArray and reserve space

From 96855dcd1273d11f74f18ffa9f48504d84979b27 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 10 Jun 2024 02:51:51 -0400
Subject: [PATCH 356/667] Fixed request manager.

---
 include/flexflow/request_manager.h |   7 +-
 src/runtime/request_manager.cc     | 331 +++++++++++------------------
 2 files changed, 126 insertions(+), 212 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 52e01b53b..21b1f1b59 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -120,8 +120,6 @@ struct Request {
         : from_index(from_index), to_index(to_index), token_id(token_id) {}
   };
   std::vector<CommittedToken> committed_tokens;
-  bool llm_committed = true;
-  bool ssm_committed = true;
 };
 
 class TokenTreeNode {
@@ -354,8 +352,9 @@ class RequestManager {
     long long start_time = 0, start_decoding_time = 0, finish_time = 0;
   };
   struct ProfileInfo {
-    // For SpecInfer: One step is comprised of one ssm speculation phase + a single llm verification phase (forward pass + verification)
-    // For Incr Decoding: One step is one LLM decoding phase
+    // For SpecInfer: One step is comprised of one ssm speculation phase + a
+    // single llm verification phase (forward pass + verification) For Incr
+    // Decoding: One step is one LLM decoding phase
     long long llm_step_start = 0, ssm_step_start = 0;
     // Times for each LLM verification phase (in ms)
     std::vector<double> llm_step_times;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0e0d7e448..ca535fbeb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -32,7 +32,8 @@ using tokenizers::Tokenizer;
 
 LegionRuntime::Logger::Category log_req_mgr("RequestManager");
 
-void write_to_output_file(std::string const &output_filepath, std::string const &str) {
+void write_to_output_file(std::string const &output_filepath,
+                          std::string const &str) {
   std::ostream *os = &std::cout;
   std::ofstream output_file;
   if (!output_filepath.empty()) {
@@ -522,7 +523,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "Request " << guid << " profiling: " << std::endl;
     if (profile_info.start_decoding_time != 0) {
       *os << "Decoding time: "
-          << (profile_info.finish_time - profile_info.start_decoding_time) * 1e-3
+          << (profile_info.finish_time - profile_info.start_decoding_time) *
+                 1e-3
           << " ms" << std::endl;
     } else {
       *os << "Decoding time: 0 ms" << std::endl;
@@ -530,34 +532,33 @@ void RequestManager::request_complete_clean_up(int batch_index) {
     *os << "Total time: "
         << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
         << std::endl;
-    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps << std::endl;
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
-            << std::endl;
-      }
-      *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps
+        << std::endl;
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
+          << std::endl;
+    }
+    *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
 
-      if (!output_filepath.empty()) {
-        output_file.close();
-      }
+    if (!output_filepath.empty()) {
+      output_file.close();
+    }
   }
   RequestProfileInfo profile_info = profiling_requests[guid];
-  std::string str = "[" + std::to_string(guid) + "] Request completed:" + 
-                      " decoding_time_ms(" + std::to_string(
-                        (profile_info.finish_time-
-                          profile_info.start_decoding_time)
-                          *1e-3) + ")" + 
-                      " total_time_ms(" + std::to_string(
-                        (profile_info.finish_time-
-                          profile_info.start_time)
-                          *1e-3) + ")" + 
-                      " LLM_decoding_steps(" + std::to_string(
-                        profile_info.llm_decoding_steps) 
-                        + ")";
+  std::string str =
+      "[" + std::to_string(guid) +
+      "] Request completed:" + " decoding_time_ms(" +
+      std::to_string(
+          (profile_info.finish_time - profile_info.start_decoding_time) *
+          1e-3) +
+      ")" + " total_time_ms(" +
+      std::to_string((profile_info.finish_time - profile_info.start_time) *
+                     1e-3) +
+      ")" + " LLM_decoding_steps(" +
+      std::to_string(profile_info.llm_decoding_steps) + ")";
   if (decoding_mode == SPECULATIVE_DECODING) {
-    str = str + " SSM_decoding_steps(" + std::to_string(
-      profile_info.ssm_decoding_steps) 
-      + ")";
+    str = str + " SSM_decoding_steps(" +
+          std::to_string(profile_info.ssm_decoding_steps) + ")";
   }
   write_to_output_file("", str);
 
@@ -699,24 +700,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
-  int committed_token_offset = prefill_request->llm_cache_size;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-  prefill_request->committed_tokens.clear();
-
-  if (decoding_mode == SPECULATIVE_DECODING) {
-    // Modified the state because the last commitment completes
-    prefill_request->llm_committed = true;
-    assert(prefill_request->ssm_committed and prefill_request->llm_committed);
-
-    for (int i = 0; i < prefill_request->num_tokens_in_batch; i++) {
-      prefill_request->committed_tokens.push_back(Request::CommittedToken{
-          -1,
-          committed_token_offset + i,
-          prefill_request->tokens[i + committed_token_offset]});
-    }
-    // Modified the state because the new commitment is unfinished
-    prefill_request->llm_committed = false;
-  }
 
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
@@ -730,13 +714,12 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
     if (decoding_mode == SPECULATIVE_DECODING) {
       // Add the last token to the token tree
+      assert(prefill_request->committed_tokens.empty() &&
+             "The committed tokens should be empty.");
       prefill_request->committed_tokens.push_back(
           Request::CommittedToken{-1,
                                   (int)prefill_request->tokens.size() - 1,
                                   prefill_request->tokens.back()});
-      // Modified the state because the ssm also need to commit the last token
-      prefill_request->ssm_committed = false;
-
       init_token_tree(prefill_request->guid);
       add_root_to_spec_token_tree(prefill_request->guid,
                                   prefill_request->tokens.back());
@@ -746,29 +729,6 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 
   profiling_requests[prefill_request->guid].llm_prefilling_steps++;
 
-  // Manages the committed states for other requests in the batch
-  for (int request_index = 0; request_index < get_max_requests_per_batch();
-       ++request_index) {
-    if (!request_available[request_index]) {
-      continue;
-    }
-    int guid = guid_of_requests[request_index];
-    Request &request = all_requests[guid];
-    assert(request.status == Request::RUNNING);
-
-    if (request_index == prefill_request->batch_index) {
-
-      continue;
-    }
-
-    if (!request.llm_committed) {
-      request.llm_committed = true;
-      if (request.ssm_committed and request.llm_committed) {
-        request.llm_cache_size = request.tokens.size() - 1;
-        request.committed_tokens.clear();
-      }
-    }
-  }
   return prefill_completed;
 }
 
@@ -802,9 +762,10 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
                 << output << std::endl;
     }
   }
-  profiling.llm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() - 
-        profiling.llm_step_start) * 1e-3);
+  profiling.llm_step_times.push_back(
+      (Realm::Clock::current_time_in_microseconds() -
+       profiling.llm_step_start) *
+      1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
   profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
   return request_completed;
@@ -881,68 +842,39 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
   }
   bc.prompt_phase = true;
-  std::copy(std::begin(request_available),
-            std::end(request_available),
-            std::begin(bc.request_available));
-  bc.num_available_requests = num_available_requests;
+  bc.request_available[prefill_request->batch_index] = true;
+  bc.num_available_requests = 1;
 
-  for (int request_index = 0; request_index < get_max_requests_per_batch();
-       ++request_index) {
-    if (!request_available[request_index]) {
-      continue;
-    }
-    RequestGuid guid = guid_of_requests[request_index];
-    Request &request = all_requests[guid];
-    assert(request.status == Request::RUNNING);
+  int request_index = prefill_request->batch_index;
+  RequestGuid guid = guid_of_requests[request_index];
+  Request &request = all_requests[guid];
+  assert(request.status == Request::RUNNING);
 
-    if (request_index == prefill_request->batch_index) {
-      // Request Info
-      bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-      bc.requestsInfo[request_index].first_token_index_in_request =
-          prefill_request->llm_cache_size;
-      bc.requestsInfo[request_index].num_tokens_in_batch =
-          std::min(get_max_tokens_per_batch(),
-                   (int)prefill_request->tokens.size() -
-                       prefill_request->llm_cache_size);
-
-      prefill_request->first_token_offset_in_batch = 0;
-      prefill_request->num_tokens_in_batch =
-          bc.requestsInfo[request_index].num_tokens_in_batch;
-
-      // Token Info
-      for (int token_idx = 0;
-           token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
-           token_idx++) {
-        int abs_idx = prefill_request->llm_cache_size + token_idx;
-        assert(abs_idx < prefill_request->tokens.size());
-
-        bc.tokensInfo[token_idx].request_index = request_index;
-        bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-        bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-        bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
-
-        bc.num_tokens++;
-      }
+  // Request Info
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->llm_cache_size;
+  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
+      get_max_tokens_per_batch(),
+      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
 
-    } else {
-      bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-      bc.requestsInfo[request_index].first_token_index_in_request =
-          request.llm_cache_size;
-      bc.requestsInfo[request_index].num_tokens_in_batch = 0;
-
-      if (!request.llm_committed) {
-        // Committed tokens
-        for (int i = 0; i < request.committed_tokens.size() - 1; i++) {
-          bc.committed_tokens[bc.num_tokens_to_commit].index_in_kv_cache =
-              request.committed_tokens[i].from_index;
-          bc.committed_tokens[bc.num_tokens_to_commit].request_index =
-              request_index;
-          bc.committed_tokens[bc.num_tokens_to_commit].token_depth =
-              request.committed_tokens[i].to_index;
-          bc.num_tokens_to_commit++;
-        }
-      }
-    }
+  prefill_request->first_token_offset_in_batch = 0;
+  prefill_request->num_tokens_in_batch =
+      bc.requestsInfo[request_index].num_tokens_in_batch;
+
+  // Token Info
+  for (int token_idx = 0;
+       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
+       token_idx++) {
+    int abs_idx = prefill_request->llm_cache_size + token_idx;
+    assert(abs_idx < prefill_request->tokens.size());
+
+    bc.tokensInfo[token_idx].request_index = request_index;
+    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+
+    bc.num_tokens++;
   }
 
   if (verbose) {
@@ -1103,33 +1035,41 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
         new_bc.num_tokens;
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.ssm_cache_size;
-    // We don't directly use committed_tokens.size() here because there is a
-    // case where committed_tokens.size() != request.tokens.size() -
-    // request.ssm_cache_size, that's when the LLM prefilling is just finished
-    new_bc.requestsInfo[request_index].num_tokens_in_batch =
-        request.tokens.size() - request.ssm_cache_size;
-
-    request.first_token_offset_in_batch = new_bc.num_tokens;
-    request.num_tokens_in_batch =
-        request.tokens.size() - request.ssm_cache_size;
 
     // Store committed tokens to tokensInfo
-    int start_offset = committed_tokens.size() - request.tokens.size() +
-                       request.ssm_cache_size;
-    assert(start_offset >= 0 && "Invalid start offset.");
-    for (int committed_token_index = start_offset;
-         committed_token_index < committed_tokens.size();
-         committed_token_index++) {
-      new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-          committed_tokens[committed_token_index].to_index;
-      new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-          committed_tokens[committed_token_index].to_index;
-      new_bc.tokensInfo[new_bc.num_tokens].token_id =
-          committed_tokens[committed_token_index].token_id;
-      new_bc.num_tokens++;
+    int num_committed_tokens = committed_tokens.size();
+    if (num_committed_tokens == 1) {
+      new_bc.requestsInfo[request_index].num_tokens_in_batch = 1;
+      // The case where the prefilling is just finished. Although the last
+      // token's kv cache is already there, the we need to decode the last token
+      // because it's the root of the token tree.
+      new_bc.tokensInfo[0].request_index = request_index;
+      new_bc.tokensInfo[0].abs_index_in_request = committed_tokens[0].to_index;
+      new_bc.tokensInfo[0].abs_depth_in_request = committed_tokens[0].to_index;
+      new_bc.tokensInfo[0].token_id = committed_tokens[0].token_id;
+      new_bc.num_tokens = 1;
+    } else {
+      for (int committed_token_index = 1;
+           committed_token_index < committed_tokens.size();
+           committed_token_index++) {
+        new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            committed_tokens[committed_token_index].to_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            committed_tokens[committed_token_index].to_index;
+        new_bc.tokensInfo[new_bc.num_tokens].token_id =
+            committed_tokens[committed_token_index].token_id;
+        new_bc.num_tokens++;
+      }
+      new_bc.requestsInfo[request_index].num_tokens_in_batch =
+          num_committed_tokens - 1;
     }
 
+    request.first_token_offset_in_batch =
+        new_bc.requestsInfo[request_index].first_token_offset_in_batch;
+    request.num_tokens_in_batch =
+        new_bc.requestsInfo[request_index].num_tokens_in_batch;
+
     // Copy the causal mask, it should already been updated in
     // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
@@ -1138,8 +1078,7 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       profiling_requests[guid].start_decoding_time =
           Realm::Clock::current_time_in_microseconds();
     }
-    profiling.ssm_step_start = 
-      Realm::Clock::current_time_in_microseconds();
+    profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
   }
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
@@ -1282,22 +1221,20 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // BatchConfig.committed_tokens.
     // Note here, we shouldn't put the last token in request.committed_tokens
     // into new_bc. Because the LLM don't have that token's KV cache.
-    if (!request.llm_committed) {
-      std::vector<Request::CommittedToken> &committed_tokens =
-          request.committed_tokens;
-      for (int committed_token_index = 0;
-           committed_token_index < committed_tokens.size() - 1;
-           committed_token_index++) {
-        Request::CommittedToken &committed_token =
-            committed_tokens.at(committed_token_index);
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
-            request_index;
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
-            committed_token.from_index;
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-            committed_token.to_index;
-        new_bc.num_tokens_to_commit++;
-      }
+    std::vector<Request::CommittedToken> &committed_tokens =
+        request.committed_tokens;
+    for (int committed_token_index = 0;
+         committed_token_index < committed_tokens.size() - 1;
+         committed_token_index++) {
+      Request::CommittedToken &committed_token =
+          committed_tokens.at(committed_token_index);
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
+          request_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
+          committed_token.from_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+          committed_token.to_index;
+      new_bc.num_tokens_to_commit++;
     }
 
     // Load the tokens on the token tree that are not yet pruned to
@@ -1366,20 +1303,8 @@ bool RequestManager::update_llm_verify_results(
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    if (!request.llm_committed) {
-      request.llm_committed = true;
-      request.llm_cache_size +=
-          request.committed_tokens.size() - 1; // Exclude the last token
-      // Check if both the KV cache of SSM and LLM are committed, because
-      // sometimes the LLM KV cache is committed by a verifying batch config,
-      // sometimes it is committed by a LLM prefilling batch config. We don't
-      // know when the tokens are committed, so we have to add these checks
-      // whenever the SSM or the LLM commits tokens. If the both caches are
-      // committed, we can clear the committed tokens.
-      if (request.ssm_committed and request.llm_committed) {
-        request.committed_tokens.clear();
-      }
-    }
+    request.llm_cache_size += request.committed_tokens.size() - 1;
+    request.committed_tokens.clear();
 
     profiling_requests[guid].llm_decoding_steps++;
     nb_requests_decoded++;
@@ -1388,9 +1313,10 @@ bool RequestManager::update_llm_verify_results(
   // Process the LLM results greedily
   get_verify_results_greedy(llm_verify_result);
 
-  profiling.llm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() - 
-        profiling.llm_step_start) * 1e-3);
+  profiling.llm_step_times.push_back(
+      (Realm::Clock::current_time_in_microseconds() -
+       profiling.llm_step_start) *
+      1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
 
   // Clear the token tree node pool
@@ -1473,16 +1399,6 @@ bool RequestManager::update_ssm_inference_results(
     assert(request.status == Request::RUNNING);
 
     if (current_ssm_step == 1) {
-      request.ssm_committed = true;
-      // Check if both the KV cache of SSM and LLM are committed, because
-      // sometimes the LLM KV cache is committed by a verifying batch config,
-      // sometimes it is committed by a LLM prefilling batch config. We don't
-      // know when the tokens are committed, so we have to add these checks
-      // whenever the SSM or the LLM commits tokens. If the both caches are
-      // committed, we can clear the committed tokens.
-      if (request.ssm_committed and request.llm_committed) {
-        request.committed_tokens.clear();
-      }
       request.ssm_cache_size = request.tokens.size();
     }
 
@@ -1497,10 +1413,10 @@ bool RequestManager::update_ssm_inference_results(
   // Stop conditions
   if (all_request_last_layer_empty) {
     // Update profiling statistics before returning
-    profiling.ssm_step_times.push_back((
-        Realm::Clock::current_time_in_microseconds() -
-        profiling.ssm_step_start) * 1e-3
-      );
+    profiling.ssm_step_times.push_back(
+        (Realm::Clock::current_time_in_microseconds() -
+         profiling.ssm_step_start) *
+        1e-3);
     return true;
   }
   return false;
@@ -1747,9 +1663,6 @@ void RequestManager::get_verify_results_greedy(
         llm_verify_result
             .token_ids[llm_result_offset + last_accepted_token_index]);
 
-    request.llm_committed = false;
-    request.ssm_committed = false;
-
     total_nb_generated_tokens += request.committed_tokens.size();
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
@@ -2033,7 +1946,8 @@ void RequestManager::terminate_background_server_at_exit() {
 
 void RequestManager::terminate_background_server() {
   if (background_server_status == SERVING) {
-    assert(profiling.llm_step_times.size() == profiling.requests_per_step.size());
+    assert(profiling.llm_step_times.size() ==
+           profiling.requests_per_step.size());
     // Write the last profiling statistics to output file
     std::string str = "[Profiling Statistics]\n llm_step_times_ms(";
     std::string llm_step_times_ms = " ";
@@ -2050,7 +1964,8 @@ void RequestManager::terminate_background_server() {
     req_per_step += ")";
     str += req_per_step;
     if (profiling.ssm_step_times.size() > 0) {
-      assert(profiling.ssm_step_times.size() == profiling.llm_step_times.size());
+      assert(profiling.ssm_step_times.size() ==
+             profiling.llm_step_times.size());
       str += "\n ssm_step_times_ms(";
       std::string ssm_step_times_ms = " ";
       for (double time : profiling.ssm_step_times) {

From 2438b7af0d78387a6fb05205943edcc4e7207b6f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 23:54:20 -0700
Subject: [PATCH 357/667] feat: update commit_token_kernel

---
 src/ops/tree_inc_multihead_self_attention.cu | 54 +++++++++++++++++++-
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 62fe16c27..1fdb15f54 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -100,6 +100,54 @@ using flashinfer::BatchPrefillHandler;
 using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
 
 __global__ void commit_tokens_kernel(
+    half *kCache_ptr,
+    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
+    int token_pos,
+    int num_active_tokens_in_last_batch,
+    int max_seq_len,
+    int hidden_size) {
+  int const index_in_kv_cache = committedTokenInfos[token_pos].index_in_kv_cache;
+  if (index_in_kv_cache == -1) {
+    return;
+  }
+
+  int const req_id = committedTokenInfos[token_pos].request_index;
+  int const tok_id = committedTokenInfos[token_pos].token_depth;
+
+  size_t from_idx = (req_idx * max_seq_len + index_in_kv_cache)
+                    * hidden_size * 2;
+  size_t to_idx = (req_idx * max_seq_len + tok_id)
+                    * hidden_size * 2;
+  assert(to_idx <= from_idx);
+
+  CUDA_KERNEL_LOOP(offset, hidden_size) {
+    kCache_ptr[to_idx + offset] = kCache_ptr[from_idx + offset];
+    kCache_ptr[to_idx + hidden_size + offset] = kCache_ptr[from_idx + hidden_size + offset];
+  }
+}
+
+void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
+                   BatchConfig const *bc,
+                   cudaStream_t stream) {
+  int num_tokens_to_commit = bc->num_tokens_to_commit;
+  // TODO: parallel across queries
+  for (int i = 0; i < num_tokens_to_commit; i++) {
+    int parallelism = m->hidden_size;
+    commit_tokens_kernel<<<GET_BLOCKS(parallelism),
+                           min(CUDA_NUM_THREADS, parallelism),
+                           0,
+                           stream>>>(
+        static_cast<half *>(m->keyCache),
+        m->committed_token_infos,
+        i,
+        m->num_active_tokens, // number of active tokens in previous batch
+        BatchConfig::max_sequence_length() +
+            BatchConfig::max_spec_tree_token_num(),
+        m->hidden_size);
+  }
+}
+
+__global__ void orig_commit_tokens_kernel(
     half *kCache_ptr,
     half *vCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
@@ -130,14 +178,14 @@ __global__ void commit_tokens_kernel(
   }
 }
 
-void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
+void orig_commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   // TODO: parallel across queries
   for (int i = 0; i < num_tokens_to_commit; i++) {
     int parallelism = m->hidden_size;
-    commit_tokens_kernel<<<GET_BLOCKS(parallelism),
+    orig_commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                            min(CUDA_NUM_THREADS, parallelism),
                            0,
                            stream>>>(
@@ -584,6 +632,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   commit_tokens(m, bc, stream);
 
+  // orig_commit_tokens(m, bc, stream);
+
   // After commit we update m->num_active_tokens to be the number of active
   // tokens for the current batch
   m->num_active_tokens = bc->num_active_tokens();

From 60b79235d3fb4a54578425d8fa74e637c3548fa5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 9 Jun 2024 23:59:20 -0700
Subject: [PATCH 358/667] fix: minor

---
 src/ops/tree_inc_multihead_self_attention.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1fdb15f54..d7f7aff60 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -114,9 +114,9 @@ __global__ void commit_tokens_kernel(
   int const req_id = committedTokenInfos[token_pos].request_index;
   int const tok_id = committedTokenInfos[token_pos].token_depth;
 
-  size_t from_idx = (req_idx * max_seq_len + index_in_kv_cache)
+  size_t from_idx = (req_id * max_seq_len + index_in_kv_cache)
                     * hidden_size * 2;
-  size_t to_idx = (req_idx * max_seq_len + tok_id)
+  size_t to_idx = (req_id * max_seq_len + tok_id)
                     * hidden_size * 2;
   assert(to_idx <= from_idx);
 

From 389ee7a2ede6f48cad3f60ccdd974b392d2226c3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 10 Jun 2024 04:30:03 -0700
Subject: [PATCH 359/667] fix: get wrong batchconfig from future

---
 src/ops/fused.cu                             | 16 ++++++++++------
 src/ops/spec_inc_multihead_self_attention.cc |  9 +++++----
 src/ops/tree_inc_multihead_self_attention.cc | 13 +++++++------
 src/ops/tree_inc_multihead_self_attention.cu | 14 ++++++++------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index a89b16d43..4ef5f8446 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -937,8 +937,10 @@ __host__ void
                 (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
             // TreeVerifyBatchConfig const *verify_bc =
             //     (TreeVerifyBatchConfig *)task->args;
-            BatchConfig const &verify_bc =
-                Future(task->futures[0]).get_result<BatchConfig>();
+            // BatchConfig const &verify_bc =
+            //     Future(task->futures[0]).get_result<BatchConfig>();
+            BatchConfig const *verify_bc =
+                BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
                   (1 + (int)(*m->qkv_bias || *m->final_bias)));
             GenericTensorAccessorR biases;
@@ -948,7 +950,7 @@ __host__ void
             }
             TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
                 m,
-                &verify_bc,
+                verify_bc,
                 task->index_point.point_data[0],
                 my_input_accessor[0],
                 my_weight_accessor[0],
@@ -963,8 +965,10 @@ __host__ void
                 (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
             // TreeSearchBatchConfig const *search_bc =
             //     (TreeSearchBatchConfig *)task->args;
-            BatchConfig const &search_bc =
-                Future(task->futures[0]).get_result<BatchConfig>();
+            // BatchConfig const &search_bc =
+            //     Future(task->futures[0]).get_result<BatchConfig>();
+            BatchConfig const *search_bc =
+                BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
                   (1 + (int)(*m->qkv_bias || *m->final_bias)));
             GenericTensorAccessorR biases;
@@ -974,7 +978,7 @@ __host__ void
             }
             SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
                 m,
-                &search_bc,
+                search_bc,
                 task->index_point.point_data[0],
                 my_input_accessor[0],
                 my_weight_accessor[0],
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 0e71af411..5c3d097de 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -736,8 +736,9 @@ void SpecIncMultiHeadSelfAttention::inference_task(
     Runtime *runtime) {
   assert(task->regions.size() == regions.size());
 
-  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
-  if (bc.num_tokens == 0) {
+  // BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  if (bc->num_tokens == 0) {
     return;
   }
 
@@ -777,7 +778,7 @@ void SpecIncMultiHeadSelfAttention::inference_task(
 
   assert(task->index_point.get_dim() == 1);
   SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, bc, task->index_point.point_data[0], input, weight, output, biases);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
@@ -787,7 +788,7 @@ void SpecIncMultiHeadSelfAttention::inference_task(
       weights_accessors.push_back(biases);
     }
     SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, bc, {input}, weights_accessors, {output});
   }
 }
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index c1d1e53a5..efe4657a2 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -806,11 +806,12 @@ void TreeIncMultiHeadSelfAttention::inference_task(
     Runtime *runtime) {
   assert(task->regions.size() == regions.size());
 
-  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  // BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   log_tree_verify.debug("BatchConfig, num_tokens: %d, num_requests: %d",
-                        bc.num_tokens,
-                        bc.num_active_requests());
-  if (bc.num_tokens == 0) {
+                        bc->num_tokens,
+                        bc->num_active_requests());
+  if (bc->num_tokens == 0) {
     return;
   }
 
@@ -856,7 +857,7 @@ void TreeIncMultiHeadSelfAttention::inference_task(
   assert(task->index_point.get_dim() == 1);
 
   TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-      m, &bc, task->index_point.point_data[0], input, weight, output, biases);
+      m, bc, task->index_point.point_data[0], input, weight, output, biases);
 
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
@@ -867,7 +868,7 @@ void TreeIncMultiHeadSelfAttention::inference_task(
       weights_accessors.push_back(biases);
     }
     TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file(
-        m, shard_id, &bc, {input}, weights_accessors, {output});
+        m, shard_id, bc, {input}, weights_accessors, {output});
   }
 }
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d7f7aff60..c260d79a9 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -630,9 +630,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit <<
   // "\n";
 
-  commit_tokens(m, bc, stream);
+  if (!bc->prompt_phase) {
+    // commit_tokens(m, bc, stream);
 
-  // orig_commit_tokens(m, bc, stream);
+    orig_commit_tokens(m, bc, stream);
+  }
 
   // After commit we update m->num_active_tokens to be the number of active
   // tokens for the current batch
@@ -660,10 +662,10 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   }
 
   // Update key-val cache, compact q array
-  update_qkv_cache<DT>(m, bc, stream);
+  // update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
-  tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   // {
@@ -686,8 +688,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
-  // orig_update_qkv_cache<DT>(m, bc, stream);
-  // orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  orig_update_qkv_cache<DT>(m, bc, stream);
+  orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 

From 3c9089251acd3a9c1143d316bd8cb9b50e7b3f70 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Mon, 10 Jun 2024 13:13:50 -0400
Subject: [PATCH 360/667] Fixed request manager bug.

---
 src/runtime/request_manager.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ca535fbeb..c1eef7b47 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1043,11 +1043,14 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       // The case where the prefilling is just finished. Although the last
       // token's kv cache is already there, the we need to decode the last token
       // because it's the root of the token tree.
-      new_bc.tokensInfo[0].request_index = request_index;
-      new_bc.tokensInfo[0].abs_index_in_request = committed_tokens[0].to_index;
-      new_bc.tokensInfo[0].abs_depth_in_request = committed_tokens[0].to_index;
-      new_bc.tokensInfo[0].token_id = committed_tokens[0].token_id;
-      new_bc.num_tokens = 1;
+      new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+          committed_tokens[0].to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+          committed_tokens[0].to_index;
+      new_bc.tokensInfo[new_bc.num_tokens].token_id =
+          committed_tokens[0].token_id;
+      new_bc.num_tokens++;
     } else {
       for (int committed_token_index = 1;
            committed_token_index < committed_tokens.size();

From 3fb17c4587264ec846fc17def9b0984aaee92d58 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 01:32:49 -0700
Subject: [PATCH 361/667] chore: minor add some fields

---
 .../ops/tree_inc_multihead_self_attention.h   |  1 +
 src/ops/tree_inc_multihead_self_attention.cu  | 42 +++++++++++++------
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 3a0336d1b..1e0f44b9b 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -154,6 +154,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
   int32_t *kv_indices;
   int32_t *kv_last_page_len;
   float *custom_mask;
+  size_t workspace_size;
   void *workspace;
 };
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c260d79a9..38be1bfbc 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -519,6 +519,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const group_size = num_q_heads / num_kv_heads;
   uint32_t const head_dim = m->qProjSize;
+  uint32_t const page_size = BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num();
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
@@ -541,12 +542,13 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       * kv = static_cast<half *>(m->keyCache),
       * o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-    num_kv_heads, kPagesize, head_dim, batch_size, kv,
+    num_kv_heads, page_size, head_dim, batch_size, kv,
     m->kv_indices, m->kv_indptr, m->kv_last_page_len);
-  size_t workspace_size = 32 * 1024 * 1024;
-  BatchPrefillHandler handler(workspace_size);
+
+  BatchPrefillHandler handler;
+  handler.SetCUDAStream(stream);
   handler.BeginForward(
-      m->workspace, workspace_size, m->q_indptr, batch_size,
+      m->workspace, m->workspace_size, m->q_indptr, batch_size,
       num_q_heads, num_kv_heads, head_dim);
 
   DISPATCH_GROUPSIZE(
@@ -554,7 +556,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       DISPATCH_HEADDIM(
         head_dim, HEAD_DIM, {
           DISPATCH_PAGESIZE(
-            kPagesize, PAGE_SIZE, {
+            page_size, PAGE_SIZE, {
     cudaError_t result;
     if (bc->prompt_phase) {
       result = BatchPrefillWithPagedKVCacheWrapperDispatched<
@@ -631,9 +633,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // "\n";
 
   if (!bc->prompt_phase) {
-    // commit_tokens(m, bc, stream);
+    commit_tokens(m, bc, stream);
 
-    orig_commit_tokens(m, bc, stream);
+    // orig_commit_tokens(m, bc, stream);
   }
 
   // After commit we update m->num_active_tokens to be the number of active
@@ -662,10 +664,10 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   }
 
   // Update key-val cache, compact q array
-  // update_qkv_cache<DT>(m, bc, stream);
+  update_qkv_cache<DT>(m, bc, stream);
 
   // Compute attention
-  // tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   // {
@@ -688,8 +690,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
-  orig_update_qkv_cache<DT>(m, bc, stream);
-  orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
+  // orig_update_qkv_cache<DT>(m, bc, stream);
+  // orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 
@@ -701,6 +703,22 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       bias_ptr,
                       processed_tokens_in_batch,
                       stream);
+  
+  // {
+  //   int size = m->oProjSize;
+  //   DT *temp_output = new DT[size];
+  //   cudaDeviceSynchronize();
+  //   cudaMemcpy(
+  //       temp_output, output_ptr + m->oProjSize * (bc->num_active_tokens() - 1), size * sizeof(DT),
+  //       cudaMemcpyDeviceToHost);
+  //   printf("Output :");
+  //   for (int i = 0; i < size; ++i) {
+  //     printf("%.6f ", static_cast<float>(temp_output[i]));
+  //   }
+  //   printf("\n");
+
+  //   delete[] temp_output;
+  // }
 }
 
 } // namespace TreeIncMultiHeadAttention
@@ -824,7 +842,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
                                 BatchConfig::max_sequence_length());
-    size_t workspace_size = 32 * 1024 * 1024; // 32MB
+    workspace_size = 32 * 1024 * 1024; // 32MB
     
     gpu_mem_allocator.create_legion_instance(flashinfer_reserve_inst, 
                 sizeof(int32_t) * indices_size +

From 4a537bed95076c5bb5f4182bb6ec951356582c65 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 01:43:32 -0700
Subject: [PATCH 362/667] fix: kvlayout

---
 src/ops/tree_inc_multihead_self_attention.cu | 21 ++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 38be1bfbc..d13b5ff6e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -114,15 +114,15 @@ __global__ void commit_tokens_kernel(
   int const req_id = committedTokenInfos[token_pos].request_index;
   int const tok_id = committedTokenInfos[token_pos].token_depth;
 
-  size_t from_idx = (req_id * max_seq_len + index_in_kv_cache)
-                    * hidden_size * 2;
-  size_t to_idx = (req_id * max_seq_len + tok_id)
-                    * hidden_size * 2;
-  assert(to_idx <= from_idx);
+  size_t from_k_idx = (req_id * max_seq_len * 2 + index_in_kv_cache) * hidden_size,
+         from_v_idx = (req_id * max_seq_len * 2 + max_seq_len + index_in_kv_cache) * hidden_size;
+  size_t to_k_idx = (req_id * max_seq_len * 2 + tok_id) * hidden_size,
+          to_v_idx = (req_id * max_seq_len * 2 + max_seq_len + tok_id) * hidden_size;
+  assert(to_k_idx <= from_k_idx);
 
   CUDA_KERNEL_LOOP(offset, hidden_size) {
-    kCache_ptr[to_idx + offset] = kCache_ptr[from_idx + offset];
-    kCache_ptr[to_idx + hidden_size + offset] = kCache_ptr[from_idx + hidden_size + offset];
+    kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
+    kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
   }
 }
 
@@ -297,12 +297,13 @@ __global__ void update_qkv_cache_kernel(
 
   size_t from_idx =
         token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size * 2;
+  size_t to_k_idx = (req_idx * max_seq_len * 2 + token_abs_idx) * hidden_size,
+         to_v_idx = (req_idx * max_seq_len * 2 + max_seq_len + token_abs_idx) * hidden_size;
 
   // key and value cache should be stored interleaved
-  kCache_ptr[to_idx + offset] = 
+  kCache_ptr[to_k_idx + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_idx + hidden_size + offset] = 
+  kCache_ptr[to_v_idx + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
   qTmp_ptr[token_idx * hidden_size + offset] = 
       static_cast<half>(devQKVProjArray[from_idx + offset]);

From 54266eebe8aba29e7aa64e0fbb2c09437c8f9ec1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 05:59:48 -0700
Subject: [PATCH 363/667] feat: switch to appropriate page_size (64)

---
 .../ops/inc_multihead_self_attention.h        |  4 +
 src/ops/inc_multihead_self_attention.cu       | 12 +--
 src/ops/tree_inc_multihead_self_attention.cu  | 79 +++++++++++++------
 .../tree_inc_multihead_self_attention_impl.cu |  2 +-
 4 files changed, 64 insertions(+), 33 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 6dd43e333..8bf34ca7c 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -18,6 +18,10 @@
 #include <hip/hip_complex.h>
 #endif
 
+// kPagesize also defined in tree_inc_multihead_self_attention_impl.cu
+// for template instantiation
+constexpr uint32_t kPagesize = 64;
+
 namespace FlexFlow {
 
 class IncMultiHeadSelfAttentionMeta;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index e51eb7b96..037a72611 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1374,6 +1374,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
     size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0, qk_prod_size = 0;
+    assert((BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num()) % kPagesize == 0);
+    size_t max_num_pages = (BatchConfig::max_sequence_length() +
+                BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
     switch (infer_mode) {
       case INC_DECODING_MODE: {
         key_cache_size = num_q_heads * kProjSize *
@@ -1407,15 +1410,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
                          BatchConfig::max_requests_per_batch() *
-                         (BatchConfig::max_sequence_length() +
-                          BatchConfig::max_spec_tree_token_num());
+                         max_num_pages * kPagesize;
         value_cache_size = num_q_heads * vProjSize *
                            BatchConfig::max_requests_per_batch() *
-                           (BatchConfig::max_sequence_length() +
-                            BatchConfig::max_spec_tree_token_num());
+                           max_num_pages * kPagesize;
         qk_prod_size = BatchConfig::max_sequence_length() *
-                       (BatchConfig::max_sequence_length() +
-                        BatchConfig::max_spec_tree_token_num()) *
+                       max_num_pages * kPagesize *
                        num_q_heads;
         break;
       }
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d13b5ff6e..ec6320305 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -65,9 +65,6 @@
     }                                                  \
   }
 
-// kPagesize also defined in tree_inc_multihead_self_attention_impl.cu
-// for template instantiation
-constexpr uint32_t kPagesize = 512 + 64;
 #define DISPATCH_PAGESIZE(page_size, PAGE_SIZE, ...)  \
   if (page_size == kPagesize) {                        \
     constexpr size_t PAGE_SIZE = kPagesize;            \
@@ -99,12 +96,28 @@ using flashinfer::paged_kv_t;
 using flashinfer::BatchPrefillHandler;
 using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
 
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                    int const token_idx,
+                                                    int const max_num_pages,
+                                                    int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize *2 +
+          token_idx % kPagesize) * hidden_size;
+}
+
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                    int const token_idx,
+                                                    int const max_num_pages,
+                                                    int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize *2 +
+          kPagesize + token_idx % kPagesize) * hidden_size;
+}
+
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     int token_pos,
     int num_active_tokens_in_last_batch,
-    int max_seq_len,
+    int const max_num_pages,
     int hidden_size) {
   int const index_in_kv_cache = committedTokenInfos[token_pos].index_in_kv_cache;
   if (index_in_kv_cache == -1) {
@@ -114,10 +127,10 @@ __global__ void commit_tokens_kernel(
   int const req_id = committedTokenInfos[token_pos].request_index;
   int const tok_id = committedTokenInfos[token_pos].token_depth;
 
-  size_t from_k_idx = (req_id * max_seq_len * 2 + index_in_kv_cache) * hidden_size,
-         from_v_idx = (req_id * max_seq_len * 2 + max_seq_len + index_in_kv_cache) * hidden_size;
-  size_t to_k_idx = (req_id * max_seq_len * 2 + tok_id) * hidden_size,
-          to_v_idx = (req_id * max_seq_len * 2 + max_seq_len + tok_id) * hidden_size;
+  size_t from_k_idx = get_k_entry_offset(req_id, index_in_kv_cache, max_num_pages, hidden_size),
+         from_v_idx = get_v_entry_offset(req_id, index_in_kv_cache, max_num_pages, hidden_size);
+  size_t to_k_idx = get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
   assert(to_k_idx <= from_k_idx);
 
   CUDA_KERNEL_LOOP(offset, hidden_size) {
@@ -130,6 +143,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
+  int const max_num_pages = (BatchConfig::max_sequence_length() + 
+          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
   // TODO: parallel across queries
   for (int i = 0; i < num_tokens_to_commit; i++) {
     int parallelism = m->hidden_size;
@@ -141,8 +156,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
         m->committed_token_infos,
         i,
         m->num_active_tokens, // number of active tokens in previous batch
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
+        max_num_pages,
         m->hidden_size);
   }
 }
@@ -282,7 +296,7 @@ __global__ void update_qkv_cache_kernel(
     half *kCache_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
     BatchConfig::PerRequestInfo *request_infos,
-    int max_seq_len,
+    int const max_num_pages,
     int hidden_size,
     int num_new_tokens) {
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -297,8 +311,8 @@ __global__ void update_qkv_cache_kernel(
 
   size_t from_idx =
         token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = (req_idx * max_seq_len * 2 + token_abs_idx) * hidden_size,
-         to_v_idx = (req_idx * max_seq_len * 2 + max_seq_len + token_abs_idx) * hidden_size;
+  size_t to_k_idx = get_k_entry_offset(req_idx, token_abs_idx, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(req_idx, token_abs_idx, max_num_pages, hidden_size);
 
   // key and value cache should be stored interleaved
   kCache_ptr[to_k_idx + offset] = 
@@ -316,6 +330,8 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
   // update the kv cache, compact the q array
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->hidden_size * num_new_tokens;
+  int const max_num_pages = (BatchConfig::max_sequence_length() + 
+          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
   update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
                             min(CUDA_NUM_THREADS, parallelism),
                             0,
@@ -325,8 +341,7 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       static_cast<half *>(m->keyCache),
       m->token_infos,
       m->request_infos,
-      BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num(),
+      max_num_pages,
       m->hidden_size,
       num_new_tokens);
 }
@@ -391,6 +406,7 @@ void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
 __global__ void prepare_inference_params_kernel(int const num_requests,
                           BatchConfig::PerRequestInfo *request_infos,
                           bool *request_available,
+                          uint32_t const max_num_pages,
                           int32_t *q_indptr,
                           int32_t *kv_indptr,
                           int32_t *kv_indices,
@@ -403,11 +419,16 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
   // request id in batch config
   int requext_idx_in_batch = -1;
   int cnt_1 = 0, q_lens = 0;
+  int indices_offset = 0, indices_lens = 0, kv_len = 0;
   while (cnt_1 < request_idx + 1) {
     requext_idx_in_batch++;
     if (request_available[requext_idx_in_batch]) {
       cnt_1++;
       q_lens += request_infos[requext_idx_in_batch].num_tokens_in_batch;
+      kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch +
+                  request_infos[requext_idx_in_batch].first_token_index_in_request;
+      indices_offset = indices_lens;
+      indices_lens += (kv_len + kPagesize - 1) / kPagesize;
     }
   }
 
@@ -417,10 +438,11 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
   }
   __syncthreads();
   q_indptr[request_idx + 1] = q_lens;
-  kv_indptr[request_idx + 1] = request_idx + 1;
-  kv_indices[request_idx] = requext_idx_in_batch;
-  kv_last_page_len[request_idx] = request_infos[requext_idx_in_batch].num_tokens_in_batch +
-                                  request_infos[requext_idx_in_batch].first_token_index_in_request;
+  kv_indptr[request_idx + 1] = indices_lens;
+  for (int i = indices_offset; i < indices_lens; i++) {
+    kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset);
+  }
+  kv_last_page_len[request_idx] = kv_len % kPagesize;
 }
 
 template <typename DT>
@@ -520,7 +542,8 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const group_size = num_q_heads / num_kv_heads;
   uint32_t const head_dim = m->qProjSize;
-  uint32_t const page_size = BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num();
+  uint32_t const max_num_pages = (BatchConfig::max_sequence_length() + 
+                      BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
@@ -533,6 +556,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         batch_size,
         m->request_infos,
         m->request_available,
+        max_num_pages,
         m->q_indptr,
         m->kv_indptr,
         m->kv_indices,
@@ -543,7 +567,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       * kv = static_cast<half *>(m->keyCache),
       * o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-    num_kv_heads, page_size, head_dim, batch_size, kv,
+    num_kv_heads, kPagesize, head_dim, batch_size, kv,
     m->kv_indices, m->kv_indptr, m->kv_last_page_len);
 
   BatchPrefillHandler handler;
@@ -557,7 +581,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       DISPATCH_HEADDIM(
         head_dim, HEAD_DIM, {
           DISPATCH_PAGESIZE(
-            page_size, PAGE_SIZE, {
+            kPagesize, PAGE_SIZE, {
     cudaError_t result;
     if (bc->prompt_phase) {
       result = BatchPrefillWithPagedKVCacheWrapperDispatched<
@@ -838,7 +862,10 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 
   {
     size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t indices_size = std::max((batch_size + 1) * 4, 1ul * 1024 * 1024);
+    size_t max_num_pages = (BatchConfig::max_spec_tree_token_num() + 
+                  BatchConfig::max_sequence_length() + kPagesize - 1) / kPagesize;
+    size_t indices_size = std::max((batch_size + 1) * 3 + max_num_pages * batch_size,
+                            1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
@@ -850,9 +877,9 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                 sizeof(float) * custom_mask_size + workspace_size);
 
     q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
-    kv_indptr = q_indptr + indices_size / 4;
-    kv_indices = kv_indptr + indices_size / 4;
-    kv_last_page_len = kv_indices + indices_size / 4;
+    kv_indptr = q_indptr + batch_size + 1;
+    kv_indices = kv_indptr + batch_size + 1;
+    kv_last_page_len = kv_indices + max_num_pages * batch_size;
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
     workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
   }
diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
index 5c1b0777e..0af0529af 100644
--- a/src/ops/tree_inc_multihead_self_attention_impl.cu
+++ b/src/ops/tree_inc_multihead_self_attention_impl.cu
@@ -173,7 +173,7 @@ template cudaError_t SinglePrefillWithKVCacheDispatched<
     float rope_theta, cudaStream_t stream);
 
 
-constexpr uint32_t kPagesize = 512 + 64;
+constexpr uint32_t kPagesize = 64;
 // num_frags_x[] = {1, 2};
 // group_size[] = {1, 4, 8};
 // head_dim[] = {64, 128, 256};

From 184460e9ec9c454836cdf9056070edd11dbee49c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 10:10:19 -0700
Subject: [PATCH 364/667] feat: add qk_indtr for flashinfer forward

---
 .../ops/tree_inc_multihead_self_attention.h   |  1 +
 src/ops/tree_inc_multihead_self_attention.cu  | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 1e0f44b9b..200ff9fde 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -153,6 +153,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
   int32_t *kv_indptr;
   int32_t *kv_indices;
   int32_t *kv_last_page_len;
+  int32_t *qk_indptr;
   float *custom_mask;
   size_t workspace_size;
   void *workspace;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index ec6320305..02d258c5b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -410,7 +410,8 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
                           int32_t *q_indptr,
                           int32_t *kv_indptr,
                           int32_t *kv_indices,
-                          int32_t *kv_last_page_len) {
+                          int32_t *kv_last_page_len,
+                          int32_t *qk_indptr) {
   int const request_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (request_idx >= num_requests) {
     return;
@@ -418,15 +419,17 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
 
   // request id in batch config
   int requext_idx_in_batch = -1;
-  int cnt_1 = 0, q_lens = 0;
+  int cnt_1 = 0, q_lens = 0, qk_len = 0;
   int indices_offset = 0, indices_lens = 0, kv_len = 0;
   while (cnt_1 < request_idx + 1) {
     requext_idx_in_batch++;
     if (request_available[requext_idx_in_batch]) {
       cnt_1++;
-      q_lens += request_infos[requext_idx_in_batch].num_tokens_in_batch;
+      int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch;
+      q_lens += q_len;
       kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch +
                   request_infos[requext_idx_in_batch].first_token_index_in_request;
+      qk_len += q_len * kv_len;
       indices_offset = indices_lens;
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
     }
@@ -435,6 +438,7 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
   if (request_idx == 0) {
     q_indptr[0] = 0;
     kv_indptr[0] = 0;
+    qk_indptr[0] = 0;
   }
   __syncthreads();
   q_indptr[request_idx + 1] = q_lens;
@@ -443,6 +447,7 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
     kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset);
   }
   kv_last_page_len[request_idx] = kv_len % kPagesize;
+  qk_indptr[request_idx + 1] = qk_len;
 }
 
 template <typename DT>
@@ -560,7 +565,8 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         m->q_indptr,
         m->kv_indptr,
         m->kv_indices,
-        m->kv_last_page_len);
+        m->kv_last_page_len,
+        m->qk_indptr);
   }
 
   half* q = static_cast<half *>(m->queryTmp),
@@ -597,7 +603,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCustom,
         half, half, int32_t>(
           &handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
-          m->custom_mask, /*qk_indptr=*/nullptr, o, /*lse=*/nullptr,
+          m->custom_mask, m->qk_indptr, o, /*lse=*/nullptr,
           sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     }
     if (result != cudaSuccess) {
@@ -864,7 +870,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
     size_t batch_size = BatchConfig::max_requests_per_batch();
     size_t max_num_pages = (BatchConfig::max_spec_tree_token_num() + 
                   BatchConfig::max_sequence_length() + kPagesize - 1) / kPagesize;
-    size_t indices_size = std::max((batch_size + 1) * 3 + max_num_pages * batch_size,
+    size_t indices_size = std::max((batch_size + 1) * 4 + max_num_pages * batch_size,
                             1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               BatchConfig::max_spec_tree_token_num() *
@@ -880,6 +886,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
     kv_indptr = q_indptr + batch_size + 1;
     kv_indices = kv_indptr + batch_size + 1;
     kv_last_page_len = kv_indices + max_num_pages * batch_size;
+    qk_indptr = kv_last_page_len + batch_size + 1;
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
     workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
   }

From 578545d2002a7615cc3da04b9ddb991f15843529 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 10:11:26 -0700
Subject: [PATCH 365/667] fix: cornor bug

---
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 02d258c5b..961794a3f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -446,7 +446,7 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
   for (int i = indices_offset; i < indices_lens; i++) {
     kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset);
   }
-  kv_last_page_len[request_idx] = kv_len % kPagesize;
+  kv_last_page_len[request_idx] = (kv_len - 1) % kPagesize + 1;
   qk_indptr[request_idx + 1] = qk_len;
 }
 

From 40e7c305b89174b7e10787e882a7f635674a6950 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 11 Jun 2024 12:09:08 -0700
Subject: [PATCH 366/667] feat: move batch_prefill_handler into meta for
 performance

---
 .../flexflow/ops/tree_inc_multihead_self_attention.h |  1 +
 src/ops/tree_inc_multihead_self_attention.cu         | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 200ff9fde..3f3568803 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -157,6 +157,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
   float *custom_mask;
   size_t workspace_size;
   void *workspace;
+  void *batch_prefill_handler;
 };
 
 }; // namespace FlexFlow
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 961794a3f..c0a9ed923 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -576,9 +576,9 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
     num_kv_heads, kPagesize, head_dim, batch_size, kv,
     m->kv_indices, m->kv_indptr, m->kv_last_page_len);
 
-  BatchPrefillHandler handler;
-  handler.SetCUDAStream(stream);
-  handler.BeginForward(
+  BatchPrefillHandler *handler = static_cast<BatchPrefillHandler *>(m->batch_prefill_handler);
+  handler->SetCUDAStream(stream);
+  handler->BeginForward(
       m->workspace, m->workspace_size, m->q_indptr, batch_size,
       num_q_heads, num_kv_heads, head_dim);
 
@@ -594,7 +594,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
         HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCausal,
         half, half, int32_t>(
-          &handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
+          handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
           /*custom_mask=*/nullptr, /*qk_indptr=*/nullptr, o, /*lse=*/nullptr,
           sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     } else {
@@ -602,7 +602,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
         PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
         HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCustom,
         half, half, int32_t>(
-          &handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
+          handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
           m->custom_mask, m->qk_indptr, o, /*lse=*/nullptr,
           sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
     }
@@ -889,6 +889,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
     qk_indptr = kv_last_page_len + batch_size + 1;
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
     workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
+    batch_prefill_handler = static_cast<void *>(new flashinfer::BatchPrefillHandler);
   }
 
   // allocate memory for the seqArray and reserve space
@@ -913,6 +914,7 @@ TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
   if (flashinfer_reserve_inst != Realm::RegionInstance::NO_INST) {
     flashinfer_reserve_inst.destroy();
   }
+  delete static_cast<flashinfer::BatchPrefillHandler *>(batch_prefill_handler);
 }
 
 }; // namespace FlexFlow

From 3c1b03904939597fe2ae9245bfa0278218a3b576 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 12 Jun 2024 11:14:47 -0700
Subject: [PATCH 367/667] feat: remove unused legacy code

---
 src/ops/tree_inc_multihead_self_attention.cu | 191 -------------------
 1 file changed, 191 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c0a9ed923..be9768afb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -161,62 +161,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-__global__ void orig_commit_tokens_kernel(
-    half *kCache_ptr,
-    half *vCache_ptr,
-    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int token_pos,
-    int num_active_tokens_in_last_batch,
-    int max_seq_len,
-    int hidden_size) {
-  int const index_in_kv_cache = committedTokenInfos[token_pos].index_in_kv_cache;
-  if (index_in_kv_cache == -1) {
-    return;
-  }
-
-  int const req_id = committedTokenInfos[token_pos].request_index;
-  int const tok_id = committedTokenInfos[token_pos].token_depth;
-
-  size_t from_idx = req_id * (hidden_size * max_seq_len) +
-                    index_in_kv_cache * hidden_size;
-  size_t to_idx = req_id * (hidden_size * max_seq_len) +
-                  tok_id * hidden_size;
-  assert(to_idx <= from_idx);
-
-  CUDA_KERNEL_LOOP(offset, hidden_size) {
-    kCache_ptr[to_idx + offset] = kCache_ptr[from_idx + offset];
-    vCache_ptr[to_idx + offset] = vCache_ptr[from_idx + offset];
-  }
-}
-
-void orig_commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
-                   BatchConfig const *bc,
-                   cudaStream_t stream) {
-  int num_tokens_to_commit = bc->num_tokens_to_commit;
-  // TODO: parallel across queries
-  for (int i = 0; i < num_tokens_to_commit; i++) {
-    int parallelism = m->hidden_size;
-    orig_commit_tokens_kernel<<<GET_BLOCKS(parallelism),
-                           min(CUDA_NUM_THREADS, parallelism),
-                           0,
-                           stream>>>(
-        static_cast<half *>(m->keyCache),
-        static_cast<half *>(m->valueCache),
-        m->committed_token_infos,
-        m->qProjSize,
-        m->kProjSize,
-        m->vProjSize,
-        i,
-        m->num_active_tokens, // number of active tokens in previous batch
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
-        m->hidden_size);
-  }
-}
-
 __global__ void update_custom_mask_kernel(
     float *custom_mask,
     BatchConfig::BitMask *causalMask,
@@ -346,63 +290,6 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
-template <typename DT>
-__global__ void orig_update_qkv_cache_kernel(
-    DT *devQKVProjArray,
-    half *qTmp_ptr,
-    half *kCache_ptr,
-    half *vCache_ptr,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    BatchConfig::PerRequestInfo *request_infos,
-    int max_seq_len,
-    int hidden_size,
-    int num_new_tokens) {
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  size_t from_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_idx = (req_idx * max_seq_len + token_abs_idx) * hidden_size;
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_idx + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  vCache_ptr[to_idx + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] = 
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
-}
-
-template <typename DT>
-void orig_update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                 BatchConfig const *bc,
-                                 cudaStream_t stream) {
-  // update the kv cache, compact the q array
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  orig_update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<half *>(m->queryTmp),
-      static_cast<half *>(m->keyCache),
-      static_cast<half *>(m->valueCache),
-      m->token_infos,
-      m->request_infos,
-      BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num(),
-      m->hidden_size,
-      num_new_tokens);
-}
-
 __global__ void prepare_inference_params_kernel(int const num_requests,
                           BatchConfig::PerRequestInfo *request_infos,
                           bool *request_available,
@@ -459,79 +346,6 @@ __global__ void produce_output_kernel(half const *input_ptr,
   }
 }
 
-template <typename DT>
-void orig_tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
-                          BatchConfig const *bc,
-                          DT *output_ptr,
-                          cudaStream_t stream) {
-  // global constant parameters
-  uint32_t const num_q_heads = m->num_q_heads;
-  uint32_t const num_kv_heads = m->num_kv_heads;
-  uint32_t const group_size = num_q_heads / num_kv_heads;
-  uint32_t const head_dim = m->qProjSize;
-  uint32_t const batch_size = bc->num_active_requests();
-  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-
-  // for finding q, k, v, custom_mask pointers
-  uint32_t const hidden_size = m->hidden_size;
-  uint32_t const max_seq_len = BatchConfig::max_sequence_length() +
-                          BatchConfig::max_spec_tree_token_num();
-  uint32_t const max_q_length = BatchConfig::max_spec_tree_token_num();
-  uint32_t const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
-                            BatchConfig::max_sequence_length();
-
-  int mask_lens = 0, mask_offset = 0;
-  for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
-    if (!bc->request_available[req_idx]) {
-      continue;
-    }
-    BatchConfig::PerRequestInfo const *req = bc->requestsInfo + req_idx;
-    uint32_t q_len = req->num_tokens_in_batch,
-             q_start = req->first_token_index_in_request,
-             kv_len = q_len + q_start;
-
-    mask_offset = mask_lens;
-    mask_lens += q_len * kv_len;
-
-    half* q = static_cast<half *>(m->queryTmp) + req->first_token_offset_in_batch * hidden_size,
-        * k = static_cast<half *>(m->keyCache) + req_idx * max_seq_len * hidden_size,
-        * v = static_cast<half *>(m->valueCache) + req_idx * max_seq_len * hidden_size,
-        * o = m->outputTmp + req->first_token_offset_in_batch * hidden_size;
-    float* tmp = static_cast<float *>(m->workspace);
-    float* custom_mask = m->custom_mask + mask_offset;
-
-    DISPATCH_GROUPSIZE(
-      group_size, GROUP_SIZE,
-        {DISPATCH_HEADDIM(
-          head_dim, HEAD_DIM, {
-    if (bc->prompt_phase) {
-      flashinfer::SinglePrefillWithKVCacheDispatched<
-        GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-        false, MaskMode::kCausal, half, half>(
-          q, k, v, /*custom_mask=*/static_cast<float *>(nullptr), o, tmp,
-          /*lse=*/static_cast<float *>(nullptr), num_kv_heads, q_len, kv_len, sm_scale,
-          /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-    } else {
-      flashinfer::SinglePrefillWithKVCacheDispatched<
-          GROUP_SIZE, HEAD_DIM, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half>(
-            q, k, v, custom_mask, o, tmp, /*lse=*/static_cast<float *>(nullptr),
-            num_kv_heads, q_len, kv_len, sm_scale,
-            /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-    }
-    })});
-  }
-
-  {
-    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
-    produce_output_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(
-        m->outputTmp, output_ptr, parallelism);
-  }
-}
-
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           BatchConfig const *bc,
@@ -665,8 +479,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   if (!bc->prompt_phase) {
     commit_tokens(m, bc, stream);
-
-    // orig_commit_tokens(m, bc, stream);
   }
 
   // After commit we update m->num_active_tokens to be the number of active
@@ -721,9 +533,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   delete[] temp_output;
   // }
 
-  // orig_update_qkv_cache<DT>(m, bc, stream);
-  // orig_tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
-
   int processed_tokens_in_batch = bc->num_active_tokens();
 
   compute_o_prod_bias(m,

From e77c9fdde5a1542f46810c3a2dc8e750148e88ef Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 12 Jun 2024 20:48:47 -0700
Subject: [PATCH 368/667] chore: minor output

---
 src/ops/tree_inc_multihead_self_attention.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index be9768afb..27addfcdb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -352,6 +352,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                           DT *output_ptr,
                           cudaStream_t stream) {
   // cudaEvent_t t_start, t_end;
+  // float elapsed = 0;
   // cudaEventCreate(&t_start);
   // cudaEventCreate(&t_end);
   // cudaEventRecord(t_start, stream);
@@ -437,9 +438,8 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
-  // float elapsed = 0;
   // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  // printf("TreeIncMultiHeadSelfAttention time: %.2f ms\n", elapsed);
   // cudaEventDestroy(t_start);
   // cudaEventDestroy(t_end);
 }

From ed4b4ee40b838b1845c20ea49a9ce8844a119008 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 13 Jun 2024 02:17:22 -0400
Subject: [PATCH 369/667] Modified the update_custom_mask kernel.

---
 src/ops/tree_inc_multihead_self_attention.cu | 484 +++++++++++--------
 1 file changed, 276 insertions(+), 208 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index be9768afb..aadee321e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -25,54 +25,54 @@
 #include <sstream>
 #include <stdexcept>
 
-#define DISPATCH_GROUPSIZE(group_size, GROUP_SIZE, ...)      \
-  if (group_size == 1) {                                     \
-    constexpr size_t GROUP_SIZE = 1;                         \
-    __VA_ARGS__                                              \
-  } else if (group_size == 4) {                              \
-    constexpr size_t GROUP_SIZE = 4;                         \
-    __VA_ARGS__                                              \
-  } else if (group_size == 8) {                              \
-    constexpr size_t GROUP_SIZE = 8;                         \
-    __VA_ARGS__                                              \
-  } else {                                                   \
-    std::ostringstream err_msg;                              \
-    err_msg << "Unsupported group_size: " << group_size;     \
-    throw std::invalid_argument(err_msg.str());              \
+#define DISPATCH_GROUPSIZE(group_size, GROUP_SIZE, ...)                        \
+  if (group_size == 1) {                                                       \
+    constexpr size_t GROUP_SIZE = 1;                                           \
+    __VA_ARGS__                                                                \
+  } else if (group_size == 4) {                                                \
+    constexpr size_t GROUP_SIZE = 4;                                           \
+    __VA_ARGS__                                                                \
+  } else if (group_size == 8) {                                                \
+    constexpr size_t GROUP_SIZE = 8;                                           \
+    __VA_ARGS__                                                                \
+  } else {                                                                     \
+    std::ostringstream err_msg;                                                \
+    err_msg << "Unsupported group_size: " << group_size;                       \
+    throw std::invalid_argument(err_msg.str());                                \
   }
 
-#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)      \
-  switch (head_dim) {                                  \
-    case 64: {                                         \
-      constexpr size_t HEAD_DIM = 64;                  \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    case 128: {                                        \
-      constexpr size_t HEAD_DIM = 128;                 \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    case 256: {                                        \
-      constexpr size_t HEAD_DIM = 256;                 \
-      __VA_ARGS__                                      \
-      break;                                           \
-    }                                                  \
-    default: {                                         \
-      std::ostringstream err_msg;                      \
-      err_msg << "Unsupported head_dim: " << head_dim; \
-      throw std::invalid_argument(err_msg.str());      \
-    }                                                  \
+#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
+  switch (head_dim) {                                                          \
+    case 64: {                                                                 \
+      constexpr size_t HEAD_DIM = 64;                                          \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 128: {                                                                \
+      constexpr size_t HEAD_DIM = 128;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 256: {                                                                \
+      constexpr size_t HEAD_DIM = 256;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    default: {                                                                 \
+      std::ostringstream err_msg;                                              \
+      err_msg << "Unsupported head_dim: " << head_dim;                         \
+      throw std::invalid_argument(err_msg.str());                              \
+    }                                                                          \
   }
 
-#define DISPATCH_PAGESIZE(page_size, PAGE_SIZE, ...)  \
-  if (page_size == kPagesize) {                        \
-    constexpr size_t PAGE_SIZE = kPagesize;            \
-    __VA_ARGS__                                        \
-  } else {                                             \
-    std::ostringstream err_msg;                        \
-    err_msg << "Unsupported page_size: " << page_size; \
-    throw std::invalid_argument(err_msg.str());        \
+#define DISPATCH_PAGESIZE(page_size, PAGE_SIZE, ...)                           \
+  if (page_size == kPagesize) {                                                \
+    constexpr size_t PAGE_SIZE = kPagesize;                                    \
+    __VA_ARGS__                                                                \
+  } else {                                                                     \
+    std::ostringstream err_msg;                                                \
+    err_msg << "Unsupported page_size: " << page_size;                         \
+    throw std::invalid_argument(err_msg.str());                                \
   }
 
 namespace FlexFlow {
@@ -88,28 +88,30 @@ using namespace Kernels::IncMultiHeadAttention;
 namespace Kernels {
 namespace TreeIncMultiHeadAttention {
 
+using flashinfer::BatchPrefillHandler;
+using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
 using flashinfer::MaskMode;
+using flashinfer::paged_kv_t;
 using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
-using flashinfer::paged_kv_t;
-using flashinfer::BatchPrefillHandler;
-using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
 
 __device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                    int const token_idx,
-                                                    int const max_num_pages,
-                                                    int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize *2 +
-          token_idx % kPagesize) * hidden_size;
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) *
+         hidden_size;
 }
 
 __device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                    int const token_idx,
-                                                    int const max_num_pages,
-                                                    int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize *2 +
-          kPagesize + token_idx % kPagesize) * hidden_size;
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) *
+         hidden_size;
 }
 
 __global__ void commit_tokens_kernel(
@@ -119,7 +121,8 @@ __global__ void commit_tokens_kernel(
     int num_active_tokens_in_last_batch,
     int const max_num_pages,
     int hidden_size) {
-  int const index_in_kv_cache = committedTokenInfos[token_pos].index_in_kv_cache;
+  int const index_in_kv_cache =
+      committedTokenInfos[token_pos].index_in_kv_cache;
   if (index_in_kv_cache == -1) {
     return;
   }
@@ -127,10 +130,14 @@ __global__ void commit_tokens_kernel(
   int const req_id = committedTokenInfos[token_pos].request_index;
   int const tok_id = committedTokenInfos[token_pos].token_depth;
 
-  size_t from_k_idx = get_k_entry_offset(req_id, index_in_kv_cache, max_num_pages, hidden_size),
-         from_v_idx = get_v_entry_offset(req_id, index_in_kv_cache, max_num_pages, hidden_size);
-  size_t to_k_idx = get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
+  size_t from_k_idx = get_k_entry_offset(
+             req_id, index_in_kv_cache, max_num_pages, hidden_size),
+         from_v_idx = get_v_entry_offset(
+             req_id, index_in_kv_cache, max_num_pages, hidden_size);
+  size_t to_k_idx =
+             get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
+         to_v_idx =
+             get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
   assert(to_k_idx <= from_k_idx);
 
   CUDA_KERNEL_LOOP(offset, hidden_size) {
@@ -143,8 +150,10 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
-  int const max_num_pages = (BatchConfig::max_sequence_length() + 
-          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+  int const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
   // TODO: parallel across queries
   for (int i = 0; i < num_tokens_to_commit; i++) {
     int parallelism = m->hidden_size;
@@ -161,21 +170,22 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-__global__ void update_custom_mask_kernel(
-    float *custom_mask,
-    BatchConfig::BitMask *causalMask,
-    BatchConfig::PerRequestInfo *request_infos,
-    bool *request_available,
-    int const num_requests,
-    int const max_q_length,
-    int const max_kv_length,
-    float mask_value) {
+__global__ void
+    update_custom_mask_kernel(float *custom_mask,
+                              BatchConfig::BitMask *causalMask,
+                              BatchConfig::PerRequestInfo *request_infos,
+                              bool *request_available,
+                              int const num_requests,
+                              int const max_q_length,
+                              int const max_kv_length,
+                              float mask_value) {
   // get thread idx in [0, num_requests * max_q_length)
   int const idx = blockIdx.x * blockDim.x + threadIdx.x;
   // get (request_idx, q_idx) from thread idx
-  int const request_idx = idx / max_q_length;
-  int const q_idx = idx % max_q_length;
-  
+  int const request_idx = idx / max_q_length / max_kv_length;
+  int const q_idx = (idx % (max_q_length * max_kv_length)) / max_kv_length;
+  int const kv_idx = idx % max_kv_length;
+
   // request id in batch config
   int requext_idx_in_batch = -1;
   int cnt_1 = 0, mask_offset = 0, mask_lens = 0;
@@ -185,11 +195,12 @@ __global__ void update_custom_mask_kernel(
       cnt_1++;
       mask_offset = mask_lens;
       int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch,
-          k_len = q_len + request_infos[requext_idx_in_batch].first_token_index_in_request;
+          k_len =
+              q_len +
+              request_infos[requext_idx_in_batch].first_token_index_in_request;
       mask_lens += q_len * k_len;
     }
   }
-  __syncthreads();
 
   int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch;
   int const q_start =
@@ -197,17 +208,20 @@ __global__ void update_custom_mask_kernel(
   if (q_idx >= q_length) {
     return;
   }
+  if (kv_idx >= q_start + q_length) {
+    return;
+  }
   assert(q_start + q_length <= max_kv_length);
 
   float *mask = custom_mask + mask_offset + q_idx * (q_start + q_length);
-  // update custom mask
-  for (int i = 0; i < q_start; i++) {
-    mask[i] = 0.0f;
-  }
-  BatchConfig::BitMask *bitmask = &causalMask[requext_idx_in_batch];
-  for (int i = 0; i < q_length; i++) {
-    mask[q_start + i] = test_bit_orig(bitmask->bit_mask, q_idx, i)
-                  ? 0.0f : mask_value;
+  if (kv_idx < q_start) {
+    mask[kv_idx] = 0.0f;
+  } else {
+    mask[kv_idx] = test_bit_orig(causalMask[requext_idx_in_batch].bit_mask,
+                                 q_idx,
+                                 kv_idx - q_start)
+                       ? 0.0f
+                       : mask_value;
   }
 }
 
@@ -216,33 +230,32 @@ void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
                         cudaStream_t stream) {
   int const num_requests = bc->num_active_requests();
   int const max_q_length = BatchConfig::max_spec_tree_token_num();
-  int const max_kv_length = BatchConfig::max_spec_tree_token_num() + 
+  int const max_kv_length = BatchConfig::max_spec_tree_token_num() +
                             BatchConfig::max_sequence_length();
-  int parallelism = num_requests * max_q_length;
+  int parallelism = num_requests * max_q_length * max_kv_length;
   update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
                               min(CUDA_NUM_THREADS, parallelism),
                               0,
-                              stream>>>(
-      m->custom_mask,
-      m->causalMask,
-      m->request_infos,
-      m->request_available,
-      num_requests,
-      max_q_length,
-      max_kv_length,
-      -5e4);
+                              stream>>>(m->custom_mask,
+                                        m->causalMask,
+                                        m->request_infos,
+                                        m->request_available,
+                                        num_requests,
+                                        max_q_length,
+                                        max_kv_length,
+                                        -5e4);
 }
 
 template <typename DT>
-__global__ void update_qkv_cache_kernel(
-    DT *devQKVProjArray,
-    half *qTmp_ptr,
-    half *kCache_ptr,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    BatchConfig::PerRequestInfo *request_infos,
-    int const max_num_pages,
-    int hidden_size,
-    int num_new_tokens) {
+__global__ void
+    update_qkv_cache_kernel(DT *devQKVProjArray,
+                            half *qTmp_ptr,
+                            half *kCache_ptr,
+                            BatchConfig::PerTokenInfo const *tokenInfos,
+                            BatchConfig::PerRequestInfo *request_infos,
+                            int const max_num_pages,
+                            int hidden_size,
+                            int num_new_tokens) {
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int const token_idx = thread_idx / hidden_size;
   int const offset = thread_idx % hidden_size;
@@ -253,52 +266,55 @@ __global__ void update_qkv_cache_kernel(
   int const req_idx = tokenInfos[token_idx].request_index;
   int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-  size_t from_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = get_k_entry_offset(req_idx, token_abs_idx, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(req_idx, token_abs_idx, max_num_pages, hidden_size);
+  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_k_idx = get_k_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size);
 
   // key and value cache should be stored interleaved
-  kCache_ptr[to_k_idx + offset] = 
+  kCache_ptr[to_k_idx + offset] =
       static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_v_idx + offset] = 
+  kCache_ptr[to_v_idx + offset] =
       static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] = 
+  qTmp_ptr[token_idx * hidden_size + offset] =
       static_cast<half>(devQKVProjArray[from_idx + offset]);
 }
 
 template <typename DT>
 void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                 BatchConfig const *bc,
-                                 cudaStream_t stream) {
+                      BatchConfig const *bc,
+                      cudaStream_t stream) {
   // update the kv cache, compact the q array
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->hidden_size * num_new_tokens;
-  int const max_num_pages = (BatchConfig::max_sequence_length() + 
-          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+  int const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
   update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
                             min(CUDA_NUM_THREADS, parallelism),
                             0,
-                            stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<half *>(m->queryTmp),
-      static_cast<half *>(m->keyCache),
-      m->token_infos,
-      m->request_infos,
-      max_num_pages,
-      m->hidden_size,
-      num_new_tokens);
+                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                      static_cast<half *>(m->queryTmp),
+                                      static_cast<half *>(m->keyCache),
+                                      m->token_infos,
+                                      m->request_infos,
+                                      max_num_pages,
+                                      m->hidden_size,
+                                      num_new_tokens);
 }
 
-__global__ void prepare_inference_params_kernel(int const num_requests,
-                          BatchConfig::PerRequestInfo *request_infos,
-                          bool *request_available,
-                          uint32_t const max_num_pages,
-                          int32_t *q_indptr,
-                          int32_t *kv_indptr,
-                          int32_t *kv_indices,
-                          int32_t *kv_last_page_len,
-                          int32_t *qk_indptr) {
+__global__ void
+    prepare_inference_params_kernel(int const num_requests,
+                                    BatchConfig::PerRequestInfo *request_infos,
+                                    bool *request_available,
+                                    uint32_t const max_num_pages,
+                                    int32_t *q_indptr,
+                                    int32_t *kv_indptr,
+                                    int32_t *kv_indices,
+                                    int32_t *kv_last_page_len,
+                                    int32_t *qk_indptr) {
   int const request_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (request_idx >= num_requests) {
     return;
@@ -315,7 +331,7 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
       int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch;
       q_lens += q_len;
       kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch +
-                  request_infos[requext_idx_in_batch].first_token_index_in_request;
+               request_infos[requext_idx_in_batch].first_token_index_in_request;
       qk_len += q_len * kv_len;
       indices_offset = indices_lens;
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
@@ -339,8 +355,8 @@ __global__ void prepare_inference_params_kernel(int const num_requests,
 
 template <typename DT>
 __global__ void produce_output_kernel(half const *input_ptr,
-                           DT *output_ptr,
-                           int parallelism) {
+                                      DT *output_ptr,
+                                      int parallelism) {
   CUDA_KERNEL_LOOP(idx, parallelism) {
     output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
   }
@@ -348,9 +364,9 @@ __global__ void produce_output_kernel(half const *input_ptr,
 
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
-                          BatchConfig const *bc,
-                          DT *output_ptr,
-                          cudaStream_t stream) {
+                           BatchConfig const *bc,
+                           DT *output_ptr,
+                           cudaStream_t stream) {
   // cudaEvent_t t_start, t_end;
   // cudaEventCreate(&t_start);
   // cudaEventCreate(&t_end);
@@ -361,78 +377,125 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const group_size = num_q_heads / num_kv_heads;
   uint32_t const head_dim = m->qProjSize;
-  uint32_t const max_num_pages = (BatchConfig::max_sequence_length() + 
-                      BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+  uint32_t const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
   uint32_t const batch_size = bc->num_active_requests();
-  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+  float const sm_scale =
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
 
   {
     int parallelism = batch_size;
     prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
-                                      stream>>>(
-        batch_size,
-        m->request_infos,
-        m->request_available,
-        max_num_pages,
-        m->q_indptr,
-        m->kv_indptr,
-        m->kv_indices,
-        m->kv_last_page_len,
-        m->qk_indptr);
+                                      stream>>>(batch_size,
+                                                m->request_infos,
+                                                m->request_available,
+                                                max_num_pages,
+                                                m->q_indptr,
+                                                m->kv_indptr,
+                                                m->kv_indices,
+                                                m->kv_last_page_len,
+                                                m->qk_indptr);
   }
 
-  half* q = static_cast<half *>(m->queryTmp),
-      * kv = static_cast<half *>(m->keyCache),
-      * o = static_cast<half *>(m->outputTmp);
+  half *q = static_cast<half *>(m->queryTmp),
+       *kv = static_cast<half *>(m->keyCache),
+       *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-    num_kv_heads, kPagesize, head_dim, batch_size, kv,
-    m->kv_indices, m->kv_indptr, m->kv_last_page_len);
-
-  BatchPrefillHandler *handler = static_cast<BatchPrefillHandler *>(m->batch_prefill_handler);
+      num_kv_heads,
+      kPagesize,
+      head_dim,
+      batch_size,
+      kv,
+      m->kv_indices,
+      m->kv_indptr,
+      m->kv_last_page_len);
+
+  BatchPrefillHandler *handler =
+      static_cast<BatchPrefillHandler *>(m->batch_prefill_handler);
   handler->SetCUDAStream(stream);
-  handler->BeginForward(
-      m->workspace, m->workspace_size, m->q_indptr, batch_size,
-      num_q_heads, num_kv_heads, head_dim);
+  handler->BeginForward(m->workspace,
+                        m->workspace_size,
+                        m->q_indptr,
+                        batch_size,
+                        num_q_heads,
+                        num_kv_heads,
+                        head_dim);
 
   DISPATCH_GROUPSIZE(
-    group_size, GROUP_SIZE, {
-      DISPATCH_HEADDIM(
-        head_dim, HEAD_DIM, {
-          DISPATCH_PAGESIZE(
-            kPagesize, PAGE_SIZE, {
-    cudaError_t result;
-    if (bc->prompt_phase) {
-      result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-        PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
-        HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCausal,
-        half, half, int32_t>(
-          handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
-          /*custom_mask=*/nullptr, /*qk_indptr=*/nullptr, o, /*lse=*/nullptr,
-          sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-    } else {
-      result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-        PageStorage::kIndices, QKVLayout::kNHD, PAGE_SIZE, GROUP_SIZE,
-        HEAD_DIM, PosEncodingMode::kNone, false, MaskMode::kCustom,
-        half, half, int32_t>(
-          handler, q, m->q_indptr, /*q_offset=*/nullptr, paged_kv,
-          m->custom_mask, m->qk_indptr, o, /*lse=*/nullptr,
-          sm_scale, /*rope_scale=*/1.f, /*rope_theta=*/static_cast<float>(1e4), stream);
-    }
-    if (result != cudaSuccess) {
-      throw std::runtime_error("Failed to run BatchPrefillWithPagedKVCacheWrapperDispatched"
-        + std::string(cudaGetErrorString(result)));
-    }
-  })})});
+      group_size,
+      GROUP_SIZE,
+      {DISPATCH_HEADDIM(
+          head_dim, HEAD_DIM, {DISPATCH_PAGESIZE(kPagesize, PAGE_SIZE, {
+            cudaError_t result;
+            if (bc->prompt_phase) {
+              result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+                  PageStorage::kIndices,
+                  QKVLayout::kNHD,
+                  PAGE_SIZE,
+                  GROUP_SIZE,
+                  HEAD_DIM,
+                  PosEncodingMode::kNone,
+                  false,
+                  MaskMode::kCausal,
+                  half,
+                  half,
+                  int32_t>(handler,
+                           q,
+                           m->q_indptr,
+                           /*q_offset=*/nullptr,
+                           paged_kv,
+                           /*custom_mask=*/nullptr,
+                           /*qk_indptr=*/nullptr,
+                           o,
+                           /*lse=*/nullptr,
+                           sm_scale,
+                           /*rope_scale=*/1.f,
+                           /*rope_theta=*/static_cast<float>(1e4),
+                           stream);
+            } else {
+              result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+                  PageStorage::kIndices,
+                  QKVLayout::kNHD,
+                  PAGE_SIZE,
+                  GROUP_SIZE,
+                  HEAD_DIM,
+                  PosEncodingMode::kNone,
+                  false,
+                  MaskMode::kCustom,
+                  half,
+                  half,
+                  int32_t>(handler,
+                           q,
+                           m->q_indptr,
+                           /*q_offset=*/nullptr,
+                           paged_kv,
+                           m->custom_mask,
+                           m->qk_indptr,
+                           o,
+                           /*lse=*/nullptr,
+                           sm_scale,
+                           /*rope_scale=*/1.f,
+                           /*rope_theta=*/static_cast<float>(1e4),
+                           stream);
+            }
+            if (result != cudaSuccess) {
+              throw std::runtime_error(
+                  "Failed to run "
+                  "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                  std::string(cudaGetErrorString(result)));
+            }
+          })})});
 
   {
     int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
     produce_output_kernel<<<GET_BLOCKS(parallelism),
                             min(CUDA_NUM_THREADS, parallelism),
                             0,
-                            stream>>>(
-        m->outputTmp, output_ptr, parallelism);
+                            stream>>>(m->outputTmp, output_ptr, parallelism);
   }
 
   // cudaEventRecord(t_end, stream);
@@ -543,14 +606,14 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       bias_ptr,
                       processed_tokens_in_batch,
                       stream);
-  
+
   // {
   //   int size = m->oProjSize;
   //   DT *temp_output = new DT[size];
   //   cudaDeviceSynchronize();
   //   cudaMemcpy(
-  //       temp_output, output_ptr + m->oProjSize * (bc->num_active_tokens() - 1), size * sizeof(DT),
-  //       cudaMemcpyDeviceToHost);
+  //       temp_output, output_ptr + m->oProjSize * (bc->num_active_tokens() -
+  //       1), size * sizeof(DT), cudaMemcpyDeviceToHost);
   //   printf("Output :");
   //   for (int i = 0; i < size; ++i) {
   //     printf("%.6f ", static_cast<float>(temp_output[i]));
@@ -677,19 +740,22 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
 
   {
     size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t max_num_pages = (BatchConfig::max_spec_tree_token_num() + 
-                  BatchConfig::max_sequence_length() + kPagesize - 1) / kPagesize;
-    size_t indices_size = std::max((batch_size + 1) * 4 + max_num_pages * batch_size,
-                            1ul * 1024 * 1024);
+    size_t max_num_pages =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize;
+    size_t indices_size = std::max(
+        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               BatchConfig::max_spec_tree_token_num() *
                               (BatchConfig::max_spec_tree_token_num() +
-                                BatchConfig::max_sequence_length());
+                               BatchConfig::max_sequence_length());
     workspace_size = 32 * 1024 * 1024; // 32MB
-    
-    gpu_mem_allocator.create_legion_instance(flashinfer_reserve_inst, 
-                sizeof(int32_t) * indices_size +
-                sizeof(float) * custom_mask_size + workspace_size);
+
+    gpu_mem_allocator.create_legion_instance(
+        flashinfer_reserve_inst,
+        sizeof(int32_t) * indices_size + sizeof(float) * custom_mask_size +
+            workspace_size);
 
     q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
     kv_indptr = q_indptr + batch_size + 1;
@@ -697,8 +763,10 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
     kv_last_page_len = kv_indices + max_num_pages * batch_size;
     qk_indptr = kv_last_page_len + batch_size + 1;
     custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
-    workspace = static_cast<void *>(gpu_mem_allocator.allocate_instance<char>(workspace_size));
-    batch_prefill_handler = static_cast<void *>(new flashinfer::BatchPrefillHandler);
+    workspace = static_cast<void *>(
+        gpu_mem_allocator.allocate_instance<char>(workspace_size));
+    batch_prefill_handler =
+        static_cast<void *>(new flashinfer::BatchPrefillHandler);
   }
 
   // allocate memory for the seqArray and reserve space

From fb360c02c46e9f6a71c571b4dd0edf141d7dfa82 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 13 Jun 2024 03:44:15 -0400
Subject: [PATCH 370/667] Modified the commit_tokens kernel

---
 src/ops/tree_inc_multihead_self_attention.cu | 123 +++++++++++--------
 1 file changed, 74 insertions(+), 49 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index aadee321e..229cc90db 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -117,57 +117,82 @@ __device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
-    int token_pos,
-    int num_active_tokens_in_last_batch,
-    int const max_num_pages,
-    int hidden_size) {
-  int const index_in_kv_cache =
-      committedTokenInfos[token_pos].index_in_kv_cache;
-  if (index_in_kv_cache == -1) {
-    return;
+    bool const *request_available,
+    int num_requests,
+    int hidden_size,
+    int num_committed_tokens,
+    int const max_num_pages) {
+  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const request_compact_idx = idx / hidden_size;
+  int const offset = idx % hidden_size;
+  // request id in batch config
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0;
+  while (cnt_1 < request_compact_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+    }
   }
 
-  int const req_id = committedTokenInfos[token_pos].request_index;
-  int const tok_id = committedTokenInfos[token_pos].token_depth;
-
-  size_t from_k_idx = get_k_entry_offset(
-             req_id, index_in_kv_cache, max_num_pages, hidden_size),
-         from_v_idx = get_v_entry_offset(
-             req_id, index_in_kv_cache, max_num_pages, hidden_size);
-  size_t to_k_idx =
-             get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
-         to_v_idx =
-             get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
-  assert(to_k_idx <= from_k_idx);
-
-  CUDA_KERNEL_LOOP(offset, hidden_size) {
-    kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
-    kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
+  for (int i = 0; i < num_committed_tokens; i++) {
+    if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
+      int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache;
+      if (index_in_kv_cache == -1) {
+        continue;
+      }
+
+      int const req_id = committedTokenInfos[i].request_index;
+      int const tok_id = committedTokenInfos[i].token_depth;
+
+      size_t from_k_idx = get_k_entry_offset(
+                 req_id, index_in_kv_cache, max_num_pages, hidden_size),
+             from_v_idx = get_v_entry_offset(
+                 req_id, index_in_kv_cache, max_num_pages, hidden_size);
+      size_t to_k_idx =
+                 get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
+             to_v_idx =
+                 get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
+      assert(to_k_idx <= from_k_idx);
+
+      kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
+      kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
+    }
   }
 }
 
 void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    BatchConfig const *bc,
                    cudaStream_t stream) {
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   int const max_num_pages =
       (BatchConfig::max_sequence_length() +
        BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
       kPagesize;
-  // TODO: parallel across queries
-  for (int i = 0; i < num_tokens_to_commit; i++) {
-    int parallelism = m->hidden_size;
-    commit_tokens_kernel<<<GET_BLOCKS(parallelism),
-                           min(CUDA_NUM_THREADS, parallelism),
-                           0,
-                           stream>>>(
-        static_cast<half *>(m->keyCache),
-        m->committed_token_infos,
-        i,
-        m->num_active_tokens, // number of active tokens in previous batch
-        max_num_pages,
-        m->hidden_size);
-  }
+  int const num_requests = bc->num_active_requests();
+  int parallelism = m->hidden_size * num_requests;
+  commit_tokens_kernel_new<<<GET_BLOCKS(parallelism),
+                             min(CUDA_NUM_THREADS, parallelism),
+                             0,
+                             stream>>>(static_cast<half *>(m->keyCache),
+                                       m->committed_token_infos,
+                                       m->request_available,
+                                       num_requests,
+                                       m->hidden_size,
+                                       num_tokens_to_commit,
+                                       max_num_pages);
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   printf("Commit token time: %.2f ms\n", elapsed);
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
 }
 
 __global__ void
@@ -367,10 +392,10 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                            BatchConfig const *bc,
                            DT *output_ptr,
                            cudaStream_t stream) {
-  // cudaEvent_t t_start, t_end;
-  // cudaEventCreate(&t_start);
-  // cudaEventCreate(&t_end);
-  // cudaEventRecord(t_start, stream);
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
 
   // global constant parameters
   uint32_t const num_q_heads = m->num_q_heads;
@@ -498,13 +523,13 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                             stream>>>(m->outputTmp, output_ptr, parallelism);
   }
 
-  // cudaEventRecord(t_end, stream);
-  // checkCUDA(cudaEventSynchronize(t_end));
-  // float elapsed = 0;
-  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
-  // cudaEventDestroy(t_start);
-  // cudaEventDestroy(t_end);
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
 }
 
 template <typename DT>

From 78b8dac71f337f4b49382969a0491fedc625d627 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 13 Jun 2024 00:55:12 -0700
Subject: [PATCH 371/667] fix: minor typo

---
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 229cc90db..cc2b6d179 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -176,7 +176,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
       kPagesize;
   int const num_requests = bc->num_active_requests();
   int parallelism = m->hidden_size * num_requests;
-  commit_tokens_kernel_new<<<GET_BLOCKS(parallelism),
+  commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                              min(CUDA_NUM_THREADS, parallelism),
                              0,
                              stream>>>(static_cast<half *>(m->keyCache),

From 9e84bd797f4ad5870d1d1299a0946166d344887c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 13 Jun 2024 03:24:05 -0700
Subject: [PATCH 372/667] fix: minor

---
 inference/incr_decoding/incr_decoding.cc | 3 +++
 src/runtime/request_manager.cc           | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 6a3667d70..51be21971 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -214,6 +214,9 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
+  rm->set_max_spec_tree_token_num(64);
+  rm->set_max_tree_depth(8);
+  rm->set_max_tree_width(16);
   rm->set_verbose(verbose);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_id, tokenizer_filepath);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c1eef7b47..33b03ddc1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1967,8 +1967,8 @@ void RequestManager::terminate_background_server() {
     req_per_step += ")";
     str += req_per_step;
     if (profiling.ssm_step_times.size() > 0) {
-      assert(profiling.ssm_step_times.size() ==
-             profiling.llm_step_times.size());
+      // assert(profiling.ssm_step_times.size() ==
+      //        profiling.llm_step_times.size());
       str += "\n ssm_step_times_ms(";
       std::string ssm_step_times_ms = " ";
       for (double time : profiling.ssm_step_times) {

From e099405ea0d3ebcae009ac1f825aa519027add98 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 13 Jun 2024 23:17:14 -0700
Subject: [PATCH 373/667] feat: improve attention handler beginforward

---
 src/ops/tree_inc_multihead_self_attention.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index cc2b6d179..bc92b1b0f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -24,6 +24,7 @@
 
 #include <sstream>
 #include <stdexcept>
+#include <vector>
 
 #define DISPATCH_GROUPSIZE(group_size, GROUP_SIZE, ...)                        \
   if (group_size == 1) {                                                       \
@@ -409,6 +410,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
       (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+  std::vector<int32_t> q_indptr_h {0};
 
   {
     int parallelism = batch_size;
@@ -424,6 +426,13 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                                                 m->kv_indices,
                                                 m->kv_last_page_len,
                                                 m->qk_indptr);
+    for (int req_idx = 0; req_idx < bc->max_requests_per_batch();
+         req_idx++) {
+      if (bc->request_available[req_idx]) {
+        int q_len = bc->requestsInfo[req_idx].num_tokens_in_batch;
+        q_indptr_h.push_back(q_indptr_h.back() + q_len);
+      }
+    }
   }
 
   half *q = static_cast<half *>(m->queryTmp),
@@ -444,7 +453,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   handler->SetCUDAStream(stream);
   handler->BeginForward(m->workspace,
                         m->workspace_size,
-                        m->q_indptr,
+                        q_indptr_h.data(),
                         batch_size,
                         num_q_heads,
                         num_kv_heads,

From c2ab4ca8bce493c8fd34d70912a0358c1ae2b5d5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 13 Jun 2024 23:37:17 -0700
Subject: [PATCH 374/667] chore: minor

---
 inference/spec_infer/spec_infer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 5bb524095..62ccaf194 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -275,7 +275,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool use_full_precision = false;
   bool verbose = false;
   int max_requests_per_batch = 8;
-  int max_tokens_per_batch = 256;
+  int max_tokens_per_batch = 128;
   int max_sequence_length = 512;
   int max_spec_tree_token_num = 64;
   int expansion_degree = 3;

From 6f9cb956188bea474ea33ae8587b0ed8995515f6 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 16 Jun 2024 01:31:18 -0400
Subject: [PATCH 375/667] Added some profiling output

---
 src/ops/inc_multihead_self_attention.cu      | 143 ++++++++++++++-----
 src/ops/tree_inc_multihead_self_attention.cu |  91 ++++++++++--
 2 files changed, 186 insertions(+), 48 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 037a72611..52dd97165 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -471,8 +471,10 @@ __global__ void
                           hidden_size * (q_tensor ? 0 : 1);
     int complex_part_index = real_part_index + (proj_size / 2);
 
-    complex_input[i] = {input_ptr[real_part_index],
-                        input_ptr[complex_part_index]};
+    // complex_input[i] = {input_ptr[real_part_index],
+    //                     input_ptr[complex_part_index]};
+    cuFloatComplex cii = {input_ptr[real_part_index],
+                          input_ptr[complex_part_index]};
 
     // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64
     // apply a Cartesian coordinate transformation
@@ -488,9 +490,13 @@ __global__ void
     float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
     cuFloatComplex complex_pos = {cos(freq), sin(freq)};
 
-    complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    input_ptr[real_part_index] = complex_input[i].x;
-    input_ptr[complex_part_index] = complex_input[i].y;
+    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
+    // input_ptr[real_part_index] = complex_input[i].x;
+    // input_ptr[complex_part_index] = complex_input[i].y;
+
+    cii = cuCmulf(cii, complex_pos);
+    input_ptr[real_part_index] = cii.x;
+    input_ptr[complex_part_index] = cii.y;
   }
 }
 
@@ -520,6 +526,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   }
 #endif
 
+  cudaError_t err = cudaStreamSynchronize(stream);
+  cudaEvent_t t_start, t_end;
+  checkCUDA(cudaEventCreate(&t_start));
+  checkCUDA(cudaEventCreate(&t_end));
+  checkCUDA(cudaEventRecord(t_start, stream));
+
   // Step 1: Compute QKV projections
   {
     DT alpha = 1.0f, beta = 0.0f;
@@ -560,6 +572,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   }
 
+  checkCUDA(cudaEventRecord(t_end, stream));
+  checkCUDA(cudaEventSynchronize(t_end));
+  float elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  if (bc->inference_mode == TREE_VERIFY_MODE) {
+    std::cout << "GEMM time: " << elapsed << " ms\n";
+  }
+
   int num_tokens = bc->num_active_tokens();
   int parallelism = m->kProjSize * num_tokens * m->num_q_heads;
   size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads;
@@ -593,6 +615,10 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                                      m->hidden_size);
   }
 
+  checkCUDA(cudaEventCreate(&t_start));
+  checkCUDA(cudaEventCreate(&t_end));
+  checkCUDA(cudaEventRecord(t_start, stream));
+
   // Step 3: apply rotary embedding if needed
   if (*m->apply_rotary_embedding) {
     /*q&k*/
@@ -609,6 +635,15 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                                           q_array_size,
                                           m->hidden_size);
   }
+  checkCUDA(cudaEventRecord(t_end, stream));
+  checkCUDA(cudaEventSynchronize(t_end));
+  elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  if (bc->inference_mode == TREE_VERIFY_MODE) {
+    std::cout << "Rotary time: " << elapsed << " ms\n";
+  }
 }
 
 template <typename DT>
@@ -822,6 +857,11 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
 
+  cudaEvent_t t_start, t_end;
+  cudaEventCreate(&t_start);
+  cudaEventCreate(&t_end);
+  cudaEventRecord(t_start, stream);
+
   if (m->offload && m->biasSize > 0) {
     cudaMemcpyAsync(
         m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
@@ -839,6 +879,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      stream);
   update_kv_cache_kernel<DT>(m, bc, stream);
 
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  float elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  std::cout << "Prepare attn time: " << elapsed << " ms\n";
+
+  cudaEventCreate(&t_start);
+  cudaEventCreate(&t_end);
+  cudaEventRecord(t_start, stream);
+
   if (bc->prompt_phase) {
     // phase 3: Compute attention score for prompt tokens;
     compute_attention_kernel_prompt(
@@ -849,6 +901,14 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  std::cout << "Attn time: " << elapsed << " ms\n";
+
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
   //   float *temp_output = new float[size];
@@ -1182,11 +1242,9 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
   bool use_bias = *m->qkv_bias || *m->final_bias;
 
   cudaEvent_t t_start, t_end;
-  if (m->profiling) {
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-  }
+  cudaEventCreate(&t_start);
+  cudaEventCreate(&t_end);
+  cudaEventRecord(t_start, stream);
 
   // assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
@@ -1229,15 +1287,13 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
     assert(false && "Unspported data type");
   }
 
-  if (m->profiling) {
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-    printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed);
-  }
+  cudaEventRecord(t_end, stream);
+  checkCUDA(cudaEventSynchronize(t_end));
+  float elapsed = 0;
+  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  cudaEventDestroy(t_start);
+  cudaEventDestroy(t_end);
+  printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed);
 }
 
 IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
@@ -1373,10 +1429,16 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
-    size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0, qk_prod_size = 0;
-    assert((BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num()) % kPagesize == 0);
-    size_t max_num_pages = (BatchConfig::max_sequence_length() +
-                BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+    size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0,
+           qk_prod_size = 0;
+    assert((BatchConfig::max_sequence_length() +
+            BatchConfig::max_spec_tree_token_num()) %
+               kPagesize ==
+           0);
+    size_t max_num_pages =
+        (BatchConfig::max_sequence_length() +
+         BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+        kPagesize;
     switch (infer_mode) {
       case INC_DECODING_MODE: {
         key_cache_size = num_q_heads * kProjSize *
@@ -1405,18 +1467,17 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         break;
       }
       case TREE_VERIFY_MODE: {
-        query_tmp_size = num_q_heads * qProjSize *
-                         BatchConfig::max_tokens_per_batch();
+        query_tmp_size =
+            num_q_heads * qProjSize * BatchConfig::max_tokens_per_batch();
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_q_heads * kProjSize *
-                         BatchConfig::max_requests_per_batch() *
-                         max_num_pages * kPagesize;
+                         BatchConfig::max_requests_per_batch() * max_num_pages *
+                         kPagesize;
         value_cache_size = num_q_heads * vProjSize *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
-        qk_prod_size = BatchConfig::max_sequence_length() *
-                       max_num_pages * kPagesize *
-                       num_q_heads;
+        qk_prod_size = BatchConfig::max_sequence_length() * max_num_pages *
+                       kPagesize * num_q_heads;
         break;
       }
       default:
@@ -1428,8 +1489,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                                    kProjSize * num_q_heads)) /
                           2;
     size_t totalSize =
-        (qkv_max_proj_size + query_tmp_size + key_cache_size + value_cache_size +
-         2 * qk_prod_size + attn_heads_size) * size_of_dt +
+        (qkv_max_proj_size + query_tmp_size + key_cache_size +
+         value_cache_size + 2 * qk_prod_size + attn_heads_size) *
+            size_of_dt +
         output_tmp_size * data_type_size(DT_HALF) +
         complex_size * sizeof(cuFloatComplex); // more components will
                                                // be added here later
@@ -1437,15 +1499,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       // assert that we have enough reserved work space left
       size_t totalSharedSize =
           infer_mode == TREE_VERIFY_MODE
-              ? totalSize -
-                    (query_tmp_size + key_cache_size + value_cache_size + qkv_max_proj_size) *
-                        size_of_dt
-              : totalSize - (query_tmp_size + key_cache_size + value_cache_size) * size_of_dt;
+              ? totalSize - (query_tmp_size + key_cache_size +
+                             value_cache_size + qkv_max_proj_size) *
+                                size_of_dt
+              : totalSize -
+                    (query_tmp_size + key_cache_size + value_cache_size) *
+                        size_of_dt;
 
       size_t instance_size =
           size_of_dt *
           (infer_mode == TREE_VERIFY_MODE
-               ? query_tmp_size + key_cache_size + value_cache_size + qkv_max_proj_size
+               ? query_tmp_size + key_cache_size + value_cache_size +
+                     qkv_max_proj_size
                : query_tmp_size + key_cache_size + value_cache_size);
 
       if (quantization_type != DT_NONE) {
@@ -1473,8 +1538,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size *
                                                              size_of_dt);
     }
-    keyCache = gpu_mem_allocator.allocate_instance_untyped((key_cache_size + value_cache_size) *
-                                                           size_of_dt);
+    keyCache = gpu_mem_allocator.allocate_instance_untyped(
+        (key_cache_size + value_cache_size) * size_of_dt);
     valueCache = static_cast<void *>(static_cast<char *>(keyCache) +
                                      key_cache_size * size_of_dt);
     outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index cc2b6d179..95911ad1f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -177,15 +177,15 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   int const num_requests = bc->num_active_requests();
   int parallelism = m->hidden_size * num_requests;
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
-                             min(CUDA_NUM_THREADS, parallelism),
-                             0,
-                             stream>>>(static_cast<half *>(m->keyCache),
-                                       m->committed_token_infos,
-                                       m->request_available,
-                                       num_requests,
-                                       m->hidden_size,
-                                       num_tokens_to_commit,
-                                       max_num_pages);
+                         min(CUDA_NUM_THREADS, parallelism),
+                         0,
+                         stream>>>(static_cast<half *>(m->keyCache),
+                                   m->committed_token_infos,
+                                   m->request_available,
+                                   num_requests,
+                                   m->hidden_size,
+                                   num_tokens_to_commit,
+                                   max_num_pages);
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
   //   float elapsed = 0;
@@ -439,6 +439,11 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       m->kv_indptr,
       m->kv_last_page_len);
 
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   BatchPrefillHandler *handler =
       static_cast<BatchPrefillHandler *>(m->batch_prefill_handler);
   handler->SetCUDAStream(stream);
@@ -450,6 +455,14 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                         num_kv_heads,
                         head_dim);
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   printf("paged KV construction: %.4f ms\n", elapsed);
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
   DISPATCH_GROUPSIZE(
       group_size,
       GROUP_SIZE,
@@ -541,6 +554,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       DT *output_ptr,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   // additional processing for weight uploading
   if (m->handle.offload_reserve_space != nullptr) {
     // Note that we update weight_ptr and bias_ptr when uploading weight and
@@ -569,6 +587,18 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
     commit_tokens(m, bc, stream);
   }
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   std::cout << "commit tokens time: " << elapsed << " ms\n";
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   // After commit we update m->num_active_tokens to be the number of active
   // tokens for the current batch
   m->num_active_tokens = bc->num_active_tokens();
@@ -589,17 +619,60 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                      bias_ptr,
                      stream);
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   std::cout << "Compute qkv time: " << elapsed << " ms\n";
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   // Update gpu-side custom mask referring from CaualMask
   if (!bc->prompt_phase) {
     update_custom_mask(m, bc, stream);
   }
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   std::cout << "update custom mask time: " << elapsed << " ms\n";
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
   // Update key-val cache, compact q array
   update_qkv_cache<DT>(m, bc, stream);
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   std::cout << "update qkv time: " << elapsed << " ms\n";
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   // Compute attention
   tree_verify_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   std::cout << "Attn time: " << elapsed << " ms\n";
+
   // Debug output:
   // {
   //   int size = m->hidden_size * bc->num_active_tokens();

From 0002b25ae4d67f475c37805b094b86101c91a336 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 22 Jun 2024 02:55:44 -0400
Subject: [PATCH 376/667] Fix 0 logit caused by half precision.

---
 src/runtime/request_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 33b03ddc1..a44e7424b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2086,8 +2086,11 @@ bool RequestManager::add_tokens_to_spec_token_tree(
              child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
              child_pos++) {
           int result_idx = child_start_idx + child_pos;
-          child_probs[child_pos] = std::make_pair(
-              log(ssm_inference_result.probs[result_idx]), result_idx);
+          if (log(ssm_inference_result.probs[result_idx]) !=
+              -std::numeric_limits<float>::infinity()) {
+            child_probs[child_pos] = std::make_pair(
+                log(ssm_inference_result.probs[result_idx]), result_idx);
+          }
         }
         // Sort in descending order
         std::sort(child_probs.begin(),

From 93c3583f77733f41eb938995b72d4bbaa85a6149 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 22 Jun 2024 03:05:18 -0400
Subject: [PATCH 377/667] Commented out the profiling codes, but keep them
 there.

---
 src/ops/inc_multihead_self_attention.cu      |  53 ++++-----
 src/ops/tree_inc_multihead_self_attention.cu | 114 ++++++++++++++-----
 2 files changed, 114 insertions(+), 53 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 52dd97165..102010869 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -526,11 +526,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   }
 #endif
 
-  cudaError_t err = cudaStreamSynchronize(stream);
-  cudaEvent_t t_start, t_end;
-  checkCUDA(cudaEventCreate(&t_start));
-  checkCUDA(cudaEventCreate(&t_end));
-  checkCUDA(cudaEventRecord(t_start, stream));
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
+  //   cudaEvent_t t_start, t_end;
+  //   checkCUDA(cudaEventCreate(&t_start));
+  //   checkCUDA(cudaEventCreate(&t_end));
+  //   checkCUDA(cudaEventRecord(t_start, stream));
 
   // Step 1: Compute QKV projections
   {
@@ -572,15 +573,15 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   }
 
-  checkCUDA(cudaEventRecord(t_end, stream));
-  checkCUDA(cudaEventSynchronize(t_end));
-  float elapsed = 0;
-  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  cudaEventDestroy(t_start);
-  cudaEventDestroy(t_end);
-  if (bc->inference_mode == TREE_VERIFY_MODE) {
-    std::cout << "GEMM time: " << elapsed << " ms\n";
-  }
+  //   checkCUDA(cudaEventRecord(t_end, stream));
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
+  //     std::cout << "GEMM time: " << elapsed << " ms\n";
+  //   }
 
   int num_tokens = bc->num_active_tokens();
   int parallelism = m->kProjSize * num_tokens * m->num_q_heads;
@@ -615,9 +616,9 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                                      m->hidden_size);
   }
 
-  checkCUDA(cudaEventCreate(&t_start));
-  checkCUDA(cudaEventCreate(&t_end));
-  checkCUDA(cudaEventRecord(t_start, stream));
+  //   checkCUDA(cudaEventCreate(&t_start));
+  //   checkCUDA(cudaEventCreate(&t_end));
+  //   checkCUDA(cudaEventRecord(t_start, stream));
 
   // Step 3: apply rotary embedding if needed
   if (*m->apply_rotary_embedding) {
@@ -635,15 +636,15 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                                           q_array_size,
                                           m->hidden_size);
   }
-  checkCUDA(cudaEventRecord(t_end, stream));
-  checkCUDA(cudaEventSynchronize(t_end));
-  elapsed = 0;
-  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  cudaEventDestroy(t_start);
-  cudaEventDestroy(t_end);
-  if (bc->inference_mode == TREE_VERIFY_MODE) {
-    std::cout << "Rotary time: " << elapsed << " ms\n";
-  }
+  //   checkCUDA(cudaEventRecord(t_end, stream));
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
+  //     std::cout << "Rotary time: " << elapsed << " ms\n";
+  //   }
 }
 
 template <typename DT>
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 8bd6b76fd..84f33372a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -393,6 +393,8 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                            BatchConfig const *bc,
                            DT *output_ptr,
                            cudaStream_t stream) {
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
   //   cudaEvent_t t_start, t_end;
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -410,7 +412,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
       (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  std::vector<int32_t> q_indptr_h {0};
+  std::vector<int32_t> q_indptr_h{0};
 
   {
     int parallelism = batch_size;
@@ -426,8 +428,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
                                                 m->kv_indices,
                                                 m->kv_last_page_len,
                                                 m->qk_indptr);
-    for (int req_idx = 0; req_idx < bc->max_requests_per_batch();
-         req_idx++) {
+    for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
       if (bc->request_available[req_idx]) {
         int q_len = bc->requestsInfo[req_idx].num_tokens_in_batch;
         q_indptr_h.push_back(q_indptr_h.back() + q_len);
@@ -448,7 +449,16 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       m->kv_indptr,
       m->kv_last_page_len);
 
-  //   cudaEvent_t t_start, t_end;
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    attn prep time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
@@ -466,12 +476,18 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
-  //   float elapsed = 0;
+  //   elapsed = 0;
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   printf("paged KV construction: %.4f ms\n", elapsed);
+  //   if (device == 0) {
+  //     printf("    BeginForward time: %.4f ms\n", elapsed);
+  //   }
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
 
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   DISPATCH_GROUPSIZE(
       group_size,
       GROUP_SIZE,
@@ -537,6 +553,20 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
             }
           })})});
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    actual attn time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
   {
     int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
     produce_output_kernel<<<GET_BLOCKS(parallelism),
@@ -547,9 +577,11 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
-  //   float elapsed = 0;
+  //   elapsed = 0;
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
+  //   if (device == 0) {
+  //     printf("    produce_output_kernel time: %.4f ms\n", elapsed);
+  //   }
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
 }
@@ -563,6 +595,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       DT *output_ptr,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
+
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
   //   cudaEvent_t t_start, t_end;
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -602,7 +637,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
-  //   std::cout << "commit tokens time: " << elapsed << " ms\n";
+  //   if (device == 0) {
+  //     std::cout << "Commit tokens time: " << elapsed << " ms\n";
+  //   }
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -634,7 +671,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
-  //   std::cout << "Compute qkv time: " << elapsed << " ms\n";
+  //   if (device == 0) {
+  //     std::cout << "Compute qkv time: " << elapsed << " ms\n";
+  //   }
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -651,11 +690,14 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
-  //   std::cout << "update custom mask time: " << elapsed << " ms\n";
+  //   if (device == 0) {
+  //     std::cout << "Update custom mask time: " << elapsed << " ms\n";
+  //   }
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
+
   // Update key-val cache, compact q array
   update_qkv_cache<DT>(m, bc, stream);
 
@@ -665,7 +707,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
-  //   std::cout << "update qkv time: " << elapsed << " ms\n";
+  //   if (device == 0) {
+  //     std::cout << "Update qkv time: " << elapsed << " ms\n";
+  //   }
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -680,7 +724,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
   //   cudaEventDestroy(t_start);
   //   cudaEventDestroy(t_end);
-  //   std::cout << "Attn time: " << elapsed << " ms\n";
+  //   if (device == 0) {
+  //     std::cout << "Attn time: " << elapsed << " ms\n";
+  //   }
 
   // Debug output:
   // {
@@ -702,6 +748,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   //   delete[] temp_output;
   // }
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
 
   int processed_tokens_in_batch = bc->num_active_tokens();
 
@@ -714,6 +763,15 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       processed_tokens_in_batch,
                       stream);
 
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (device == 0) {
+  //     std::cout << "Compute output proj time: " << elapsed << " ms\n";
+  //   }
   // {
   //   int size = m->oProjSize;
   //   DT *temp_output = new DT[size];
@@ -747,12 +805,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
 
-  cudaEvent_t t_start, t_end;
-  if (m->profiling) {
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-  }
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
 
   // assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
@@ -796,14 +854,16 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     assert(false && "Unspported data type");
   }
 
-  if (m->profiling) {
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-  }
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (device == 0) {
+  //     std::cout << "TreeIncMultiHeadSelfAttention time: " << elapsed << "
+  //     ms\n";
+  //   }
 }
 
 TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(

From 8f134fb27c968ed5cd70022b732d008ec9029036 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 23 Jun 2024 02:33:47 -0400
Subject: [PATCH 378/667] Added more parameters to parse_args

---
 inference/spec_infer/spec_infer.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 0955020df..63c431e5c 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -63,6 +63,9 @@ void parse_input_args(char **argv,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_sequence_length,
+                      int &max_spec_tree_token_num,
+                      int &max_tree_width,
+                      int &max_tree_depth,
                       int &expansion_degree,
                       bool &spec_sampling,
                       bool &do_sample,
@@ -121,6 +124,18 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-spec-tree-token-num")) {
+      max_spec_tree_token_num = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tree-width")) {
+      max_tree_width = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tree-depth")) {
+      max_tree_depth = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--expansion-degree")) {
       expansion_degree = std::stoi(argv[++i]);
       continue;
@@ -315,6 +330,9 @@ void FlexFlow::top_level_task(Task const *task,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_sequence_length,
+                   max_spec_tree_token_num,
+                   max_tree_width,
+                   max_tree_depth,
                    expansion_degree,
                    spec_sampling,
                    do_sample,

From a7f649e770e56f2014240716218dae1f374626c3 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 23 Jun 2024 14:19:52 -0400
Subject: [PATCH 379/667] Fix

---
 src/runtime/request_manager.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1faf74511..f8f6897f9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1798,9 +1798,6 @@ void RequestManager::get_verify_results_sample(
       request.tokens.push_back(token_id);
     }
 
-    request.llm_committed = false;
-    request.ssm_committed = false;
-
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
       for (auto const &committed_token : request.committed_tokens) {

From 02c1a827a1e0edac85445602dab70d04b6ad0d27 Mon Sep 17 00:00:00 2001
From: Remi Delacourt <remi.delacourt@gmail.com>
Date: Wed, 26 Jun 2024 20:30:48 +0000
Subject: [PATCH 380/667] Fix ssm_decoding_step timeframe

---
 src/runtime/request_manager.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f8f6897f9..a904ef40a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1441,15 +1441,14 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  if (all_request_last_layer_empty) {
+  if (all_request_last_layer_empty or current_ssm_step == get_max_tree_depth()) {
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
         (Realm::Clock::current_time_in_microseconds() -
          profiling.ssm_step_start) *
         1e-3);
-    return true;
   }
-  return false;
+  return all_request_last_layer_empty;
 }
 
 /* --------- Bitmask Related Functions --------- */

From 8dbe98d4750147bd2451f44edf669676f8fd1a62 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com>
Date: Wed, 14 Aug 2024 09:21:18 -0400
Subject: [PATCH 381/667] Specscheduler new attention (#1434)

* feat: update deps/flashinfer

* feat: update flashinfer

* fix: now can get correct result, but has performance problem

* fix: update_custom_mask performance

* chore: minor

* chore: add perf code

* feat: add attention metadata

* feat: add AttentionMetaData

* feat: tree_verify_attn use global attentionmetadata

* feat: move attentionmetasize to global computing

* chore: minor

* chore: remove unused

* feat: add spec_inc_attn backup

* feat: SSM use flashinfer kernel

* fix: SSM don't use cudaGraph

* chore: remove redundant code

* chore: comment out minor

* feat: attention adapt to cudaGraph

* fix: split handler_collections for prompt/decode phases

* chore: tree verify cannot use cudaGraph

* feat: move all flashinfer-related states to global (tree search attention)

* fix: use identical attention_meta instance across all FFHandlers

* feat: enable cudaGraph in tree search mode

* chore: minor

* feat: tree search & verify use separate attention_meta

* fix: attention_metadata should be distinct for each worker

* feat: tree verify attention use metadata

* feat: support llm cudaGraph

* chore: minor

* chore: temporally only enable SSM cudaGraph for performance issue

* chore: minor

* fix: llm cudaGraph, should ensure the kernel parameter be consistent

* feat: reduce cudaGraph number

* feat: reduce cudaGraph instances number

* chore: remove unused backups

* chore: remove some debug outputs

* chore: some debug outputs

---------

Co-authored-by: zikun-li <lizikunzk@gmail.com>
---
 deps/flashinfer                               |    2 +-
 include/flexflow/config.h                     |  126 +-
 include/flexflow/ops/arg_topk.h.backup        |  110 --
 include/flexflow/ops/graph_params.h           |    2 +-
 .../ops/inc_multihead_self_attention.h        |    4 -
 .../inc_multihead_self_attention_utils.cuh    |    3 -
 .../ops/spec_inc_multihead_self_attention.h   |    6 +-
 .../ops/tree_inc_multihead_self_attention.h   |   13 +-
 ...tree_inc_multihead_self_attention.h.backup |  155 ---
 include/flexflow/request_manager.h            |    1 +
 src/ops/arg_topk.cc.backup                    |  510 --------
 src/ops/arg_topk.cu.backup                    |  525 --------
 src/ops/fused.cu                              |   55 +-
 src/ops/inc_multihead_self_attention.cu       |   24 +-
 src/ops/multihead_self_attention_impl.cu      |  166 +++
 src/ops/spec_inc_multihead_self_attention.cu  |  951 +++++---------
 src/ops/tree_inc_multihead_self_attention.cu  |  422 ++-----
 ...ree_inc_multihead_self_attention.cu.backup | 1119 -----------------
 .../tree_inc_multihead_self_attention_impl.cu |  509 --------
 src/runtime/model.cpp                         |    9 +-
 src/runtime/model.cu                          |   19 +-
 src/runtime/request_manager.cc                |   11 +
 src/runtime/request_manager.cu                |  380 +++++-
 23 files changed, 1114 insertions(+), 4008 deletions(-)
 delete mode 100644 include/flexflow/ops/arg_topk.h.backup
 delete mode 100644 include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
 delete mode 100644 src/ops/arg_topk.cc.backup
 delete mode 100644 src/ops/arg_topk.cu.backup
 create mode 100644 src/ops/multihead_self_attention_impl.cu
 delete mode 100644 src/ops/tree_inc_multihead_self_attention.cu.backup
 delete mode 100644 src/ops/tree_inc_multihead_self_attention_impl.cu

diff --git a/deps/flashinfer b/deps/flashinfer
index 7def34e31..457eb7893 160000
--- a/deps/flashinfer
+++ b/deps/flashinfer
@@ -1 +1 @@
-Subproject commit 7def34e316a731cd069f7fd30a9a2ffc70fad02a
+Subproject commit 457eb7893f3fbf3d7bd087a5f0e111261cf2a5b2
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 54b6d9d8e..b244563c4 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -66,6 +66,127 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
 
 class FFConfig;
 
+constexpr uint32_t kPagesize = 64;
+class AttentionMetaData {
+public:
+  AttentionMetaData() {
+    num_q_heads_ = 0;
+    num_kv_heads_ = 0;
+    head_dim_ = 0;
+    q_indptr = nullptr;
+    kv_indptr = nullptr;
+    kv_indices = nullptr;
+    kv_last_page_len = nullptr;
+    qk_indptr = nullptr;
+    custom_mask = nullptr;
+    workspace = nullptr;
+    workspace_block = 0;
+    mem_size_ = 0;
+    enabled_ = false;
+  }
+  AttentionMetaData(const AttentionMetaData &rhs) {
+    num_q_heads_ = rhs.num_q_heads_;
+    num_kv_heads_ = rhs.num_kv_heads_;
+    head_dim_ = rhs.head_dim_;
+    q_indptr = rhs.q_indptr;
+    kv_indptr = rhs.kv_indptr;
+    kv_indices = rhs.kv_indices;
+    kv_last_page_len = rhs.kv_last_page_len;
+    qk_indptr = rhs.qk_indptr;
+    custom_mask = rhs.custom_mask;
+    workspace = rhs.workspace;
+    workspace_block = rhs.workspace_block;
+    mem_size_ = rhs.mem_size_;
+    enabled_ = rhs.enabled_;
+    decode_handler_collections = rhs.decode_handler_collections;
+    prompt_handler_collections = rhs.prompt_handler_collections;
+  }
+
+  size_t mem_size() {
+    if (mem_size_ > 0) {
+      return mem_size_;
+    }
+    size_t batch_size = BatchConfig::max_requests_per_batch();
+    size_t max_num_pages =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize;
+    size_t indices_size = std::max(
+        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
+    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
+                              ((BatchConfig::max_spec_tree_token_num() *
+                                (BatchConfig::max_spec_tree_token_num() +
+                                BatchConfig::max_sequence_length()) + 7) / 8);
+    workspace_block = 16 * 1024 * 1024; // 16MB
+
+    mem_size_ = sizeof(int32_t) * indices_size + sizeof(uint8_t) * custom_mask_size + workspace_block * BatchConfig::max_requests_per_batch();
+    return mem_size_;
+  }
+
+  void assign_address(void* ptr, int size) {
+    if (ptr == nullptr) {
+      q_indptr = nullptr;
+      kv_indptr = nullptr;
+      kv_indices = nullptr;
+      kv_last_page_len = nullptr;
+      qk_indptr = nullptr;
+      custom_mask = nullptr;
+      workspace = nullptr;
+      return;
+    }
+    assert(size >= mem_size() && "Insufficient memory size for attention metadata");
+    size_t batch_size = BatchConfig::max_requests_per_batch();
+    size_t max_num_pages =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize;
+    size_t indices_size = std::max(
+        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
+    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
+                              ((BatchConfig::max_spec_tree_token_num() *
+                                (BatchConfig::max_spec_tree_token_num() +
+                                BatchConfig::max_sequence_length()) + 7) / 8);
+
+    q_indptr = static_cast<int32_t*>(ptr);
+    kv_indptr = q_indptr + batch_size + 1;
+    kv_indices = kv_indptr + batch_size + 1;
+    kv_last_page_len = kv_indices + max_num_pages * batch_size;
+    qk_indptr = kv_last_page_len + batch_size + 1;
+    custom_mask = static_cast<uint8_t*>(ptr) + sizeof(int32_t) * indices_size;
+    workspace = static_cast<void*>(static_cast<uint8_t*>(ptr) + sizeof(int32_t) * indices_size + sizeof(uint8_t) * custom_mask_size);
+  }
+
+  void set_num_q_heads(uint32_t const num_q_heads) { num_q_heads_ = num_q_heads; }
+  void set_num_kv_heads(uint32_t const num_kv_heads) { num_kv_heads_ = num_kv_heads; }
+  void set_head_dim(uint32_t const head_dim) { head_dim_ = head_dim; }
+  uint32_t num_q_heads() const { return num_q_heads_; }
+  uint32_t num_kv_heads() const { return num_kv_heads_; }
+  uint32_t head_dim() const { return head_dim_; }
+
+  void set_enabled(bool const enabled) { enabled_ = enabled; }
+  bool enabled() const { return enabled_; }
+
+  uint32_t num_q_heads_;
+  uint32_t num_kv_heads_;
+  uint32_t head_dim_;
+
+  int32_t *q_indptr;
+  int32_t *kv_indptr;
+  int32_t *kv_indices;
+  int32_t *kv_last_page_len;
+  int32_t *qk_indptr;
+  uint8_t *custom_mask;
+  void *workspace;
+  size_t workspace_block;
+
+  size_t mem_size_;
+  
+  // batchsize -> handler
+  bool enabled_;
+  std::unordered_map<int, void*> decode_handler_collections;
+  std::unordered_map<int, void*> prompt_handler_collections;
+};
+
 struct FFHandler {
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   cudnnHandle_t dnn;
@@ -77,11 +198,14 @@ struct FFHandler {
   void *workSpace;
   size_t workSpaceSize;
   void *batch_config_metadata;
+  AttentionMetaData* tree_search_attention_metadata;
+  AttentionMetaData* tree_verify_attention_metadata;
 
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
       sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
-      sizeof(BatchConfig::committed_tokens);
+      sizeof(BatchConfig::committed_tokens) + sizeof(int);
+
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
   DataType quantization_type;
diff --git a/include/flexflow/ops/arg_topk.h.backup b/include/flexflow/ops/arg_topk.h.backup
deleted file mode 100644
index 935aa9ff9..000000000
--- a/include/flexflow/ops/arg_topk.h.backup
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef _FLEXFLOW_ARG_TOPK_H_
-#define _FLEXFLOW_ARG_TOPK_H_
-
-#include "flexflow/inference.h"
-#include "flexflow/model.h"
-#include "flexflow/node.h"
-#include "flexflow/ops/arg_topk_params.h"
-
-namespace FlexFlow {
-
-class ArgTopKMeta : public OpMeta {
-public:
-  ArgTopKMeta(FFHandler handle, Op const *op);
-  bool sorted;
-  int k;
-  bool speculative_decoding;
-};
-
-class ArgTopK : public Op {
-public:
-  using Params = ArgTopKParams;
-  using Input = ParallelTensor;
-  ArgTopK(FFModel &model,
-          LayerID const &layer_guid,
-          ParallelTensor const input,
-          int k,
-          bool sorted,
-          bool speculative_decoding,
-          char const *name);
-  ArgTopK(FFModel &model,
-          LayerID const &layer_guid,
-          ArgTopK const &other,
-          ParallelTensor const input);
-  ArgTopK(FFModel &model,
-          Params const &params,
-          Input const input,
-          char const *name = nullptr);
-  void init(FFModel const &) override;
-  void init_inference(FFModel const &,
-                      std::vector<ParallelTensor> const &,
-                      std::vector<ParallelTensor> const &,
-                      MachineView const *mv = nullptr) override;
-  void forward(FFModel const &) override;
-  void backward(FFModel const &) override;
-  Legion::FutureMap inference(FFModel const &,
-                              BatchConfigFuture const &,
-                              std::vector<ParallelTensor> const &,
-                              std::vector<ParallelTensor> const &,
-                              MachineView const *mv = nullptr) override;
-  void print_layer(FFModel const &model) override {
-    assert(0);
-  }
-  static Op *
-      create_operator_from_layer(FFModel &model,
-                                 Layer const *layer,
-                                 std::vector<ParallelTensor> const &inputs);
-
-  static OpMeta *init_task(Legion::Task const *task,
-                           std::vector<Legion::PhysicalRegion> const &regions,
-                           Legion::Context ctx,
-                           Legion::Runtime *runtime);
-  static InferenceResult
-      inference_task(Legion::Task const *task,
-                     std::vector<Legion::PhysicalRegion> const &regions,
-                     Legion::Context ctx,
-                     Legion::Runtime *runtime);
-  static InferenceResult inference_speculative_task(
-      Legion::Task const *task,
-      std::vector<Legion::PhysicalRegion> const &regions,
-      Legion::Context ctx,
-      Legion::Runtime *runtime);
-  void serialize(Legion::Serializer &s) const override;
-  static PCG::Node deserialize(FFModel &ff,
-                               Legion::Deserializer &d,
-                               ParallelTensor inputs[],
-                               int num_inputs);
-  Op *materialize(FFModel &ff,
-                  ParallelTensor inputs[],
-                  int num_inputs) const override;
-  bool measure_operator_cost(Simulator *sim,
-                             MachineView const &pc,
-                             CostMetrics &cost_metrics) const override;
-  template <typename DT>
-  static void forward_kernel(ArgTopKMeta const *m,
-                             DT const *input_ptr,
-                             float *output_ptr,
-                             int *indices_ptr,
-                             size_t batch_size,
-                             int length,
-                             int k,
-                             bool sorted,
-                             BatchConfig const *bc,
-                             ffStream_t stream);
-  static void forward_kernel_wrapper(ArgTopKMeta const *m,
-                                     GenericTensorAccessorR const &input,
-                                     GenericTensorAccessorW const &prob,
-                                     GenericTensorAccessorW const &indices,
-                                     int batch_size,
-                                     BatchConfig const *bc);
-  Params get_params() const;
-
-public:
-  int k;
-  bool sorted;
-  bool speculative_decoding;
-};
-
-}; // namespace FlexFlow
-
-#endif
diff --git a/include/flexflow/ops/graph_params.h b/include/flexflow/ops/graph_params.h
index 55964dc30..72bdf8369 100644
--- a/include/flexflow/ops/graph_params.h
+++ b/include/flexflow/ops/graph_params.h
@@ -14,7 +14,7 @@ namespace FlexFlow {
       : num_active_requests(num_active_requests), num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {}
 
     void Print() const {
-      printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, prompt_phase: %d\n \n", num_active_requests, num_active_tokens, prompt_phase);
+      printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, prompt_phase: %d\n\n", num_active_requests, num_active_tokens, prompt_phase);
     }
   };
 
diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 8bf34ca7c..6dd43e333 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -18,10 +18,6 @@
 #include <hip/hip_complex.h>
 #endif
 
-// kPagesize also defined in tree_inc_multihead_self_attention_impl.cu
-// for template instantiation
-constexpr uint32_t kPagesize = 64;
-
 namespace FlexFlow {
 
 class IncMultiHeadSelfAttentionMeta;
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
index fbe0c5547..481243867 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh
@@ -523,8 +523,5 @@ struct threads_per_value_t {
 #define test_bit(bit_mask, idx, pos)                                           \
   (((bit_mask)[idx][(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
-#define test_bit_orig(bit_mask, idx, pos)                                           \
-  (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
-
 } // namespace FlexFlow
 #endif // _FLEXFLOW_OPS_KERNELS_INC_MULTIHEAD_SELF_UTILS_H
\ No newline at end of file
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index f3e5a23ea..fdf0a8729 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -108,7 +108,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                              CostMetrics &cost_metrics) const override;
 
   static void
-      inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta const *m,
+      inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta *m,
                                BatchConfig const *bc,
                                int shard_id,
                                GenericTensorAccessorR const &input,
@@ -137,10 +137,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
                                     int _num_q_heads,
                                     int _num_kv_heads);
   ~SpecIncMultiHeadSelfAttentionMeta(void);
-
-public:
-  Realm::RegionInstance tree_search_reserve_inst;
-  BatchConfig::BitMask *causalMask;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 3f3568803..4cfec8b7a 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -146,18 +146,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
 public:
   int num_active_tokens;
   BatchConfig::CommittedTokensInfo *committed_token_infos;
-  BatchConfig::BitMask *causalMask;
-  // For flashinfer attention
-  Realm::RegionInstance flashinfer_reserve_inst;
-  int32_t *q_indptr;
-  int32_t *kv_indptr;
-  int32_t *kv_indices;
-  int32_t *kv_last_page_len;
-  int32_t *qk_indptr;
-  float *custom_mask;
-  size_t workspace_size;
-  void *workspace;
-  void *batch_prefill_handler;
+  int *num_tokens_to_commit;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup b/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
deleted file mode 100644
index 45a7a6b56..000000000
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h.backup
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
-#define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
-
-#include "flexflow/accessor.h"
-#include "flexflow/device.h"
-#include "flexflow/fftype.h"
-#include "flexflow/inference.h"
-#include "flexflow/layer.h"
-#include "flexflow/node.h"
-#include "flexflow/op_meta.h"
-#include "flexflow/operator.h"
-#include "flexflow/ops/inc_multihead_self_attention.h"
-#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
-#include "math.h"
-#include <cfloat>
-#include <complex>
-
-namespace FlexFlow {
-
-class TreeIncMultiHeadSelfAttentionMeta;
-
-class TreeIncMultiHeadSelfAttention : public Op {
-public:
-  using Params = TreeIncMultiHeadSelfAttentionParams;
-  using Input = ParallelTensor;
-
-  TreeIncMultiHeadSelfAttention(FFModel &model,
-                                LayerID const &layer_guid,
-                                ParallelTensor const _input,
-                                int _embed_dim,
-                                int _num_q_heads,
-                                int _num_kv_heads,
-                                int _kdim,
-                                int _vdim,
-                                float _dropout,
-                                bool _qkv_bias,
-                                bool _final_bias,
-                                bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
-                                bool _scaling_query,
-                                float _scaling_factor,
-                                bool _qk_prod_scaling,
-                                bool _position_bias,
-                                bool allocate_weights,
-                                DataType _quantization_type,
-                                bool _offload,
-                                int _tensor_parallelism_degree,
-                                char const *name);
-  TreeIncMultiHeadSelfAttention(FFModel &model,
-                                ParallelTensor const _input,
-                                ParallelTensor const _weight,
-                                int _embed_dim,
-                                int _num_q_heads,
-                                int _num_kv_heads,
-                                int _kdim,
-                                int _vdim,
-                                float _dropout,
-                                bool _qkv_bias,
-                                bool _final_bias,
-                                bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
-                                bool _scaling_query,
-                                float _scaling_factor,
-                                bool _qk_prod_scaling,
-                                bool _position_bias,
-                                bool allocate_weights,
-                                DataType _quantization_type,
-                                bool _offload,
-                                int _tensor_parallelism_degree,
-                                char const *name);
-  TreeIncMultiHeadSelfAttention(FFModel &model,
-                                TreeIncMultiHeadSelfAttention const &other,
-                                ParallelTensor const input,
-                                bool allocate_weights);
-  TreeIncMultiHeadSelfAttention(FFModel &model,
-                                Params const &params,
-                                Input const &inputs,
-                                bool allocate_weights = false,
-                                char const *name = nullptr);
-  static Op *
-      create_operator_from_layer(FFModel &model,
-                                 Layer const *layer,
-                                 std::vector<ParallelTensor> const &inputs);
-  void init(FFModel const &) override;
-  void init_inference(FFModel const &,
-                      std::vector<ParallelTensor> const &,
-                      std::vector<ParallelTensor> const &,
-                      MachineView const *mv = nullptr) override;
-  void forward(FFModel const &) override;
-  void backward(FFModel const &) override;
-  Legion::FutureMap inference(FFModel const &,
-                              BatchConfigFuture const &,
-                              std::vector<ParallelTensor> const &,
-                              std::vector<ParallelTensor> const &,
-                              MachineView const *mv = nullptr) override;
-  void print_layer(FFModel const &model) override {
-    assert(0);
-  }
-  bool get_int_parameter(PMParameter, int *) const override;
-
-  static OpMeta *init_task(Legion::Task const *task,
-                           std::vector<Legion::PhysicalRegion> const &regions,
-                           Legion::Context ctx,
-                           Legion::Runtime *runtime);
-  static void inference_task(Legion::Task const *task,
-                             std::vector<Legion::PhysicalRegion> const &regions,
-                             Legion::Context ctx,
-                             Legion::Runtime *runtime);
-  bool measure_operator_cost(Simulator *sim,
-                             MachineView const &mv,
-                             CostMetrics &cost_metrics) const override;
-
-  static void inference_kernel_wrapper(TreeIncMultiHeadSelfAttentionMeta *m,
-                                       BatchConfig const *bc,
-                                       int shard_id,
-                                       GenericTensorAccessorR const &input,
-                                       GenericTensorAccessorR const &weight,
-                                       GenericTensorAccessorW const &output,
-                                       GenericTensorAccessorR const &bias);
-
-  Params get_params() const;
-
-public:
-  int num_q_heads, num_kv_heads, tensor_parallelism_degree;
-  float dropout, scaling_factor;
-  bool qkv_bias;
-  bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
-      qk_prod_scaling, position_bias;
-  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
-  int qoSeqLength, kvSeqLength;
-  DataType quantization_type;
-  bool offload;
-};
-
-class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
-public:
-  TreeIncMultiHeadSelfAttentionMeta(FFHandler handler,
-                                    TreeIncMultiHeadSelfAttention const *attn,
-                                    GenericTensorAccessorR const &weight,
-                                    MemoryAllocator &gpu_mem_allocator,
-                                    int num_samples,
-                                    int _num_q_heads,
-                                    int _num_kv_heads);
-  ~TreeIncMultiHeadSelfAttentionMeta(void);
-
-public:
-  int num_active_tokens;
-  Realm::RegionInstance committed_token_reserve_inst;
-  BatchConfig::CommittedTokensInfo *committed_token_infos;
-  BatchConfig::BitMask *causalMask;
-};
-
-}; // namespace FlexFlow
-
-#endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_H
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 93cc8ddcb..81e7c0f9b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -383,6 +383,7 @@ class RequestManager {
     // Times for each SSM speculation phase (in ms)
     std::vector<double> ssm_step_times;
     // Number of requests getting decoded at each step
+    std::vector<int> ssm_steps;
     std::vector<int> requests_per_step;
     // Number of generated tokens at each step
     std::vector<int> generated_tokens_per_step;
diff --git a/src/ops/arg_topk.cc.backup b/src/ops/arg_topk.cc.backup
deleted file mode 100644
index 706fbbc7a..000000000
--- a/src/ops/arg_topk.cc.backup
+++ /dev/null
@@ -1,510 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/ops/arg_topk.h"
-#include "flexflow/model.h"
-#include "flexflow/utils/hash_utils.h"
-#include "legion/legion_utilities.h"
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-#include "flexflow/utils/cuda_helper.h"
-#else
-#include "flexflow/utils/hip_helper.h"
-#endif
-
-namespace FlexFlow {
-// declare Legion names
-using Legion::ArgumentMap;
-using Legion::Context;
-using Legion::coord_t;
-using Legion::Domain;
-using Legion::Future;
-using Legion::FutureMap;
-using Legion::IndexLauncher;
-using Legion::InlineLauncher;
-using Legion::Machine;
-using Legion::Memory;
-using Legion::PhysicalRegion;
-using Legion::Predicate;
-using Legion::Rect;
-using Legion::RegionRequirement;
-using Legion::Runtime;
-using Legion::Task;
-using Legion::TaskArgument;
-using Legion::TaskLauncher;
-using PCG::Node;
-
-// For an input tensor, computes the top k entries in each row
-// (resp. vector along the last dimension). Thus,
-// values.shape = indices.shape = input.shape[:-1] + [k]
-Tensor FFModel::arg_top_k(Tensor const input,
-                          int k,
-                          bool sorted,
-                          bool speculative_decoding,
-                          char const *name) {
-  Layer *li = new Layer(this,
-                        OP_ARG_TOPK,
-                        input->data_type,
-                        name,
-                        1 /*inputs*/,
-                        0 /*weights*/,
-                        speculative_decoding ? 2 : 1 /*outputs*/,
-                        input);
-  {
-    int numdims = input->num_dims;
-    int dims[MAX_TENSOR_DIM];
-    for (int i = 0; i < numdims; i++) {
-      dims[i] = input->dims[i];
-    }
-    dims[0] = k;
-    // li->outputs[0] = create_tensor_legion_ordering(
-    //     numdims, dims, input->data_type, li, 0, true /*create_grad*/);
-    li->outputs[0] = create_tensor_legion_ordering(
-        numdims, dims, DT_INT32, li, 0, false /*create_grad*/);
-    if (speculative_decoding) {
-      li->outputs[1] = create_tensor_legion_ordering(
-          numdims, dims, DT_FLOAT, li, 1, false /*create_grad*/);
-    }
-  }
-  li->add_int_property("k", k);
-  li->add_int_property("sorted", sorted);
-  li->add_int_property("speculative_decoding", speculative_decoding);
-  layers.push_back(li);
-  // outputs[0] = li->outputs[0];
-  // outputs[1] = li->outputs[1];
-  return li->outputs[0];
-}
-
-Op *ArgTopK::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector<ParallelTensor> const &inputs) {
-  long long value;
-  layer->get_int_property("k", value);
-  int k = value;
-  layer->get_int_property("sorted", value);
-  bool sorted = (bool)value;
-  layer->get_int_property("speculative_decoding", value);
-  bool speculative_decoding = (bool)value;
-
-  return new ArgTopK(model,
-                     layer->layer_guid,
-                     inputs[0],
-                     k,
-                     sorted,
-                     speculative_decoding,
-                     layer->name);
-}
-
-ArgTopKParams ArgTopK::get_params() const {
-  ArgTopKParams params;
-  params.k = this->k;
-  params.sorted = this->sorted;
-  params.speculative_decoding = this->speculative_decoding;
-  if (this->name != nullptr) {
-    strcpy(params.name, this->name);
-  }
-  return params;
-}
-
-bool ArgTopKParams::is_valid(ParallelTensorShape const &) const {
-  // topk is always valid
-  return true;
-}
-
-bool operator==(ArgTopKParams const &lhs, ArgTopKParams const &rhs) {
-  return lhs.k == rhs.k && lhs.sorted == rhs.sorted &&
-         lhs.speculative_decoding == rhs.speculative_decoding;
-}
-
-ArgTopK::ArgTopK(FFModel &model,
-                 LayerID const &_layer_guid,
-                 ParallelTensor const _input,
-                 int _k,
-                 bool _sorted,
-                 bool _speculative_decoding,
-                 char const *name)
-    : Op(model,
-         OP_ARG_TOPK,
-         _input->data_type,
-         name,
-         1 /*inputs*/,
-         0 /*weights*/,
-         _speculative_decoding ? 2 : 1 /*outputs*/,
-         _input),
-      k(_k), sorted(_sorted), speculative_decoding(_speculative_decoding) {
-  // overwrite layer_guid
-  layer_guid = _layer_guid;
-  int numdim = inputs[0]->num_dims;
-  ParallelDim dims[MAX_TENSOR_DIM];
-  for (int i = 0; i < numdim; i++) {
-    dims[i] = inputs[0]->dims[i];
-  }
-
-  dims[0].size = k;
-  assert(inputs[0]->dims[0].degree == 1);
-  assert(inputs[0]->dims[0].parallel_idx == -1);
-
-  outputs[0] = model.create_parallel_tensor_legion_ordering(
-      numdim, dims, DT_INT32, this, 0 /*owner_idx*/);
-  if (_speculative_decoding) {
-    outputs[1] = model.create_parallel_tensor_legion_ordering(
-        numdim, dims, DT_FLOAT, this, 1 /*owner_idx*/);
-  }
-}
-
-ArgTopK::ArgTopK(FFModel &model,
-                 LayerID const &layer_guid,
-                 ArgTopK const &other,
-                 ParallelTensor const input)
-    : ArgTopK(model,
-              layer_guid,
-              input,
-              other.k,
-              other.sorted,
-              other.speculative_decoding,
-              other.name) {}
-
-ArgTopK::ArgTopK(FFModel &model,
-                 ArgTopKParams const &params,
-                 ParallelTensor const input,
-                 char const *name)
-    : ArgTopK(model,
-              params.layer_guid,
-              input,
-              params.k,
-              params.sorted,
-              params.speculative_decoding,
-              params.name) {}
-
-void ArgTopK::init_inference(FFModel const &ff,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = batch_outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
-  size_t machine_view_hash = view->hash();
-  set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]);
-  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(ArgTopK)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         machine_view_hash);
-  launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  //   launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
-  //                                                     0 /*projection id*/,
-  //                                                     WRITE_ONLY,
-  //                                                     EXCLUSIVE,
-  //                                                     batch_outputs[1]->region));
-  //   launcher.add_field(2, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
-}
-
-void ArgTopK::init(FFModel const &ff) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(ARG_TOPK_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(ArgTopK)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  //   launcher.add_region_requirement(RegionRequirement(outputs[1]->part,
-  //                                                     0 /*projection id*/,
-  //                                                     WRITE_ONLY,
-  //                                                     EXCLUSIVE,
-  //                                                     outputs[1]->region));
-  //   launcher.add_field(2, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap(ff, fm);
-}
-
-OpMeta *ArgTopK::init_task(Task const *task,
-                           std::vector<PhysicalRegion> const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
-  ArgTopK *topk = (ArgTopK *)task->args;
-  FFHandler handle = *((FFHandler *)task->local_args);
-  ArgTopKMeta *m = new ArgTopKMeta(handle, topk);
-  m->profiling = topk->profiling;
-  m->inference_debugging = topk->inference_debugging;
-  m->sorted = topk->sorted;
-  m->k = topk->k;
-  std::strcpy(m->op_name, topk->name);
-  m->layer_guid = topk->layer_guid;
-  m->speculative_decoding = topk->speculative_decoding;
-  return m;
-}
-
-void ArgTopK::forward(FFModel const &ff) {
-  // ArgTopK does not support forward
-  assert(false);
-}
-
-FutureMap ArgTopK::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  parallel_is = batch_outputs[0]->parallel_is;
-  MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view;
-  set_argumentmap_for_inference(ff, argmap, batch_outputs[0]);
-  size_t machine_view_hash = view->hash();
-  /* std::cout << "ArgTopK op machine_view: " << *(MachineView const *)mv
-            << std::endl; */
-  if (speculative_decoding) {
-    IndexLauncher launcher(ARG_TOPK_INF_SPECULATIVE_TASK_ID,
-                           parallel_is,
-                           TaskArgument(nullptr, 0),
-                           argmap,
-                           Predicate::TRUE_PRED,
-                           false /*must*/,
-                           0 /*mapper_id*/,
-                           machine_view_hash);
-    launcher.add_future(bc);
-    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      batch_inputs[0]->region));
-    launcher.add_field(0, FID_DATA);
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[0]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[0]->region));
-    launcher.add_field(1, FID_DATA);
-
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[1]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[1]->region));
-    launcher.add_field(2, FID_DATA);
-    return runtime->execute_index_space(ctx, launcher);
-
-  } else {
-    IndexLauncher launcher(ARG_TOPK_INF_TASK_ID,
-                           parallel_is,
-                           TaskArgument(nullptr, 0),
-                           argmap,
-                           Predicate::TRUE_PRED,
-                           false /*must*/,
-                           0 /*mapper_id*/,
-                           machine_view_hash);
-    launcher.add_future(bc);
-    launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      batch_inputs[0]->region));
-    launcher.add_field(0, FID_DATA);
-    launcher.add_region_requirement(
-        RegionRequirement(batch_outputs[0]->part,
-                          0 /*projection id*/,
-                          WRITE_ONLY,
-                          EXCLUSIVE,
-                          batch_outputs[0]->region));
-    launcher.add_field(1, FID_DATA);
-    return runtime->execute_index_space(ctx, launcher);
-  }
-}
-
-InferenceResult
-    ArgTopK::inference_task(Task const *task,
-                            std::vector<PhysicalRegion> const &regions,
-                            Context ctx,
-                            Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  // const ArgTopK* topk = (const ArgTopK*) task->args;
-  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
-  if (bc->num_tokens == 0) {
-    // Directly return for empty batch config
-    InferenceResult ir;
-    return ir;
-  }
-  ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
-
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
-      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW probs;
-
-  int batch_size = bc->num_active_tokens();
-  ArgTopK::forward_kernel_wrapper(
-      m, input, probs, indices, batch_size, nullptr);
-
-  if (m->inference_debugging) {
-    assert(task->index_point.get_dim() == 1);
-    int shard_id = task->index_point.point_data[0];
-    ArgTopK::save_inference_tensors_to_file(
-        m, shard_id, bc, {input}, {}, {indices});
-  }
-
-  InferenceResult ir;
-  download_tensor<BatchConfig::TokenId>(
-      indices.get_int32_ptr(), ir.token_ids, batch_size);
-  return ir;
-}
-
-InferenceResult ArgTopK::inference_speculative_task(
-    Task const *task,
-    std::vector<PhysicalRegion> const &regions,
-    Context ctx,
-    Runtime *runtime) {
-  assert(regions.size() == 3);
-  assert(task->regions.size() == 3);
-  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
-  if (bc.num_active_tokens() == 0) {
-    // Directly return for empty batch config
-    InferenceResult ir;
-    return ir;
-  }
-  ArgTopKMeta *m = *((ArgTopKMeta **)task->local_args);
-
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO(
-      DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
-
-  int batch_size = bc.num_active_tokens();
-  ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
-
-  InferenceResult ir;
-  download_tensor<BatchConfig::TokenId>(
-      indices.get_int32_ptr(), ir.token_ids, batch_size * m->k);
-  download_tensor<float>(probs.get_float_ptr(), ir.probs, batch_size * m->k);
-  return ir;
-}
-
-void ArgTopK::backward(FFModel const &ff) {
-  // ArgTopK does not support backward
-  assert(false);
-}
-
-void ArgTopK::serialize(Legion::Serializer &sez) const {
-  sez.serialize(this->layer_guid.id);
-  sez.serialize(this->layer_guid.transformer_layer_id);
-  sez.serialize(this->layer_guid.model_id);
-  sez.serialize(this->k);
-  sez.serialize(this->sorted);
-  sez.serialize(this->speculative_decoding);
-  sez.serialize(strlen(this->name));
-  sez.serialize(this->name, strlen(this->name));
-}
-
-Node ArgTopK::deserialize(FFModel &ff,
-                          Legion::Deserializer &dez,
-                          ParallelTensor inputs[],
-                          int num_inputs) {
-  assert(num_inputs == 1);
-  size_t id, transformer_layer_id, deserialized_model_id;
-  dez.deserialize(id);
-  dez.deserialize(transformer_layer_id);
-  dez.deserialize(deserialized_model_id);
-  LayerID layer_guid(id, transformer_layer_id, deserialized_model_id);
-  int k;
-  bool sorted;
-  bool speculative_decoding;
-  dez.deserialize(k);
-  dez.deserialize(sorted);
-  dez.deserialize(speculative_decoding);
-  size_t name_len;
-  char name[MAX_OPNAME] = {0};
-  dez.deserialize(name_len);
-  dez.deserialize(name, name_len);
-  ArgTopKParams params;
-  params.layer_guid = layer_guid;
-  params.k = k;
-  params.sorted = sorted;
-  params.speculative_decoding = speculative_decoding;
-  strcpy(params.name, name);
-  return ff.get_or_create_node<ArgTopK>(inputs[0], params);
-}
-
-Op *ArgTopK::materialize(FFModel &ff,
-                         ParallelTensor inputs[],
-                         int num_inputs) const {
-  ArgTopKParams params = get_params();
-  return new ArgTopK(ff, params, inputs[0], this->name);
-}
-
-bool ArgTopK::measure_operator_cost(Simulator *sim,
-                                    MachineView const &mv,
-                                    CostMetrics &cost_metrics) const {
-  return false;
-}
-
-}; // namespace FlexFlow
-
-namespace std {
-size_t hash<FlexFlow::ArgTopKParams>::operator()(
-    FlexFlow::ArgTopKParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.layer_guid.id);
-  hash_combine(key, params.k);
-  hash_combine(key, params.sorted);
-  hash_combine(key, params.speculative_decoding);
-  return key;
-}
-}; // namespace std
diff --git a/src/ops/arg_topk.cu.backup b/src/ops/arg_topk.cu.backup
deleted file mode 100644
index 491b255be..000000000
--- a/src/ops/arg_topk.cu.backup
+++ /dev/null
@@ -1,525 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "flexflow/ops/arg_topk.h"
-#include "flexflow/utils/cuda_helper.h"
-
-namespace FlexFlow {
-// declare Legion names
-using Legion::coord_t;
-
-enum class HeapType { kMinHeap, kMaxHeap };
-enum class PreferIndices { kLower, kHigher };
-
-template <typename T>
-struct Entry {
-  int index;
-  T value;
-};
-
-template <typename T>
-struct LinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return data[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-};
-
-template <typename T>
-struct IndirectLinearData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index];
-  }
-
-  __device__ int get_index(int i) const {
-    return backing_data[data[i].index].index;
-  }
-  __device__ T get_value(int i) const {
-    return data[i].value;
-  }
-
-  Entry *const data;
-  Entry *const backing_data;
-};
-
-template <typename T>
-struct StridedData {
-  typedef Entry<T> Entry;
-
-  __device__ Entry &operator[](std::size_t index) const {
-    return data[index * blockDim.x + threadIdx.x];
-  }
-
-  __device__ int get_index(int i) const {
-    return (*this)[i].index;
-  }
-  __device__ T get_value(int i) const {
-    return (*this)[i].value;
-  }
-
-  Entry *const data;
-};
-
-// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-struct IndexedHeap {
-  typedef typename Data<T>::Entry Entry;
-  Data<T> const data;
-  __device__ IndexedHeap(Data<T> const &d) : data(d) {}
-
-  __device__ bool is_above(int left, int right) {
-    T left_value = data.get_value(left);
-    T right_value = data.get_value(right);
-    if (left_value == right_value) {
-      if (preferIndices == PreferIndices::kLower) {
-        return data.get_index(left) < data.get_index(right);
-      } else {
-        return data.get_index(left) > data.get_index(right);
-      }
-    }
-    if (heapType == HeapType::kMinHeap) {
-      return left_value < right_value;
-    } else {
-      return left_value > right_value;
-    }
-  }
-
-  __device__ void assign(int i, Entry const &entry) {
-    data[i] = entry;
-  }
-
-  __device__ void push_up(int i) {
-    int child = i;
-    int parent;
-    for (; child > 0; child = parent) {
-      parent = (child - 1) / 2;
-      if (!is_above(child, parent)) {
-        // Heap property satisfied.
-        break;
-      }
-      swap(child, parent);
-    }
-  }
-
-  __device__ void swap(int a, int b) {
-    auto tmp = data[b];
-    data[b] = data[a];
-    data[a] = tmp;
-  }
-
-  __device__ void push_root_down(int k) {
-    push_down(0, k);
-  }
-
-  // MAX-HEAPIFY in Cormen
-  __device__ void push_down(int node, int k) {
-    while (true) {
-      int const left = 2 * node + 1;
-      int const right = left + 1;
-      int smallest = node;
-      if (left < k && is_above(left, smallest)) {
-        smallest = left;
-      }
-      if (right < k && is_above(right, smallest)) {
-        smallest = right;
-      }
-      if (smallest == node) {
-        break;
-      }
-      swap(smallest, node);
-      node = smallest;
-    }
-  }
-
-  // BUILD-MAX-HEAPIFY in Cormen
-  __device__ void build(int k) {
-    for (int node = (k - 1) / 2; node >= 0; node--) {
-      push_down(node, k);
-    }
-  }
-
-  // HEAP-EXTRACT-MAX in Cormen
-  __device__ void remove_root(int k) {
-    data[0] = data[k - 1];
-    push_root_down(k - 1);
-  }
-
-  // in-place HEAPSORT in Cormen
-  // This method destroys the heap property.
-  __device__ void sort(int k) {
-    for (int slot = k - 1; slot > 0; slot--) {
-      // This is like remove_root but we insert the element at the end.
-      swap(slot, 0);
-      // Heap is now an element smaller.
-      push_root_down(/*k=*/slot);
-    }
-  }
-
-  __device__ void replace_root(Entry const &entry, int k) {
-    data[0] = entry;
-    push_root_down(k);
-  }
-
-  __device__ Entry const &root() {
-    return data[0];
-  }
-};
-
-template <HeapType heapType,
-          PreferIndices preferIndices,
-          template <typename>
-          class Data,
-          typename T>
-__device__ IndexedHeap<heapType, preferIndices, Data, T>
-    make_indexed_heap(typename Data<T>::Entry *data) {
-  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
-}
-
-// heapArgTopK walks over [input, input+length) with `step_size` stride starting
-// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
-// using `Accessor` to access elements in `heap_entries`. If sorted=true, the
-// elements will be sorted at the end.
-template <typename T, template <typename> class Data = LinearData>
-__device__ void heapArgTopK(T const *__restrict__ input,
-                            int length,
-                            int k,
-                            Entry<T> *__restrict__ heap_entries,
-                            bool sorted = false,
-                            int start_index = 0,
-                            int step_size = 1) {
-  assert(k <= length);
-
-  auto heap =
-      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
-          heap_entries);
-
-  int heap_end_index = start_index + k * step_size;
-  if (heap_end_index > length) {
-    heap_end_index = length;
-  }
-  // Initialize the min-heap.
-  for (int index = start_index, slot = 0; index < heap_end_index;
-       index += step_size, slot++) {
-    heap.assign(slot, {index, input[index]});
-  }
-
-  heap.build(k);
-
-  // Now iterate over the remaining items.
-  // If an item is smaller than the min element, it is not amongst the top k.
-  // Otherwise, replace the min element with it and push upwards.
-  for (int index = heap_end_index; index < length; index += step_size) {
-    // We prefer elements with lower indices. This is given here.
-    // Later elements automatically have higher indices, so can be discarded.
-    if (input[index] > heap.root().value) {
-      // This element should replace the min.
-      heap.replace_root({index, input[index]}, k);
-    }
-  }
-
-  // Sort if wanted.
-  if (sorted) {
-    heap.sort(k);
-  }
-}
-
-// mergeShards performs a top-k merge on `num_shards` many sorted streams that
-// are sorted and stored in `entries` in a strided way:
-// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
-// The overall top k elements are written to `top_k_values` and their indices
-// to top_k_indices.
-// `top_k_heap` is used as temporary storage for the merge heap.
-template <typename T>
-__device__ void mergeShards(int num_shards,
-                            int k,
-                            Entry<T> *__restrict__ entries,
-                            Entry<T> *__restrict__ top_k_heap,
-                            float *top_k_values,
-                            int *top_k_indices,
-                            bool speculative_decoding) {
-  // If k < num_shards, we can use a min-heap with k elements to get the top k
-  // of the sorted blocks.
-  // If k > num_shards, we can initialize a min-heap with the top element from
-  // each sorted block.
-  int const heap_size = k < num_shards ? k : num_shards;
-
-  // Min-heap part.
-  {
-    auto min_heap = IndexedHeap<HeapType::kMinHeap,
-                                PreferIndices::kHigher,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Initialize the heap as a min-heap.
-    for (int slot = 0; slot < heap_size; slot++) {
-      min_heap.assign(slot, {slot, entries[slot].value});
-    }
-    min_heap.build(heap_size);
-
-    // Now perform top k with the remaining shards (if num_shards > heap_size).
-    for (int shard = heap_size; shard < num_shards; shard++) {
-      auto const entry = entries[shard];
-      auto const root = min_heap.root();
-      if (entry.value < root.value) {
-        continue;
-      }
-      if (entry.value == root.value &&
-          entry.index > entries[root.index].index) {
-        continue;
-      }
-      // This element should replace the min.
-      min_heap.replace_root({shard, entry.value}, heap_size);
-    }
-  }
-
-  // Max-part.
-  {
-    // Turn the min-heap into a max-heap in-place.
-    auto max_heap = IndexedHeap<HeapType::kMaxHeap,
-                                PreferIndices::kLower,
-                                IndirectLinearData,
-                                T>{IndirectLinearData<T>{top_k_heap, entries}};
-    // Heapify into a max heap.
-    max_heap.build(heap_size);
-
-    // Now extract the minimum k-1 times.
-    // k is treated specially.
-    int const last_k = k - 1;
-    for (int rank = 0; rank < last_k; rank++) {
-      Entry<T> const &max_element = max_heap.root();
-      if (speculative_decoding) {
-        assert(top_k_values != nullptr);
-        top_k_values[rank] = static_cast<float>(max_element.value);
-      }
-
-      int shard_index = max_element.index;
-      top_k_indices[rank] = entries[shard_index].index;
-      int next_shard_index = shard_index + num_shards;
-      // For rank < k-1, each top k heap still contains at least 1 element,
-      // so we can draw a replacement.
-      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
-                            heap_size);
-    }
-
-    // rank == last_k.
-    Entry<T> const &max_element = max_heap.root();
-    // top_k_values[last_k] = max_element.value;
-    int shard_index = max_element.index;
-    top_k_indices[last_k] = entries[shard_index].index;
-    top_k_values[last_k] = static_cast<float>(max_element.value);
-  }
-}
-
-template <typename T>
-__global__ void arg_topk_forward_kernel(T const *__restrict__ input,
-                                        size_t shared_memory_size,
-                                        int length,
-                                        int k,
-                                        bool sorted,
-                                        float *__restrict__ output,
-                                        int *__restrict__ indices,
-                                        bool speculative_decoding) {
-  __shared__ char shared_memory[48 << 10];
-  int const batch_index = blockIdx.x;
-  T const *batch_input = input + batch_index * length;
-  int const thread_index = threadIdx.x;
-  int const thread_count = blockDim.x;
-  Entry<T> *shared_entries = (Entry<T> *)shared_memory;
-  heapArgTopK<T, StridedData>(
-      batch_input, length, k, shared_entries, true, thread_index, thread_count);
-  __syncthreads();
-  if (thread_index == 0) {
-    int const offset = batch_index * k;
-    auto batch_output = output + offset;
-    auto batch_indices = indices + offset;
-    Entry<T> *top_k_heap = shared_entries + thread_count * k;
-    mergeShards(thread_count,
-                k,
-                shared_entries,
-                top_k_heap,
-                batch_output,
-                batch_indices,
-                speculative_decoding);
-  }
-}
-
-/*static*/
-template <typename DT>
-void ArgTopK::forward_kernel(
-    ArgTopKMeta const *m,
-    DT const *input_ptr,
-    float *output_ptr,
-    int *indices_ptr,
-    size_t batch_size,
-    int length,
-    int k,
-    bool sorted,
-    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
-    cudaStream_t stream) {
-  // Adopted from TensorFlow's ArgTopK implementation
-  // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
-  int num_shards = 0;
-  {
-    constexpr auto shared_memory_size = 48 << 10;
-    auto const heap_size = k * sizeof(Entry<DT>);
-    // shared_memory_size = (num_shards + 1) * heap_size <=>
-    num_shards = shared_memory_size / heap_size - 1;
-    assert(num_shards > 0);
-    if (num_shards > CUDA_NUM_THREADS) {
-      num_shards = CUDA_NUM_THREADS;
-    }
-  }
-  // We are limited by the amount of shared memory we have per block.
-  size_t shared_memory_size = (num_shards + 1) * k * sizeof(Entry<DT>);
-  // size_t num_blocks = (batch_size + num_shards - 1) / num_shards;
-  size_t num_blocks = batch_size;
-
-  // all requests share the same number of branches
-  if (m->speculative_decoding) {
-    assert(bc->num_active_requests() >= 0);
-    assert(num_shards >= (size_t)BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
-    num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
-        input_ptr,
-        shared_memory_size,
-        length,
-        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES,
-        sorted,
-        output_ptr,
-        indices_ptr,
-        m->speculative_decoding);
-  } else {
-
-    assert(num_shards >= (size_t)k);
-    num_shards = k;
-    arg_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
-        input_ptr,
-        shared_memory_size,
-        length,
-        k,
-        sorted,
-        nullptr,
-        indices_ptr,
-        false);
-  }
-}
-
-/*static*/
-void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
-                                     GenericTensorAccessorR const &input,
-                                     // float *output_ptr,
-                                     GenericTensorAccessorW const &probs,
-                                     GenericTensorAccessorW const &indices,
-                                     int batch_size,
-                                     BatchConfig const *bc) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-
-  // Domain in1_domain = runtime->get_index_space_domain(
-  //     ctx, task->regions[0].region.get_index_space());
-  //   Domain out1_domain = runtime->get_index_space_domain(
-  //       ctx, task->regions[1].region.get_index_space());
-  // Domain out2_domain = runtime->get_index_space_domain(
-  //     ctx, task->regions[1].region.get_index_space());
-  int numdims = input.domain.get_dim();
-  assert(indices.domain.get_dim() == numdims);
-
-  int in_cols = input.domain.hi()[0] - input.domain.lo()[0] + 1;
-  // int out1_cols = out1_domain.hi()[0] - out1_domain.lo()[0] + 1;
-  int out2_cols = indices.domain.hi()[0] - indices.domain.lo()[0] + 1;
-
-  // assert(out1_domain == out2_domain);
-  for (int i = 1; i < input.domain.get_dim(); i++) {
-    assert(input.domain.lo()[i] == indices.domain.lo()[i]);
-    assert(input.domain.hi()[i] == indices.domain.hi()[i]);
-  }
-  // float const *in_ptr = helperGetTensorPointerRO<float>(
-  //     regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  //   float *value_ptr = helperGetTensorPointerWO<float>(
-  //       regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  // int *index_ptr = helperGetTensorPointerWO<int>(
-  //    regions[1], task->regions[1], FID_DATA, ctx, runtime);
-
-  int length = input.domain.hi()[0] - input.domain.lo()[0] + 1;
-  int k = indices.domain.hi()[0] - indices.domain.lo()[0] +
-          1; /*TODO: This prints to 5*/
-
-  // batch_size = input.domain.get_volume() / length;
-  // assert(indices.domain.get_volume() / k == batch_size);
-  cudaEvent_t t_start, t_end;
-  if (m->profiling) {
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-  }
-
-  if (input.data_type == DT_HALF) {
-    ArgTopK::forward_kernel(m,
-                            input.get_half_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
-                                                    : nullptr,
-                            indices.get_int32_ptr(),
-                            batch_size,
-                            length,
-                            k,
-                            m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
-                            stream);
-  } else if (input.data_type == DT_FLOAT) {
-    ArgTopK::forward_kernel(m,
-                            input.get_float_ptr(),
-                            m->speculative_decoding ? probs.get_float_ptr()
-                                                    : nullptr,
-                            indices.get_int32_ptr(),
-                            batch_size,
-                            length,
-                            k,
-                            m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
-                            stream);
-  } else {
-    assert(false && "Unsupported data type");
-  }
-
-  if (m->profiling) {
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-    printf("[ArgTopK] forward time = %.2lfms\n", elapsed);
-  }
-}
-
-ArgTopKMeta::ArgTopKMeta(FFHandler handler, Op const *op)
-    : OpMeta(handler, op) {}
-
-}; // namespace FlexFlow
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 4ef5f8446..cc844096d 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -530,8 +530,7 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  /* Reserved: BatchConfig Updated */ BatchConfig const *bc =
-      BatchConfig::from_future(task->futures[0]);
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   // Return if no active tokens
   if (bc->num_tokens == 0) {
     return;
@@ -604,31 +603,21 @@ __host__ void
 
   // create new cuda graph
   cudaGraphExec_t instance;
-  // cudaGraphExecUpdateResult updateResult;
 
   GraphParams graph_params = {bc->num_active_requests(),
                       bc->num_active_tokens(),
                       bc->prompt_phase};
-  //graph_params.Print();
-  // int shard_id = task->index_point.point_data[0];
+  int shard_id = task->index_point.point_data[0];
 
-  bool use_cuda_graph = (bc->prompt_phase == false && bc->get_mode() == TREE_SEARCH_MODE);
+  // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode() == TREE_VERIFY_MODE);
+  // bool use_cuda_graph = bc->get_mode() == TREE_VERIFY_MODE;
+  bool use_cuda_graph = bc->get_mode() == TREE_SEARCH_MODE;
+  // bool use_cuda_graph = false;
   bool captured = false;
 
   if(use_cuda_graph && metas->graph_collections.count(graph_params)  != 0) {
     captured = true;
     instance = metas->graph_collections[graph_params];
-    // if (cudaGraphExecUpdate(instance, graph, NULL, &updateResult) != cudaSuccess) {
-    //   cudaGraphExecDestroy(instance);
-    //   captured = false;
-    // } else {
-    //   // if(shard_id == 0) {
-    //   //   printf("---------start to reuse the graph-------\n");
-    //   //   graph_params.Print();
-    //   //   // bc->print();
-    //   //   printf("---------end to reuse the graph-------\n");
-    //   // }
-    // }
   }
 
   if (!captured) {
@@ -935,10 +924,6 @@ __host__ void
             assert(fused->op_num_outputs[op] == 1);
             TreeIncMultiHeadSelfAttentionMeta *m =
                 (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-            // TreeVerifyBatchConfig const *verify_bc =
-            //     (TreeVerifyBatchConfig *)task->args;
-            // BatchConfig const &verify_bc =
-            //     Future(task->futures[0]).get_result<BatchConfig>();
             BatchConfig const *verify_bc =
                 BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
@@ -961,12 +946,8 @@ __host__ void
           case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
             assert(fused->op_num_inputs[op] == 1);
             assert(fused->op_num_outputs[op] == 1);
-            SpecIncMultiHeadSelfAttentionMeta const *m =
+            SpecIncMultiHeadSelfAttentionMeta *m =
                 (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op];
-            // TreeSearchBatchConfig const *search_bc =
-            //     (TreeSearchBatchConfig *)task->args;
-            // BatchConfig const &search_bc =
-            //     Future(task->futures[0]).get_result<BatchConfig>();
             BatchConfig const *search_bc =
                 BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
@@ -1172,12 +1153,6 @@ __host__ void
     if (use_cuda_graph) { 
       cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
       metas->graph_collections[graph_params] = instance;
-      // if(shard_id == 0) {
-      //   printf("*************start cudaGraphInstantiate**********\n");
-      //   graph_params.Print();
-      //   // bc->print();
-      //   printf("*************end cudaGraphInstantiate**********\n");
-      // }
       cudaGraphDestroy(graph);
     }
   }
@@ -1185,7 +1160,23 @@ __host__ void
   if (use_cuda_graph) {
     assert(metas->graph_collections.find(graph_params) !=
           metas->graph_collections.end());
+    cudaEvent_t t_start, t_end;
+    float elapsed;
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+
     cudaGraphLaunch(instance, stream);
+
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    if(shard_id == 0 && bc->get_mode() == TREE_SEARCH_MODE) {
+      printf("cudaGraphLaunch time: %f ms, captured: %d\n", elapsed, captured);
+    }
   }
 }
 
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 102010869..7f247e18c 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -1432,10 +1432,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                                        vProjSize * num_q_heads);
     size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0,
            qk_prod_size = 0;
-    assert((BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num()) %
-               kPagesize ==
-           0);
+    // assert((BatchConfig::max_sequence_length() +
+    //         BatchConfig::max_spec_tree_token_num()) %
+    //            kPagesize ==
+    //        0);
     size_t max_num_pages =
         (BatchConfig::max_sequence_length() +
          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
@@ -1452,21 +1452,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                        BatchConfig::max_sequence_length() * num_q_heads;
         break;
       }
-      case TREE_SEARCH_MODE: {
-        key_cache_size = num_q_heads * kProjSize *
-                         BatchConfig::max_requests_per_batch() *
-                         (BatchConfig::max_sequence_length() +
-                          BatchConfig::max_spec_tree_token_num());
-        value_cache_size = num_q_heads * vProjSize *
-                           BatchConfig::max_requests_per_batch() *
-                           (BatchConfig::max_sequence_length() +
-                            BatchConfig::max_spec_tree_token_num());
-        qk_prod_size = BatchConfig::max_sequence_length() *
-                       (BatchConfig::max_sequence_length() +
-                        BatchConfig::max_spec_tree_token_num()) *
-                       num_q_heads;
-        break;
-      }
+      case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
         query_tmp_size =
             num_q_heads * qProjSize * BatchConfig::max_tokens_per_batch();
diff --git a/src/ops/multihead_self_attention_impl.cu b/src/ops/multihead_self_attention_impl.cu
new file mode 100644
index 000000000..08de141a6
--- /dev/null
+++ b/src/ops/multihead_self_attention_impl.cu
@@ -0,0 +1,166 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "cuComplex.h"
+#endif
+#include "flashinfer/attention_impl.cuh"
+
+// This is for instantiating the template attention kernels
+namespace flashinfer {
+
+// warp_layout_literal[] = {
+//   "WarpLayout::k4x1x2",
+//   "WarpLayout::k4x1x1"
+// }
+// head_dim[] = {64, 128, 256};
+
+
+/********** batch append instantiations for half precision **********/
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 64,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 128,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 256,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 64,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 128,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 256,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCustom, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+
+/********** batch prefill instantiations for half precision **********/
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 64,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 128,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 256,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 64,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 128,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+
+template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 256,
+          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
+          false, MaskMode::kCausal, half, half, int32_t>(
+  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
+  int32_t* q_indptr, int32_t* q_offset,
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
+  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
+  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
+  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
+  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+} // namespace flashinfer
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 40a631ab4..706ea4f79 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -15,15 +15,41 @@
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
+#include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
 #include "flexflow/ops/spec_inc_multihead_self_attention.h"
 #include "flexflow/utils/cuda_helper.h"
 
-namespace FlexFlow {
+#include <sstream>
+#include <stdexcept>
+
+#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
+  switch (head_dim) {                                                          \
+    case 64: {                                                                 \
+      constexpr size_t HEAD_DIM = 64;                                          \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 128: {                                                                \
+      constexpr size_t HEAD_DIM = 128;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 256: {                                                                \
+      constexpr size_t HEAD_DIM = 256;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    default: {                                                                 \
+      std::ostringstream err_msg;                                              \
+      err_msg << "Unsupported head_dim: " << head_dim;                         \
+      throw std::invalid_argument(err_msg.str());                              \
+    }                                                                          \
+  }
 
-#define WARP_SIZE 32
+namespace FlexFlow {
 
 // declare Legion names
 using Legion::coord_t;
@@ -33,663 +59,287 @@ using namespace Kernels::IncMultiHeadAttention;
 namespace Kernels {
 namespace SpecIncMultiHeadSelfAttention {
 
-template <typename DT,
-          int THREADS_PER_BLOCK,
-          int Dh,
-          int Dh_MAX,
-          int THREADS_PER_KEY,
-          int THREADS_PER_VALUE>
-__global__ void compute_spec_inc_attention_kernel_generation_kernel(
-    DT const *query,
-    DT const *key_cache,
-    DT const *value_cache,
-    DT *output_ptr,
-    float const scale,
-    int const max_seq_length,
-    int per_head_size,
-    int hidden_size,
-    /* Reserved: BatchConfig Updated */
-    BatchConfig::PerRequestInfo *request_infos,
-    BatchConfig::BitMask *causalMask,
-    bool *request_available) {
-
-  // q, k
-  using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using K_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using V_vec = typename VEC_V<DT>::Type;
-  using Out_sum = typename Vec_fp32_<V_vec>::Type;
-
-  constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
-
-  constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT);
-  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;
-  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE;
-  // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT);
-
-  // thread id
-  int const tidx = threadIdx.x;
-  // head id
-  int const head_idx = blockIdx.x;
-  // nth request idx
-  int const request_idx = blockIdx.y;
-
-  // request id in batch config
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-    }
-  }
-
-  int non_tree_cache_size =
-      causalMask[requext_idx_in_batch].non_tree_cache_size;
-  int tree_or_prompt_size =
-      causalMask[requext_idx_in_batch].tree_or_prompt_size;
-  int current_layer_size = causalMask[requext_idx_in_batch].current_layer_size;
-  int start_offset = tree_or_prompt_size - current_layer_size;
-
-  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
-                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
-  for (int i = start_offset + tidx; i < tree_or_prompt_size;
-       i += THREADS_PER_BLOCK) {
-    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
-      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
-    }
-  }
-
-  int const first_step = 0;
-
-  int const totalCacheSize = non_tree_cache_size + tree_or_prompt_size;
-
-  int const first_token_idx =
-      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
-
-  int const tree_branch_num =
-      request_infos[requext_idx_in_batch].num_tokens_in_batch;
-
-  // shared memory objects
-  extern __shared__ char smem_[];
-
-  float *qk_smem = reinterpret_cast<float *>(smem_);
-  float *out_smem = reinterpret_cast<float *>(smem_);
-
-  float qk_max = -FLT_MAX;
-
-  // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
-  __shared__ float red_smem[WARPS_PER_BLOCK * 2];
-
-  const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM +
-                    head_idx * per_head_size;
-  __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
-
-  // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE
-  int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE;
-  int ki_o = tidx % THREADS_PER_KEY;
-  // the first key's offset for this thread
-  // ko = 0, 0, 0, 0, 1, 1, 1, 1, ....
-  int ko = tidx / THREADS_PER_KEY;
-  // load q tensor
-  Q_vec q_vec[K_VECS_PER_THREAD];
-
-  constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY;
-  // The number of keys per warp.
-  constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
-
-  DT const *k_cache_batch =
-      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
-
-  int ti_end =
-      div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step;
-
-  for (int qi = 0; qi < tree_branch_num; qi += 1) {
-#pragma unroll
-    for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-      q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
-          q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki +
-          ii * THREADS_PER_KEY * K_VEC_SIZE);
-    }
-
-    int const query_token = tree_or_prompt_size - tree_branch_num + qi;
-
-    __syncthreads();
-    for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
-      K_vec k[K_VECS_PER_THREAD];
-      int const ti_circ = ti % max_seq_length;
-
-      for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-        int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
-        if (ti < totalCacheSize) {
-
-          k[ii] = *reinterpret_cast<K_vec const *>(
-              k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size +
-              jj);
-        }
-      }
-      float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
-
-      if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) {
-        // todo add alobi here
-        // bool const mask = ti_circ >= totalCacheSize;
-        bool const mask =
-            (ti >= non_tree_cache_size &&
-             (!test_bit(bit_mask, query_token, ti - non_tree_cache_size)));
-
-        // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) {
-        //   printf("spec inc attn qkqkqk  request id %d,  %.10f, %d\n",
-        //          requext_idx_in_batch,
-        //          ti,
-        //          qk,
-        //          qi);
-        // }
-        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-        qk_smem[ti - first_step] = mask ? 0.f : qk;
-      }
-    }
-
-    __syncthreads();
-
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Decompose the thread index into warp and lane.
-    int const warp = tidx / WARP_SIZE;
-    int const lane = tidx % WARP_SIZE;
-
-    // The warp leader writes the max to shared memory.
-    if (lane == 0) {
-      red_smem[warp] = qk_max;
-    }
-
-    // Make sure the products are in shared memory.
-    __syncthreads();
-
-    // The warps finalize the reduction.
-    qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-    for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Broadcast to all the threads in the warp.
-    qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
-
-    // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) {
-    //   printf("spec inc attn first token qk_max %.10f\n", qk_max);
-    // }
-
-    float exp_sum = 0.f;
-    for (int ti = first_step + tidx; ti < totalCacheSize;
-         ti += THREADS_PER_BLOCK) {
-      bool const mask =
-          (ti >= non_tree_cache_size &&
-           (!test_bit(bit_mask, query_token, ti - non_tree_cache_size)));
-      float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
-      exp_sum += logit;
-      qk_smem[ti - first_step] = mask ? 0.0f : logit;
-    }
-
-    // Compute the sum.
-    exp_sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], exp_sum);
-
-    // softmax
-    float inv_sum = __fdividef(1.f, exp_sum + 1.e-6);
-    for (int ti = first_step + tidx; ti < totalCacheSize;
-         ti += THREADS_PER_BLOCK) {
-      qk_smem[ti - first_step] *= inv_sum;
-    }
-
-    __syncthreads();
-
-    // value projection
-    constexpr int V_VEC_SIZE = 16 / sizeof(DT);
-    // A vector of V elements for the current timestep.
-    // using V_vec_k = typename V_vec_k_<DT, V_VEC_SIZE>::Type;
-    // using V_vec_acum = typename V_vec_acum_fp32_<V_vec_k>::Type;
-
-    // The value computed by this thread.
-    int vo = tidx / THREADS_PER_VALUE;
-    // The hidden dimensions computed by this particular thread.
-    int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE;
-    constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
-
-    Out_sum out;
-    zero(out);
-
-    // The base pointer for the value in the cache buffer.
-    DT const *v_cache_batch =
-        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
-
-    if (Dh == Dh_MAX || vi < Dh) {
-      for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) {
-        // Load the values from the cache.
-        int const ti_circ = ti % max_seq_length;
-        V_vec v = *reinterpret_cast<V_vec const *>(
-            v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-        float logit = qk_smem[ti - first_step];
-        out = FlexFlow::fma(logit, cast_to_float(v), out);
-      }
-    }
-
-    //   // Make sure we can start writing to shared memory.
-    __syncthreads();
-
-    // Run the final reduction amongst the different groups computing different
-    // partial outputs.
-    if (Dh == Dh_MAX || vi < Dh) {
-#pragma unroll
-      for (int active_groups = V_PER_ITER; active_groups >= 2;
-           active_groups /= 2) {
-
-        // The midpoint in the number of active groups.
-        int midpoint = active_groups / 2;
-
-        // The upper part of active threads store to shared memory.
-        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
-          *reinterpret_cast<Out_sum *>(out_smem + (vo - midpoint) * Dh + vi) =
-              out;
-        }
-        __syncthreads();
-
-        // The bottom warps update their values.
-        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
-          out = add(*reinterpret_cast<Out_sum const *>(out_smem + vo * Dh + vi),
-                    out);
-        }
-        __syncthreads();
-      }
-    }
+using flashinfer::BatchPrefillHandler;
+using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
+using flashinfer::LogitsPostHook;
+using flashinfer::MaskMode;
+using flashinfer::paged_kv_t;
+using flashinfer::PageStorage;
+using flashinfer::PosEncodingMode;
+using flashinfer::QKVLayout;
+
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) *
+         hidden_size;
+}
 
-    // Output the final values.
-    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
-      convert_from_float(*reinterpret_cast<V_vec *>(
-                             output_ptr + (first_token_idx + qi) * hidden_size +
-                             head_idx * per_head_size + vi),
-                         out);
-    }
-  }
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) *
+         hidden_size;
 }
 
 template <typename DT>
 __global__ void
-    spec_inc_store_kv_cache(DT const *devQKVProjArray,
-                            DT *kCache_ptr,
-                            DT *vCache_ptr,
-                            BatchConfig::PerTokenInfo *tokenInfos,
-                            BatchConfig::PerRequestInfo *requestInfo,
-                            BatchConfig::BitMask *causalMask,
-                            int qProjSize,
-                            int kProjSize,
-                            int vProjSize,
-                            int num_tokens,
-                            int max_seq_len,
-                            int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
-    int token_idx = i / (hidden_size);
-    int offset = i % hidden_size;
-
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_id = tokenInfos[token_idx].request_index;
-    int const cache_idx = tokenInfos[token_idx].abs_index_in_request;
-
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size +
-               offset] = vVal;
+    update_qkv_cache_kernel(DT *devQKVProjArray,
+                            half *qTmp_ptr,
+                            half *kCache_ptr,
+                            BatchConfig::PerTokenInfo const *tokenInfos,
+                            BatchConfig::PerRequestInfo *request_infos,
+                            int const max_num_pages,
+                            int hidden_size,
+                            int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
   }
-}
 
-template <typename DT>
-void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
-                            BatchConfig const *bc,
-                            cudaStream_t stream) {
-  int num_tokens = bc->num_active_tokens();
-  if (num_tokens > 0) {
-    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens;
-    spec_inc_store_kv_cache<<<GET_BLOCKS(parallelism),
-                              min(CUDA_NUM_THREADS, parallelism),
-                              0,
-                              stream>>>(
-        static_cast<DT *>(m->devQKVProjArray),
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>(m->valueCache),
-        m->token_infos,
-        m->request_infos,
-        m->causalMask,
-        m->qProjSize,
-        m->kProjSize,
-        m->vProjSize,
-        num_tokens,
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
-        m->hidden_size);
-  }
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_k_idx = get_k_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size);
+
+  // key and value cache should be stored interleaved
+  kCache_ptr[to_k_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+  kCache_ptr[to_v_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+  qTmp_ptr[token_idx * hidden_size + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
 }
 
-#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL(                                \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)   \
-  smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                               \
-                                   BatchConfig::max_sequence_length() +        \
-                                       BatchConfig::max_spec_tree_token_num(), \
-                                   THREADS_PER_VALUE,                          \
-                                   THDS_PER_BLOCK);                            \
-  compute_spec_inc_attention_kernel_generation_kernel<DT,                      \
-                                                      THDS_PER_BLOCK,          \
-                                                      Dh,                      \
-                                                      Dh_MAX,                  \
-                                                      THDS_PER_KEY,            \
-                                                      THREADS_PER_VALUE>       \
-      <<<grid,                                                                 \
-         THDS_PER_BLOCK,                                                       \
-         smem_sz + BatchConfig::MAX_SPEC_TREE_TOKEN_NUM *                      \
-                       BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 8,               \
-         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
-                   static_cast<DT *>(m->keyCache),                             \
-                   static_cast<DT *>(m->valueCache),                           \
-                   output_ptr,                                                 \
-                   scale,                                                      \
-                   BatchConfig::max_sequence_length() +                        \
-                       BatchConfig::max_spec_tree_token_num(),                 \
-                   m->qProjSize,                                               \
-                   m->hidden_size,                                             \
-                   m->request_infos,                                           \
-                   m->causalMask,                                              \
-                   m->request_available)
-
 template <typename DT>
-void compute_spec_inc_attention_kernel_generation(
-    SpecIncMultiHeadSelfAttentionMeta const *m,
-    BatchConfig const *bc,
-    DT *output_ptr,
-    cudaStream_t stream) {
-  // one block == one head per request
-  // how many generation requests
-  dim3 grid(m->num_q_heads, bc->num_available_requests);
-  int const per_head_size = m->qProjSize;
-  float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  size_t smem_sz;
-  if (per_head_size == 64) {
-    constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
-    LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream);
-  } else if (per_head_size == 128) {
-    constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
-    LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream);
-  } else {
-    assert(false && "a unsupported head size");
-  }
+void update_qkv_cache(SpecIncMultiHeadSelfAttentionMeta const *m,
+                      BatchConfig const *bc,
+                      cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  int const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
+  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                      static_cast<half *>(m->queryTmp),
+                                      static_cast<half *>(m->keyCache),
+                                      m->token_infos,
+                                      m->request_infos,
+                                      max_num_pages,
+                                      m->hidden_size,
+                                      num_new_tokens);
 }
 
 template <typename DT>
-__global__ void spec_fill_entries_above_diagonal(DT *matrix,
-                                                 size_t new_tokens,
-                                                 size_t total_tokens_in_request,
-                                                 size_t num_q_heads,
-                                                 DT value) {
-  CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) {
-    // size_t head_idx = i / (new_tokens * total_tokens_in_request);
-    size_t src_idx = (i / new_tokens) % total_tokens_in_request;
-    size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens;
-    // Casual Mask
-    if (src_idx > dst_idx) {
-      matrix[i] = value;
-    }
+__global__ void produce_output_kernel(half const *input_ptr,
+                                      DT *output_ptr,
+                                      int parallelism) {
+  CUDA_KERNEL_LOOP(idx, parallelism) {
+    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
   }
 }
 
 template <typename DT>
-void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m,
-                                     BatchConfig const *bc,
-                                     int shard_id,
-                                     DT *output_ptr,
-                                     DT const *bias_ptr,
-                                     DT const *weight_ptr,
-                                     cudaStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-  // int num_requests = bc->num_active_requests();
-  int num_tokens = bc->num_active_tokens();
-  int tokens_previous_requests = 0;
-  int tokens_prev_requests_squares = 0;
-  // int qkv_block_size =
-  //     (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens;
-  int q_block_size = m->qProjSize;
-
-  int kt_block_size = m->kProjSize;
-  int kt_req_block_size = kt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  int vt_block_size = m->vProjSize;
-  int vt_req_block_size = vt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  assert(m->qProjSize == m->kProjSize);
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i] ||
-        (bc->requestsInfo[i].num_tokens_in_batch == 0)) {
-      continue;
-    }
+void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
+                           BatchConfig const *bc,
+                           DT *output_ptr,
+                           cudaStream_t stream) {
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  // global constant parameters
+  uint32_t const num_q_heads = m->num_q_heads;
+  uint32_t const num_kv_heads = m->num_kv_heads;
+  uint32_t const head_dim = m->qProjSize;
+  uint32_t const batch_size = bc->num_active_requests();
+  float const sm_scale =
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (device == 0) {
+  //     std::cout << "Update custom mask time: " << elapsed << " ms\n";
+  //   }
 
-    // all requests in prompt phase should only have one sub requests;
-    // assert(bc->sub_requests[i] == 1);
-    // int num_new_tokens = bc->num_processing_tokens[i];
-    // int total_tokens = bc->token_last_available_idx[i] + 1;
+  half *q = static_cast<half *>(m->queryTmp),
+       *kv = static_cast<half *>(m->keyCache),
+       *o = static_cast<half *>(m->outputTmp);
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+      num_kv_heads,
+      kPagesize,
+      head_dim,
+      batch_size,
+      kv,
+      m->handle.tree_search_attention_metadata->kv_indices,
+      m->handle.tree_search_attention_metadata->kv_indptr,
+      m->handle.tree_search_attention_metadata->kv_last_page_len);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    attn prep time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
 
-    int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
-                       bc->requestsInfo[i].num_tokens_in_batch;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
 
-    if (num_new_tokens <= 0) {
-      continue;
-    }
+  BatchPrefillHandler *handler = nullptr;
 
-    // Compute (QK^T/sqrt(d_k))
-    int m_ = num_new_tokens;
-    int n = total_tokens;
-    int k = m->qProjSize;
-    int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads,
-        ldc = m_;
-    int strideA = q_block_size;
-    int strideB = kt_block_size;
-    int strideC = num_new_tokens * total_tokens;
-
-    // a flag of using this scaling alpha
-    DT alpha = 1.0f, beta = 0.0f;
-    if (*m->qk_prod_scaling) {
-      alpha = static_cast<DT>(1.0f / sqrt(m->kProjSize));
-    }
-    // To get A, skip over Q entries from previous requests (same head)
-    DT const *A = static_cast<DT *>(m->devQKVProjArray) +
-                  bc->requestsInfo[i].first_token_offset_in_batch *
-                      m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
-    // To get B, skip over K entries from previous requests (all heads +
-    // padding)
-
-    // print_tensor<float>((float*)A, 32, "A");
-    DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-
-    // if (i == 0 && sub_req_id == 0 &&
-    //     bc->beam_slots.at(0).current_depth == 1) {
-    //   int offset = (float *)B - m->keyCache;
-    //   printf("key cache offset %d\n", kt_req_block_size);
-    // }
-    // To get C, skip over QK^T products from previous requests
-    DT *C = static_cast<DT *>(m->qk_prods) +
-            m->num_q_heads * tokens_prev_requests_squares;
-    checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                         CUBLAS_OP_T,
-                                         CUBLAS_OP_N,
-                                         m_,
-                                         n,
-                                         k,
-                                         &alpha,
-                                         A,
-                                         cublas_data_type,
-                                         lda,
-                                         strideA,
-                                         B,
-                                         cublas_data_type,
-                                         ldb,
-                                         strideB,
-                                         &beta,
-                                         C,
-                                         cublas_data_type,
-                                         ldc,
-                                         strideC,
-                                         m->num_q_heads,
-                                         compute_type,
-                                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    // print_tensor<float>((float*)C, 32, "C");
-    // add alibi position bias to qk production
-    // add alibi position bias to qk production
-    if (*m->position_bias) {
-      size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens;
-      apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
-                                  min((size_t)CUDA_NUM_THREADS, parallelism),
-                                  0,
-                                  stream>>>(C,
-                                            num_new_tokens,
-                                            total_tokens,
-                                            m->num_q_heads,
-                                            m->global_num_q_heads,
-                                            shard_id);
-    }
-    // Fill all elements above diagonal in qk prods with -inf to force
-    // causal attention.
-    assert(num_new_tokens <= total_tokens);
-    if (num_new_tokens > 1) {
-      size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens;
-      spec_fill_entries_above_diagonal<<<GET_BLOCKS(parallelism),
-                                         min((size_t)CUDA_NUM_THREADS,
-                                             parallelism),
-                                         0,
-                                         stream>>>(C,
-                                                   num_new_tokens,
-                                                   total_tokens,
-                                                   m->num_q_heads,
-                                                   static_cast<DT>(-INFINITY));
-    }
-    // Compute Softmax(QK^T/sqrt(d_k))
-    // Before modifying the parameters below, make sure to read the following
-    // description of the CUDNN_TENSOR_NCHW tensor layout, from
-    // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
-    // This tensor format specifies that the data is laid out in the following
-    // order: batch size, feature maps, rows, columns. The strides are
-    // implicitly defined in such a way that the data are contiguous in memory
-    // with no padding between images, feature maps, rows, and columns; the
-    // columns are the inner dimension and the images are the outermost
-    // dimension.
-    int n_param = m->num_q_heads;
-    int c_param = total_tokens;
-    int h_param = 1;
-    int w_param = num_new_tokens;
-    checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
-                                          CUDNN_TENSOR_NCHW,
-                                          cudnn_data_type,
-                                          n_param,
-                                          c_param,
-                                          h_param,
-                                          w_param));
-    float softmax_alpha = 1.0f, softmax_beta = 0.0f;
-    DT *C_softmax = static_cast<DT *>(m->qk_prods_softmax) +
-                    m->num_q_heads * tokens_prev_requests_squares;
-    // The softmax operation below is executed according to the
-    // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
-    // softmax operation is computed per spatial location (H,W) per image (N)
-    // across dimension C.
-    checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
-                                   CUDNN_SOFTMAX_ACCURATE,
-                                   CUDNN_SOFTMAX_MODE_CHANNEL,
-                                   &softmax_alpha,
-                                   m->qk_tensor,
-                                   C,
-                                   &softmax_beta,
-                                   m->qk_tensor,
-                                   C_softmax));
-    // Matmul softmax(QK^T/sqrt(d_k)) by V
-    alpha = 1.0f, beta = 0.0f;
-    m_ = m->vProjSize;
-    n = num_new_tokens;
-    k = total_tokens;
-    lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
-    strideA = vt_block_size;
-    strideB = num_new_tokens * total_tokens;
-    strideC = m->vProjSize;
-    // To get A, skip over V^T entries from previous requests (all heads +
-    // padding)
-    A = static_cast<DT *>(m->valueCache) + i * vt_req_block_size;
-    // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-    // requests (all heads)
-    B = C_softmax;
-    // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-    // requests
-
-    int token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
-
-    C = static_cast<DT *>(m->attn_heads) +
-        (token_offset)*m->num_q_heads * m->vProjSize;
-    checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                         CUBLAS_OP_N,
-                                         CUBLAS_OP_T,
-                                         m_,
-                                         n,
-                                         k,
-                                         &alpha,
-                                         A,
-                                         cublas_data_type,
-                                         lda,
-                                         strideA,
-                                         B,
-                                         cublas_data_type,
-                                         ldb,
-                                         strideB,
-                                         &beta,
-                                         C,
-                                         cublas_data_type,
-                                         ldc,
-                                         strideC,
-                                         m->num_q_heads,
-                                         compute_type,
-                                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    tokens_previous_requests += num_new_tokens;
-    tokens_prev_requests_squares += num_new_tokens * total_tokens;
+  if (!bc->prompt_phase) {
+    assert(m->handle.tree_search_attention_metadata->decode_handler_collections.count(batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_search_attention_metadata->decode_handler_collections[batch_size]);
+  } else {
+    assert(m->handle.tree_search_attention_metadata->prompt_handler_collections.count(batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_search_attention_metadata->prompt_handler_collections[batch_size]);
   }
 
-  if (tokens_previous_requests != num_tokens) {
-    bc->print();
-    printf("tokens_previous_requests: %i\n", tokens_previous_requests);
-    printf("num_tokens: %i\n", num_tokens);
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    BeginForward time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  DISPATCH_HEADDIM(
+    head_dim, HEAD_DIM, {
+      cudaError_t result;
+      if (bc->prompt_phase) {
+        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+            PageStorage::kIndices,
+            HEAD_DIM,
+            LogitsPostHook::kNone,
+            QKVLayout::kNHD,
+            PosEncodingMode::kNone,
+            false,
+            MaskMode::kCausal,
+            half,
+            half,
+            int32_t>(handler,
+                      q,
+                      m->handle.tree_search_attention_metadata->q_indptr,
+                      /*q_offset=*/nullptr,
+                      paged_kv,
+                      /*custom_mask=*/nullptr,
+                      /*qk_indptr=*/nullptr,
+                      o,
+                      /*lse=*/nullptr,
+                      num_q_heads,
+                      /*logits_soft_cap=*/0.f,
+                      sm_scale,
+                      /*rope_scale=*/1.f,
+                      /*rope_theta=*/static_cast<float>(1e4),
+                      stream);
+      } else {
+        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+            PageStorage::kIndices,
+            HEAD_DIM,
+            LogitsPostHook::kNone,
+            QKVLayout::kNHD,
+            PosEncodingMode::kNone,
+            false,
+            MaskMode::kCustom,
+            half,
+            half,
+            int32_t>(handler,
+                      q,
+                      m->handle.tree_search_attention_metadata->q_indptr,
+                      /*q_offset=*/nullptr,
+                      paged_kv,
+                      m->handle.tree_search_attention_metadata->custom_mask,
+                      m->handle.tree_search_attention_metadata->qk_indptr,
+                      o,
+                      /*lse=*/nullptr,
+                      num_q_heads,
+                      /*logits_soft_cap=*/0.f,
+                      sm_scale,
+                      /*rope_scale=*/1.f,
+                      /*rope_theta=*/static_cast<float>(1e4),
+                      stream);
+      }
+    if (result != cudaSuccess) {
+      throw std::runtime_error(
+          "Failed to run "
+          "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+          std::string(cudaGetErrorString(result)));
+    }
+  });
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    actual attn time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  {
+    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+    produce_output_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(m->outputTmp, output_ptr, parallelism);
   }
-  assert(tokens_previous_requests == num_tokens);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    produce_output_kernel time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
 }
 
 template <typename DT>
-void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
+void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
                       BatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
@@ -709,17 +359,11 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                      bias_ptr,
                      stream);
   // phase 2: Update key/val cache
-  update_kv_cache_kernel<DT>(m, bc, stream);
+  update_qkv_cache<DT>(m, bc, stream);
 
   // phase 3: Compute attention score
   // 3 kernels for pahse 3: matmul1 - softmax - matmal2
-  if (bc->prompt_phase) {
-    compute_attention_kernel_prompt(
-        m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream);
-  } else {
-    compute_spec_inc_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
-  }
+  tree_search_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
@@ -756,7 +400,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
 
 /*static*/
 void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
-    SpecIncMultiHeadSelfAttentionMeta const *m,
+    SpecIncMultiHeadSelfAttentionMeta *m,
     BatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
@@ -783,7 +427,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
   if (input.data_type == DT_HALF) {
     half const *bias_ptr =
         use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
-    Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
+    Kernels::SpecIncMultiHeadSelfAttention::inference_kernel<half>(
         m,
         bc,
         shard_id,
@@ -795,7 +439,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
   } else if (input.data_type == DT_FLOAT) {
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-    Kernels::SpecIncMultiHeadSelfAttention::inference_kernel(
+    Kernels::SpecIncMultiHeadSelfAttention::inference_kernel<float>(
         m,
         bc,
         shard_id,
@@ -860,21 +504,22 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
   checkCUDA(get_legion_stream(&stream));
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
 
-  // allocate memory for the seqArray and reserve space
-  {
-    causalMask = reinterpret_cast<BatchConfig::BitMask *>(
-        reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available));
-  }
+  // set attention constants
+  handler.tree_search_attention_metadata->set_enabled(true);
+  handler.tree_search_attention_metadata->set_num_q_heads(num_q_heads);
+  handler.tree_search_attention_metadata->set_num_kv_heads(num_kv_heads);
+  handler.tree_search_attention_metadata->set_head_dim(qProjSize);
 
   cudaStreamSynchronize(stream);
 }
 
 SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) {
-  if (tree_search_reserve_inst != Realm::RegionInstance::NO_INST) {
-    tree_search_reserve_inst.destroy();
-  }
+  // for (auto &decode_handler: decode_handler_collections) {
+  //   delete static_cast<flashinfer::BatchPrefillHandler *>(decode_handler.second);
+  // }
+  // for (auto &prompt_handler: prompt_handler_collections) {
+  //   delete static_cast<flashinfer::BatchPrefillHandler *>(prompt_handler.second);
+  // }
 }
 
 }; // namespace FlexFlow
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 84f33372a..23e71c699 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -24,23 +24,6 @@
 
 #include <sstream>
 #include <stdexcept>
-#include <vector>
-
-#define DISPATCH_GROUPSIZE(group_size, GROUP_SIZE, ...)                        \
-  if (group_size == 1) {                                                       \
-    constexpr size_t GROUP_SIZE = 1;                                           \
-    __VA_ARGS__                                                                \
-  } else if (group_size == 4) {                                                \
-    constexpr size_t GROUP_SIZE = 4;                                           \
-    __VA_ARGS__                                                                \
-  } else if (group_size == 8) {                                                \
-    constexpr size_t GROUP_SIZE = 8;                                           \
-    __VA_ARGS__                                                                \
-  } else {                                                                     \
-    std::ostringstream err_msg;                                                \
-    err_msg << "Unsupported group_size: " << group_size;                       \
-    throw std::invalid_argument(err_msg.str());                                \
-  }
 
 #define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
   switch (head_dim) {                                                          \
@@ -66,16 +49,6 @@
     }                                                                          \
   }
 
-#define DISPATCH_PAGESIZE(page_size, PAGE_SIZE, ...)                           \
-  if (page_size == kPagesize) {                                                \
-    constexpr size_t PAGE_SIZE = kPagesize;                                    \
-    __VA_ARGS__                                                                \
-  } else {                                                                     \
-    std::ostringstream err_msg;                                                \
-    err_msg << "Unsupported page_size: " << page_size;                         \
-    throw std::invalid_argument(err_msg.str());                                \
-  }
-
 namespace FlexFlow {
 
 // declare Legion names
@@ -91,6 +64,7 @@ namespace TreeIncMultiHeadAttention {
 
 using flashinfer::BatchPrefillHandler;
 using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
+using flashinfer::LogitsPostHook;
 using flashinfer::MaskMode;
 using flashinfer::paged_kv_t;
 using flashinfer::PageStorage;
@@ -121,7 +95,7 @@ __global__ void commit_tokens_kernel(
     bool const *request_available,
     int num_requests,
     int hidden_size,
-    int num_committed_tokens,
+    int const *num_committed_tokens,
     int const max_num_pages) {
   int const idx = blockIdx.x * blockDim.x + threadIdx.x;
   int const request_compact_idx = idx / hidden_size;
@@ -136,7 +110,7 @@ __global__ void commit_tokens_kernel(
     }
   }
 
-  for (int i = 0; i < num_committed_tokens; i++) {
+  for (int i = 0; i < *num_committed_tokens; i++) {
     if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
       int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache;
       if (index_in_kv_cache == -1) {
@@ -170,7 +144,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  int num_tokens_to_commit = bc->num_tokens_to_commit;
   int const max_num_pages =
       (BatchConfig::max_sequence_length() +
        BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
@@ -185,7 +158,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                                    m->request_available,
                                    num_requests,
                                    m->hidden_size,
-                                   num_tokens_to_commit,
+                                   m->num_tokens_to_commit,
                                    max_num_pages);
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -196,82 +169,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventDestroy(t_end);
 }
 
-__global__ void
-    update_custom_mask_kernel(float *custom_mask,
-                              BatchConfig::BitMask *causalMask,
-                              BatchConfig::PerRequestInfo *request_infos,
-                              bool *request_available,
-                              int const num_requests,
-                              int const max_q_length,
-                              int const max_kv_length,
-                              float mask_value) {
-  // get thread idx in [0, num_requests * max_q_length)
-  int const idx = blockIdx.x * blockDim.x + threadIdx.x;
-  // get (request_idx, q_idx) from thread idx
-  int const request_idx = idx / max_q_length / max_kv_length;
-  int const q_idx = (idx % (max_q_length * max_kv_length)) / max_kv_length;
-  int const kv_idx = idx % max_kv_length;
-
-  // request id in batch config
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0, mask_offset = 0, mask_lens = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-      mask_offset = mask_lens;
-      int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch,
-          k_len =
-              q_len +
-              request_infos[requext_idx_in_batch].first_token_index_in_request;
-      mask_lens += q_len * k_len;
-    }
-  }
-
-  int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch;
-  int const q_start =
-      request_infos[requext_idx_in_batch].first_token_index_in_request;
-  if (q_idx >= q_length) {
-    return;
-  }
-  if (kv_idx >= q_start + q_length) {
-    return;
-  }
-  assert(q_start + q_length <= max_kv_length);
-
-  float *mask = custom_mask + mask_offset + q_idx * (q_start + q_length);
-  if (kv_idx < q_start) {
-    mask[kv_idx] = 0.0f;
-  } else {
-    mask[kv_idx] = test_bit_orig(causalMask[requext_idx_in_batch].bit_mask,
-                                 q_idx,
-                                 kv_idx - q_start)
-                       ? 0.0f
-                       : mask_value;
-  }
-}
-
-void update_custom_mask(TreeIncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        cudaStream_t stream) {
-  int const num_requests = bc->num_active_requests();
-  int const max_q_length = BatchConfig::max_spec_tree_token_num();
-  int const max_kv_length = BatchConfig::max_spec_tree_token_num() +
-                            BatchConfig::max_sequence_length();
-  int parallelism = num_requests * max_q_length * max_kv_length;
-  update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
-                              min(CUDA_NUM_THREADS, parallelism),
-                              0,
-                              stream>>>(m->custom_mask,
-                                        m->causalMask,
-                                        m->request_infos,
-                                        m->request_available,
-                                        num_requests,
-                                        max_q_length,
-                                        max_kv_length,
-                                        -5e4);
-}
-
 template <typename DT>
 __global__ void
     update_qkv_cache_kernel(DT *devQKVProjArray,
@@ -331,54 +228,6 @@ void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
                                       num_new_tokens);
 }
 
-__global__ void
-    prepare_inference_params_kernel(int const num_requests,
-                                    BatchConfig::PerRequestInfo *request_infos,
-                                    bool *request_available,
-                                    uint32_t const max_num_pages,
-                                    int32_t *q_indptr,
-                                    int32_t *kv_indptr,
-                                    int32_t *kv_indices,
-                                    int32_t *kv_last_page_len,
-                                    int32_t *qk_indptr) {
-  int const request_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (request_idx >= num_requests) {
-    return;
-  }
-
-  // request id in batch config
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0, q_lens = 0, qk_len = 0;
-  int indices_offset = 0, indices_lens = 0, kv_len = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-      int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch;
-      q_lens += q_len;
-      kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch +
-               request_infos[requext_idx_in_batch].first_token_index_in_request;
-      qk_len += q_len * kv_len;
-      indices_offset = indices_lens;
-      indices_lens += (kv_len + kPagesize - 1) / kPagesize;
-    }
-  }
-
-  if (request_idx == 0) {
-    q_indptr[0] = 0;
-    kv_indptr[0] = 0;
-    qk_indptr[0] = 0;
-  }
-  __syncthreads();
-  q_indptr[request_idx + 1] = q_lens;
-  kv_indptr[request_idx + 1] = indices_lens;
-  for (int i = indices_offset; i < indices_lens; i++) {
-    kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset);
-  }
-  kv_last_page_len[request_idx] = (kv_len - 1) % kPagesize + 1;
-  qk_indptr[request_idx + 1] = qk_len;
-}
-
 template <typename DT>
 __global__ void produce_output_kernel(half const *input_ptr,
                                       DT *output_ptr,
@@ -389,7 +238,7 @@ __global__ void produce_output_kernel(half const *input_ptr,
 }
 
 template <typename DT>
-void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
+void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
                            BatchConfig const *bc,
                            DT *output_ptr,
                            cudaStream_t stream) {
@@ -403,38 +252,24 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   // global constant parameters
   uint32_t const num_q_heads = m->num_q_heads;
   uint32_t const num_kv_heads = m->num_kv_heads;
-  uint32_t const group_size = num_q_heads / num_kv_heads;
   uint32_t const head_dim = m->qProjSize;
-  uint32_t const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
       (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  std::vector<int32_t> q_indptr_h{0};
 
-  {
-    int parallelism = batch_size;
-    prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(batch_size,
-                                                m->request_infos,
-                                                m->request_available,
-                                                max_num_pages,
-                                                m->q_indptr,
-                                                m->kv_indptr,
-                                                m->kv_indices,
-                                                m->kv_last_page_len,
-                                                m->qk_indptr);
-    for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) {
-      if (bc->request_available[req_idx]) {
-        int q_len = bc->requestsInfo[req_idx].num_tokens_in_batch;
-        q_indptr_h.push_back(q_indptr_h.back() + q_len);
-      }
-    }
-  }
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (device == 0) {
+  //     std::cout << "Update custom mask time: " << elapsed << " ms\n";
+  //   }
 
   half *q = static_cast<half *>(m->queryTmp),
        *kv = static_cast<half *>(m->keyCache),
@@ -445,9 +280,9 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
       head_dim,
       batch_size,
       kv,
-      m->kv_indices,
-      m->kv_indptr,
-      m->kv_last_page_len);
+      m->handle.tree_verify_attention_metadata->kv_indices,
+      m->handle.tree_verify_attention_metadata->kv_indptr,
+      m->handle.tree_verify_attention_metadata->kv_last_page_len);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -463,16 +298,17 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  BatchPrefillHandler *handler =
-      static_cast<BatchPrefillHandler *>(m->batch_prefill_handler);
-  handler->SetCUDAStream(stream);
-  handler->BeginForward(m->workspace,
-                        m->workspace_size,
-                        q_indptr_h.data(),
-                        batch_size,
-                        num_q_heads,
-                        num_kv_heads,
-                        head_dim);
+  BatchPrefillHandler *handler = nullptr;
+
+  if (!bc->prompt_phase) {
+    assert(m->handle.tree_verify_attention_metadata->decode_handler_collections.count(batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_verify_attention_metadata->decode_handler_collections[batch_size]);
+  } else {
+    assert(m->handle.tree_verify_attention_metadata->prompt_handler_collections.count(batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size]);
+  }
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -488,70 +324,69 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  DISPATCH_GROUPSIZE(
-      group_size,
-      GROUP_SIZE,
-      {DISPATCH_HEADDIM(
-          head_dim, HEAD_DIM, {DISPATCH_PAGESIZE(kPagesize, PAGE_SIZE, {
-            cudaError_t result;
-            if (bc->prompt_phase) {
-              result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-                  PageStorage::kIndices,
-                  QKVLayout::kNHD,
-                  PAGE_SIZE,
-                  GROUP_SIZE,
-                  HEAD_DIM,
-                  PosEncodingMode::kNone,
-                  false,
-                  MaskMode::kCausal,
-                  half,
-                  half,
-                  int32_t>(handler,
-                           q,
-                           m->q_indptr,
-                           /*q_offset=*/nullptr,
-                           paged_kv,
-                           /*custom_mask=*/nullptr,
-                           /*qk_indptr=*/nullptr,
-                           o,
-                           /*lse=*/nullptr,
-                           sm_scale,
-                           /*rope_scale=*/1.f,
-                           /*rope_theta=*/static_cast<float>(1e4),
-                           stream);
-            } else {
-              result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-                  PageStorage::kIndices,
-                  QKVLayout::kNHD,
-                  PAGE_SIZE,
-                  GROUP_SIZE,
-                  HEAD_DIM,
-                  PosEncodingMode::kNone,
-                  false,
-                  MaskMode::kCustom,
-                  half,
-                  half,
-                  int32_t>(handler,
-                           q,
-                           m->q_indptr,
-                           /*q_offset=*/nullptr,
-                           paged_kv,
-                           m->custom_mask,
-                           m->qk_indptr,
-                           o,
-                           /*lse=*/nullptr,
-                           sm_scale,
-                           /*rope_scale=*/1.f,
-                           /*rope_theta=*/static_cast<float>(1e4),
-                           stream);
-            }
-            if (result != cudaSuccess) {
-              throw std::runtime_error(
-                  "Failed to run "
-                  "BatchPrefillWithPagedKVCacheWrapperDispatched" +
-                  std::string(cudaGetErrorString(result)));
-            }
-          })})});
+  DISPATCH_HEADDIM(
+    head_dim, HEAD_DIM, {
+      cudaError_t result;
+      if (bc->prompt_phase) {
+        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+            PageStorage::kIndices,
+            HEAD_DIM,
+            LogitsPostHook::kNone,
+            QKVLayout::kNHD,
+            PosEncodingMode::kNone,
+            false,
+            MaskMode::kCausal,
+            half,
+            half,
+            int32_t>(handler,
+                      q,
+                      m->handle.tree_verify_attention_metadata->q_indptr,
+                      /*q_offset=*/nullptr,
+                      paged_kv,
+                      /*custom_mask=*/nullptr,
+                      /*qk_indptr=*/nullptr,
+                      o,
+                      /*lse=*/nullptr,
+                      num_q_heads,
+                      /*logits_soft_cap=*/0.f,
+                      sm_scale,
+                      /*rope_scale=*/1.f,
+                      /*rope_theta=*/static_cast<float>(1e4),
+                      stream);
+      } else {
+        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
+            PageStorage::kIndices,
+            HEAD_DIM,
+            LogitsPostHook::kNone,
+            QKVLayout::kNHD,
+            PosEncodingMode::kNone,
+            false,
+            MaskMode::kCustom,
+            half,
+            half,
+            int32_t>(handler,
+                      q,
+                      m->handle.tree_verify_attention_metadata->q_indptr,
+                      /*q_offset=*/nullptr,
+                      paged_kv,
+                      m->handle.tree_verify_attention_metadata->custom_mask,
+                      m->handle.tree_verify_attention_metadata->qk_indptr,
+                      o,
+                      /*lse=*/nullptr,
+                      num_q_heads,
+                      /*logits_soft_cap=*/0.f,
+                      sm_scale,
+                      /*rope_scale=*/1.f,
+                      /*rope_theta=*/static_cast<float>(1e4),
+                      stream);
+      }
+    if (result != cudaSuccess) {
+      throw std::runtime_error(
+          "Failed to run "
+          "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+          std::string(cudaGetErrorString(result)));
+    }
+  });
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -679,25 +514,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  // Update gpu-side custom mask referring from CaualMask
-  if (!bc->prompt_phase) {
-    update_custom_mask(m, bc, stream);
-  }
-
-  //   cudaEventRecord(t_end, stream);
-  //   checkCUDA(cudaEventSynchronize(t_end));
-  //   elapsed = 0;
-  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   cudaEventDestroy(t_start);
-  //   cudaEventDestroy(t_end);
-  //   if (device == 0) {
-  //     std::cout << "Update custom mask time: " << elapsed << " ms\n";
-  //   }
-
-  //   cudaEventCreate(&t_start);
-  //   cudaEventCreate(&t_end);
-  //   cudaEventRecord(t_start, stream);
-
   // Update key-val cache, compact q array
   update_qkv_cache<DT>(m, bc, stream);
 
@@ -905,43 +721,14 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
   checkCUDA(get_legion_stream(&stream));
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
 
-  {
-    size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t max_num_pages =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize;
-    size_t indices_size = std::max(
-        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
-    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
-                              BatchConfig::max_spec_tree_token_num() *
-                              (BatchConfig::max_spec_tree_token_num() +
-                               BatchConfig::max_sequence_length());
-    workspace_size = 32 * 1024 * 1024; // 32MB
-
-    gpu_mem_allocator.create_legion_instance(
-        flashinfer_reserve_inst,
-        sizeof(int32_t) * indices_size + sizeof(float) * custom_mask_size +
-            workspace_size);
-
-    q_indptr = gpu_mem_allocator.allocate_instance<int32_t>(indices_size);
-    kv_indptr = q_indptr + batch_size + 1;
-    kv_indices = kv_indptr + batch_size + 1;
-    kv_last_page_len = kv_indices + max_num_pages * batch_size;
-    qk_indptr = kv_last_page_len + batch_size + 1;
-    custom_mask = gpu_mem_allocator.allocate_instance<float>(custom_mask_size);
-    workspace = static_cast<void *>(
-        gpu_mem_allocator.allocate_instance<char>(workspace_size));
-    batch_prefill_handler =
-        static_cast<void *>(new flashinfer::BatchPrefillHandler);
-  }
+  // set attention constants
+  handler.tree_verify_attention_metadata->set_enabled(true);
+  handler.tree_verify_attention_metadata->set_num_q_heads(num_q_heads);
+  handler.tree_verify_attention_metadata->set_num_kv_heads(num_kv_heads);
+  handler.tree_verify_attention_metadata->set_head_dim(qProjSize);
 
   // allocate memory for the seqArray and reserve space
   {
-    causalMask = reinterpret_cast<BatchConfig::BitMask *>(
-        reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available));
     committed_token_infos =
         reinterpret_cast<BatchConfig::CommittedTokensInfo *>(
             reinterpret_cast<char *>(handler.batch_config_metadata) +
@@ -949,16 +736,17 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             sizeof(BatchConfig::requestsInfo) +
             sizeof(BatchConfig::request_available) +
             sizeof(BatchConfig::causalMask));
+    num_tokens_to_commit = 
+        reinterpret_cast<int *>(
+            reinterpret_cast<char *>(committed_token_infos) +
+            sizeof(BatchConfig::committed_tokens));
   }
 
   cudaStreamSynchronize(stream);
 }
 
 TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
-  if (flashinfer_reserve_inst != Realm::RegionInstance::NO_INST) {
-    flashinfer_reserve_inst.destroy();
-  }
-  delete static_cast<flashinfer::BatchPrefillHandler *>(batch_prefill_handler);
+  // delete static_cast<flashinfer::BatchPrefillHandler *>(batch_prefill_handler);
 }
 
 }; // namespace FlexFlow
diff --git a/src/ops/tree_inc_multihead_self_attention.cu.backup b/src/ops/tree_inc_multihead_self_attention.cu.backup
deleted file mode 100644
index c022fabcf..000000000
--- a/src/ops/tree_inc_multihead_self_attention.cu.backup
+++ /dev/null
@@ -1,1119 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-#include "cuComplex.h"
-#endif
-#include "flexflow/ffconst_utils.h"
-#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
-#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
-#include "flexflow/ops/tree_inc_multihead_self_attention.h"
-#include "flexflow/utils/cuda_helper.h"
-
-namespace FlexFlow {
-
-// declare Legion names
-using Legion::coord_t;
-using Legion::Memory;
-
-#define WARP_SIZE 32
-
-using namespace Kernels::IncMultiHeadAttention;
-
-namespace Kernels {
-namespace TreeIncMultiHeadAttention {
-
-template <typename DT,
-          int THREADS_PER_BLOCK,
-          int Dh,
-          int Dh_MAX,
-          int THREADS_PER_KEY,
-          int THREADS_PER_VALUE>
-__global__ void compute_attention_kernel_fused_kernel(
-    DT const *query,
-    DT const *key_cache,
-    DT const *value_cache,
-    DT *output_ptr,
-    float const scale,
-    int const max_seq_length,
-    int const max_token_per_batch,
-    int per_head_size,
-    int hidden_size,
-    /* Reserved: BatchConfig Updated */
-    BatchConfig::PerRequestInfo *request_infos,
-    int num_heads,
-    int num_requests,
-    BatchConfig::BitMask *causalMask,
-    bool *request_available,
-    int qk_smem_sz,
-    bool prompt_phase) {
-
-  // q, k
-  using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using K_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using V_vec = typename VEC_V<DT>::Type;
-  using Out_sum = typename Vec_fp32_<V_vec>::Type;
-
-  constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
-
-  constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT);
-  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;
-  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE;
-  // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT);
-
-  // thread id
-  int const tidx = threadIdx.x;
-  // head id
-  int const head_idx = blockIdx.x;
-  // request idx
-  int const request_idx = blockIdx.y;
-
-  // request id in batch config
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-    }
-  }
-
-  // threads converge
-  //   __syncthreads();
-
-  int const first_step = 0;
-
-  int const tlength =
-      request_infos[requext_idx_in_batch].first_token_index_in_request +
-      request_infos[requext_idx_in_batch].num_tokens_in_batch;
-  int const qlength = request_infos[requext_idx_in_batch].num_tokens_in_batch;
-
-  __shared__ uint64_t bit_mask[BatchConfig::MAX_SPEC_TREE_TOKEN_NUM]
-                              [BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64];
-  for (int i = tidx; i < qlength; i += THREADS_PER_BLOCK) {
-    for (int j = 0; j < BatchConfig::MAX_SPEC_TREE_TOKEN_NUM / 64; j++) {
-      bit_mask[i][j] = causalMask[requext_idx_in_batch].bit_mask[i].bits[j];
-    }
-  }
-
-  int non_tree_cache_size =
-      causalMask[requext_idx_in_batch].non_tree_cache_size;
-
-  int const first_token_idx =
-      request_infos[requext_idx_in_batch].first_token_offset_in_batch;
-
-  int q_start =
-      request_infos[requext_idx_in_batch].first_token_index_in_request;
-
-  // shared memory objects
-  extern __shared__ char smem_[];
-
-  float *qk_smem = reinterpret_cast<float *>(smem_);
-  float *out_smem = reinterpret_cast<float *>(smem_);
-
-  float qk_max = -FLT_MAX;
-
-  // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
-  __shared__ float red_smem[WARPS_PER_BLOCK * 2];
-
-  const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM +
-                    head_idx * per_head_size;
-  __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
-
-  // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE
-  int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE;
-  int ki_o = tidx % THREADS_PER_KEY;
-  // the first key's offset for this thread
-  // ko = 0, 0, 0, 0, 1, 1, 1, 1, ....
-  int ko = tidx / THREADS_PER_KEY;
-  // load q tensor
-  Q_vec q_vec[K_VECS_PER_THREAD];
-
-  constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY;
-  // The number of keys per warp.
-  constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
-
-  DT const *k_cache_batch =
-      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
-
-  int ti_end =
-      div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
-
-  for (int qi = 0; qi < qlength; qi += 1) {
-#pragma unroll
-    for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-      q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
-          q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki +
-          ii * THREADS_PER_KEY * K_VEC_SIZE);
-
-      // if (head_idx == 0 && request_idx == 1 && tidx == 0) {
-      //     printf("laod q %d,  %d %.10f\n",
-      //     request_idx,
-      //            qi,q_vecs[ki_o][ii].x);
-      //   }
-    }
-
-    __syncthreads();
-    for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
-      K_vec k[K_VECS_PER_THREAD];
-      int const ti_circ = ti % max_seq_length;
-#pragma unroll
-      for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-        int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
-        if (ti < tlength) {
-          k[ii] = *reinterpret_cast<K_vec const *>(
-              k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size +
-              jj);
-        }
-      }
-      float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
-
-      if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
-        bool const mask =
-            prompt_phase
-                ? (qi + q_start < ti)
-                : (ti >= non_tree_cache_size &&
-                   (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
-
-        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-
-        // if (head_idx == 0 && !mask) {
-        //   printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f,
-        //   %.10f, %d\n",
-        //          request_idx,
-        //          qi,
-        //          ti,
-        //          qk,
-        //          q_vecs[ki_o][0].x,
-        //          k[0].x,
-        //          bitmask->non_tree_cache_size);
-        // }
-        qk_smem[ti - first_step] = mask ? 0.0f : qk;
-      }
-    }
-
-    __syncthreads();
-
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Decompose the thread index into warp and lane.
-    int const warp = tidx / WARP_SIZE;
-    int const lane = tidx % WARP_SIZE;
-
-    // The warp leader writes the max to shared memory.
-    if (lane == 0) {
-      red_smem[warp] = qk_max;
-    }
-
-    // Make sure the products are in shared memory.
-    __syncthreads();
-
-    // The warps finalize the reduction.
-    qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-    for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) {
-      qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-    }
-
-    // Broadcast to all the threads in the warp.
-    qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
-
-    // if (head_idx == 0 && qi == 9 && tidx == 0) {
-    //   printf("tree attn first token qk_max %f\n", qk_max);
-    // }
-
-    float exp_sum = 0.f;
-    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      bool const mask =
-          prompt_phase ? (q_start + qi < ti)
-                       : (ti >= non_tree_cache_size &&
-                          (!test_bit(bit_mask, qi, ti - non_tree_cache_size)));
-      float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max);
-      exp_sum += logit;
-      qk_smem[ti - first_step] = mask ? 0.0f : logit;
-    }
-
-    // Compute the sum.
-    exp_sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], exp_sum);
-
-    // softmax
-    float inv_sum = __fdividef(1.f, exp_sum + 1.e-6);
-    for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-      qk_smem[ti - first_step] *= inv_sum;
-    }
-
-    __syncthreads();
-
-    // value projection
-    constexpr int V_VEC_SIZE = 16 / sizeof(DT);
-    // A vector of V elements for the current timestep.
-    // using V_vec_k = typename V_vec_k_<DT, V_VEC_SIZE>::Type;
-    // using V_vec_acum = typename V_vec_acum_fp32_<V_vec_k>::Type;
-
-    // The value computed by this thread.
-    int vo = tidx / THREADS_PER_VALUE;
-    // The hidden dimensions computed by this particular thread.
-    int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE;
-    constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
-
-    Out_sum out;
-    zero(out);
-
-    // The base pointer for the value in the cache buffer.
-    DT const *v_cache_batch =
-        value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
-
-    if (Dh == Dh_MAX || vi < Dh) {
-      for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
-        // Load the values from the cache.
-        int const ti_circ = ti % max_seq_length;
-        V_vec v = *reinterpret_cast<V_vec const *>(
-            v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-        float logit = qk_smem[ti - first_step];
-        out = FlexFlow::fma(logit, cast_to_float(v), out);
-      }
-    }
-
-    //   // Make sure we can start writing to shared memory.
-    __syncthreads();
-
-    // Run the final reduction amongst the different groups computing different
-    // partial outputs.
-    if (Dh == Dh_MAX || vi < Dh) {
-#pragma unroll
-      for (int active_groups = V_PER_ITER; active_groups >= 2;
-           active_groups /= 2) {
-
-        // The midpoint in the number of active groups.
-        int midpoint = active_groups / 2;
-
-        // The upper part of active threads store to shared memory.
-        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
-          *reinterpret_cast<Out_sum *>(out_smem + (vo - midpoint) * Dh + vi) =
-              out;
-        }
-        __syncthreads();
-
-        // The bottom warps update their values.
-        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
-          out = add(*reinterpret_cast<Out_sum const *>(out_smem + vo * Dh + vi),
-                    out);
-        }
-        __syncthreads();
-      }
-    }
-
-    // Output the final values.
-    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
-      convert_from_float(*reinterpret_cast<V_vec *>(
-                             output_ptr + (first_token_idx + qi) * hidden_size +
-                             head_idx * per_head_size + vi),
-                         out);
-      // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) {
-      //   printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n",
-      //          out.x,
-      //          out.y,
-      //          out.z,
-      //          out.w,
-      //          vi,
-      //          (first_token_idx + qi) * hidden_size + head_idx *
-      //          per_head_size +
-      //              vi);
-      // }
-    }
-  }
-}
-
-template <typename DT>
-__global__ void commit_tokens_kernel(
-    DT const *devQKVProjArray,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
-    BatchConfig::CommittedTokensInfo const *committedTokenInfos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int num_tokens_to_commit,
-    int num_active_tokens_in_last_batch,
-    int max_seq_len,
-    int hidden_size) {
-
-  CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) {
-
-    int token_pos = i / (hidden_size);
-    int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index;
-    int offset = i % hidden_size;
-    assert(token_idx_in_last_batch < num_active_tokens_in_last_batch);
-
-    size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size +
-                     hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_id = committedTokenInfos[token_pos].request_index;
-    int const tok_id = committedTokenInfos[token_pos].token_depth;
-
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = vVal;
-  }
-}
-
-template <typename DT>
-void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
-                   BatchConfig const *bc,
-                   cudaStream_t stream) {
-  int num_tokens_to_commit = bc->num_tokens_to_commit;
-  if (num_tokens_to_commit > 0) {
-    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
-    commit_tokens_kernel<<<GET_BLOCKS(parallelism),
-                           min(CUDA_NUM_THREADS, parallelism),
-                           0,
-                           stream>>>(
-        static_cast<DT *>(m->devQKVProjArray),
-        static_cast<DT *>(m->keyCache),
-        static_cast<DT *>(m->valueCache),
-        m->committed_token_infos,
-        m->qProjSize,
-        m->kProjSize,
-        m->vProjSize,
-        num_tokens_to_commit,
-        m->num_active_tokens, // number of active tokens in previous batch
-        BatchConfig::max_sequence_length() +
-            BatchConfig::max_spec_tree_token_num(),
-        m->hidden_size);
-  }
-}
-
-template <typename DT>
-__global__ void
-    update_tree_branch_kv_cache(DT const *devQKVProjArray,
-                                DT *kCache_ptr,
-                                DT *vCache_ptr,
-                                BatchConfig::PerTokenInfo const *tokenInfos,
-                                int qProjSize,
-                                int kProjSize,
-                                int vProjSize,
-                                int num_tokens_in_branch,
-                                int processed_tokens_in_batch,
-                                int total_tokens_in_batch,
-                                int max_seq_len,
-                                int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) {
-
-    int token_idx = i / (hidden_size);
-    int offset = i % hidden_size;
-
-    token_idx += processed_tokens_in_batch; // get index in the whole batch
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = vVal;
-  }
-}
-
-template <typename DT>
-__global__ void update_tree_branch_kv_cache_fused(
-    DT const *devQKVProjArray,
-    DT *kCache_ptr,
-    DT *vCache_ptr,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    BatchConfig::PerRequestInfo *request_infos,
-    int qProjSize,
-    int kProjSize,
-    int vProjSize,
-    int num_new_tokens,
-    int max_seq_len,
-    int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_new_tokens * hidden_size) {
-
-    int token_idx = i / hidden_size;
-    int offset = i % hidden_size;
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-
-    int const req_idx = tokenInfos[token_idx].request_index;
-    int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-    // int const tok_id = tokenInfos[token_idx].abs_depth_in_request;
-
-    kCache_ptr[req_idx * (hidden_size * max_seq_len) +
-               token_abs_idx * hidden_size + offset] = kVal;
-    vCache_ptr[req_idx * (hidden_size * max_seq_len) +
-               token_abs_idx * hidden_size + offset] = vVal;
-  }
-}
-
-template <typename DT>
-__global__ void tree_fill_entries_above_diagonal(DT *matrix,
-                                                 size_t new_tokens,
-                                                 size_t total_tokens_in_request,
-                                                 size_t num_q_heads,
-                                                 DT value) {
-  CUDA_KERNEL_LOOP(i, new_tokens * total_tokens_in_request * num_q_heads) {
-    // size_t head_idx = i / (new_tokens * total_tokens_in_request);
-    size_t src_idx = (i / new_tokens) % total_tokens_in_request;
-    size_t dst_idx = i % new_tokens + total_tokens_in_request - new_tokens;
-    // Casual Mask
-    if (src_idx > dst_idx) {
-      matrix[i] = value;
-    }
-  }
-}
-
-template <typename DT>
-void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
-                              BatchConfig const *bc,
-                              int shard_id,
-                              DT *output_ptr,
-                              DT const *bias_ptr,
-                              DT const *weight_ptr,
-                              cudaStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-  // int num_requests = bc->num_active_requests();
-  int processed_tokens_in_batch = 0;
-  // int qkv_block_size =
-  //     (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens();
-  int q_block_size = m->qProjSize;
-  int kt_block_size = m->kProjSize;
-  int kt_req_block_size = kt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  int vt_block_size = m->vProjSize;
-  int vt_req_block_size = vt_block_size * m->num_q_heads *
-                          (BatchConfig::max_sequence_length() +
-                           BatchConfig::max_spec_tree_token_num());
-  assert(m->qProjSize == m->kProjSize);
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i]) {
-      continue;
-    }
-    assert(processed_tokens_in_batch ==
-           bc->requestsInfo[i].first_token_offset_in_batch);
-    int last_token_idx_of_the_request =
-        processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1;
-    while (processed_tokens_in_batch <= last_token_idx_of_the_request) {
-      int num_new_tokens = 1;
-      int j = processed_tokens_in_batch;
-      while ((j + 1 <= last_token_idx_of_the_request) &&
-             (bc->tokensInfo[j].abs_index_in_request + 1 ==
-              bc->tokensInfo[j + 1].abs_index_in_request)) {
-        j++;
-        num_new_tokens++;
-      }
-
-      int total_tokens_in_request = bc->tokensInfo[j].abs_index_in_request + 1;
-      assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
-      {
-        // update K-V cache
-        int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens;
-        update_tree_branch_kv_cache<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-            static_cast<DT *>(m->devQKVProjArray),
-            static_cast<DT *>(m->keyCache),
-            static_cast<DT *>(m->valueCache),
-            m->token_infos,
-            m->qProjSize,
-            m->kProjSize,
-            m->vProjSize,
-            num_new_tokens,            // num_tokens_in_branch
-            processed_tokens_in_batch, // num_processed_tokens_in_batch
-            m->num_active_tokens,      // total_tokens_in_batch
-            BatchConfig::max_sequence_length() +
-                BatchConfig::max_spec_tree_token_num(),
-            m->hidden_size);
-      }
-
-      // bc->token_last_available_idx[i] + 1;
-      // Compute (QK^T/sqrt(d_k))
-      int m_ = num_new_tokens;
-      int n = total_tokens_in_request;
-      int k = m->qProjSize;
-      int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads,
-          ldc = m_;
-      int strideA = q_block_size;
-      int strideB = kt_block_size;
-      int strideC = num_new_tokens * total_tokens_in_request;
-
-      // a flag of using this scaling alpha
-      DT alpha = 1.0f, beta = 0.0f;
-      if (*m->qk_prod_scaling) {
-        alpha = static_cast<DT>(1.0f / sqrt(m->kProjSize));
-      }
-      // To get A, skip over Q entries from previous requests (same head)
-      DT const *A = static_cast<DT *>(m->devQKVProjArray) +
-                    processed_tokens_in_batch * m->qProjSize * m->num_q_heads *
-                        QKV_WEIGHT_NUM;
-      // To get B, skip over K entries from previous requests (all heads +
-      // padding)
-      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-      // To get C, skip over QK^T products from previous requests
-      DT *C = static_cast<DT *>(m->qk_prods);
-
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-      // add alibi position bias to qk production
-      // add alibi position bias to qk production
-      if (*m->position_bias) {
-        size_t parallelism =
-            m->num_q_heads * total_tokens_in_request * num_new_tokens;
-        apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
-                                    min((size_t)CUDA_NUM_THREADS, parallelism),
-                                    0,
-                                    stream>>>(C,
-                                              num_new_tokens,
-                                              total_tokens_in_request,
-                                              m->num_q_heads,
-                                              m->global_num_q_heads,
-                                              shard_id);
-      }
-
-      // Fill all elements above diagonal in qk prods with -inf to force
-      // causal attention.
-      assert(num_new_tokens <= total_tokens_in_request);
-      if (num_new_tokens > 1) {
-        size_t parallelism =
-            m->num_q_heads * num_new_tokens * total_tokens_in_request;
-        tree_fill_entries_above_diagonal<<<GET_BLOCKS(parallelism),
-                                           min((size_t)CUDA_NUM_THREADS,
-                                               parallelism),
-                                           0,
-                                           stream>>>(
-            C,
-            num_new_tokens,
-            total_tokens_in_request,
-            m->num_q_heads,
-            static_cast<DT>(-INFINITY));
-      }
-      // Compute Softmax(QK^T/sqrt(d_k))
-      // Before modifying the parameters below, make sure to read the following
-      // description of the CUDNN_TENSOR_NCHW tensor layout, from
-      // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
-      // This tensor format specifies that the data is laid out in the following
-      // order: batch size, feature maps, rows, columns. The strides are
-      // implicitly defined in such a way that the data are contiguous in memory
-      // with no padding between images, feature maps, rows, and columns; the
-      // columns are the inner dimension and the images are the outermost
-      // dimension.
-      int n_param = m->num_q_heads;
-      int c_param = total_tokens_in_request;
-      int h_param = 1;
-      int w_param = num_new_tokens;
-      checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
-                                            CUDNN_TENSOR_NCHW,
-                                            cudnn_data_type,
-                                            n_param,
-                                            c_param,
-                                            h_param,
-                                            w_param));
-      float softmax_alpha = 1.0f, softmax_beta = 0.0f;
-      DT *C_softmax = static_cast<DT *>(m->qk_prods_softmax);
-      // The softmax operation below is executed according to the
-      // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
-      // softmax operation is computed per spatial location (H,W) per image (N)
-      // across dimension C.
-      checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
-                                     CUDNN_SOFTMAX_ACCURATE,
-                                     CUDNN_SOFTMAX_MODE_CHANNEL,
-                                     &softmax_alpha,
-                                     m->qk_tensor,
-                                     C,
-                                     &softmax_beta,
-                                     m->qk_tensor,
-                                     C_softmax));
-      // Matmul softmax(QK^T/sqrt(d_k)) by V
-      alpha = 1.0f, beta = 0.0f;
-      m_ = m->vProjSize;
-      n = num_new_tokens;
-      k = total_tokens_in_request;
-      lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
-      strideA = vt_block_size;
-      strideB = num_new_tokens * total_tokens_in_request;
-      strideC = m->vProjSize;
-      // To get A, skip over V^T entries from previous requests (all heads +
-      // padding)
-      A = static_cast<DT *>(m->valueCache) + i * vt_req_block_size;
-      // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      B = C_softmax;
-      // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous
-      // requests
-      C = static_cast<DT *>(m->attn_heads) +
-          processed_tokens_in_batch * m->num_q_heads * m->vProjSize;
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_T,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-      processed_tokens_in_batch += num_new_tokens;
-    }
-    // Before moving to the next request
-    // check that we have finished all tokens of the request
-    assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch);
-  }
-  // Project to output, save result directly on output tensor
-  DT alpha = 1.0f, beta = 0.0f;
-  int m_ = m->oProjSize;
-  int k = m->vProjSize * m->num_q_heads;
-  int n = processed_tokens_in_batch;
-  int lda = k, ldb = k, ldc = m_;
-  DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
-                                         m->kProjSize * m->num_q_heads +
-                                         m->vProjSize * m->num_q_heads);
-  DT const *B = static_cast<DT *>(m->attn_heads);
-  DT *C = static_cast<DT *>(output_ptr);
-
-  checkCUDA(cublasGemmEx(m->handle.blas,
-                         CUBLAS_OP_T,
-                         CUBLAS_OP_N,
-                         m_,
-                         n,
-                         k,
-                         &alpha,
-                         A,
-                         cublas_data_type,
-                         lda,
-                         B,
-                         cublas_data_type,
-                         ldb,
-                         &beta,
-                         C,
-                         cublas_data_type,
-                         ldc,
-                         compute_type,
-                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-  if (*m->final_bias && shard_id == 0) {
-    int parallelism = m->oProjSize * processed_tokens_in_batch;
-    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
-                          m->kProjSize * m->global_num_q_heads +
-                          m->vProjSize * m->global_num_q_heads;
-    apply_proj_bias_w<<<GET_BLOCKS(parallelism),
-                        min(CUDA_NUM_THREADS, parallelism),
-                        0,
-                        stream>>>(output_ptr,
-                                  bias_ptr,
-                                  processed_tokens_in_batch,
-                                  qkv_weight_size,
-                                  m->oProjSize);
-  }
-
-  assert(processed_tokens_in_batch == bc->num_active_tokens());
-}
-
-#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(DT,                          \
-                                                  Dh,                          \
-                                                  Dh_MAX,                      \
-                                                  THDS_PER_KEY,                \
-                                                  THDS_PER_VALUE,              \
-                                                  THDS_PER_BLOCK,              \
-                                                  stream,                      \
-                                                  prompt_phase)                \
-  smem_size_in_bytes_tree<DT>(m->qProjSize,                                    \
-                              BatchConfig::max_sequence_length() +             \
-                                  BatchConfig::max_spec_tree_token_num(),      \
-                              THDS_PER_VALUE,                                  \
-                              THDS_PER_BLOCK,                                  \
-                              bc,                                              \
-                              smem_sz);                                        \
-  compute_attention_kernel_fused_kernel<DT,                                    \
-                                        THDS_PER_BLOCK,                        \
-                                        Dh,                                    \
-                                        Dh_MAX,                                \
-                                        THDS_PER_KEY,                          \
-                                        THDS_PER_VALUE>                        \
-      <<<grid,                                                                 \
-         THDS_PER_BLOCK,                                                       \
-         smem_sz[1],                                                           \
-         stream>>>(static_cast<DT *>(m->devQKVProjArray),                      \
-                   static_cast<DT *>(m->keyCache),                             \
-                   static_cast<DT *>(m->valueCache),                           \
-                   output_ptr,                                                 \
-                   scale,                                                      \
-                   BatchConfig::max_sequence_length() +                        \
-                       BatchConfig::max_spec_tree_token_num(),                 \
-                   BatchConfig::max_tokens_per_batch(),                        \
-                   m->qProjSize,                                               \
-                   m->hidden_size,                                             \
-                   m->request_infos,                                           \
-                   m->num_q_heads,                                             \
-                   bc->num_active_requests(),                                  \
-                   m->causalMask,                                              \
-                   m->request_available,                                       \
-                   smem_sz[0],                                                 \
-                   prompt_phase)
-
-template <typename DT>
-void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m,
-                                    BatchConfig const *bc,
-                                    DT *output_ptr,
-                                    cudaStream_t stream) {
-
-  // update the kv cache
-  //  update K-V cache
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  update_tree_branch_kv_cache_fused<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<DT *>(m->keyCache),
-      static_cast<DT *>(m->valueCache),
-      m->token_infos,
-      m->request_infos,
-      m->qProjSize,
-      m->kProjSize,
-      m->vProjSize,
-      num_new_tokens,
-      BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num(),
-      m->hidden_size);
-
-  // cudaEvent_t t_start, t_end;
-  // cudaEventCreate(&t_start);
-  // cudaEventCreate(&t_end);
-  // cudaEventRecord(t_start, stream);
-
-  dim3 grid(m->num_q_heads, bc->num_active_requests());
-  int const per_head_size = m->qProjSize;
-  float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  // 0->qk production size, 1->total shared size
-  // per_head_size: 128, thd_per_v:32, prompt_phase: 0
-  int smem_sz[2];
-  if (per_head_size == 64) {
-    constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
-    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream, bc->prompt_phase);
-  } else if (per_head_size == 128) {
-    constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
-    LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream, bc->prompt_phase);
-  } else {
-    assert(false && "a unsupported head size");
-  }
-
-  // cudaEventRecord(t_end, stream);
-  // checkCUDA(cudaEventSynchronize(t_end));
-  // float elapsed = 0;
-  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // printf("TreeIncMultiHeadSelfAttention part 2 time: %.2f ms\n", elapsed);
-  // cudaEventDestroy(t_start);
-  // cudaEventDestroy(t_end);
-
-}
-
-template <typename DT>
-void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
-                      BatchConfig const *bc,
-                      int shard_id,
-                      DT const *input_ptr,
-                      DT const *weight_ptr,
-                      DT *output_ptr,
-                      DT const *bias_ptr,
-                      cudaStream_t stream) {
-  // additional processing for weight uploading
-  if (m->handle.offload_reserve_space != nullptr) {
-    // Note that we update weight_ptr and bias_ptr when uploading weight and
-    // bias
-    cudaMemcpyAsync(m->weight_ptr,
-                    weight_ptr,
-                    m->weightSize,
-                    cudaMemcpyHostToDevice,
-                    stream);
-    weight_ptr = static_cast<DT *>(m->weight_ptr);
-    if (m->biasSize > 0) {
-      cudaMemcpyAsync(
-          m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
-      bias_ptr = static_cast<DT *>(m->bias_ptr);
-    }
-  }
-
-  // copy committed tokens info to GPU for the commit_tokens kernel
-  // Note that m->num_active_tokens stores the number of active
-  // tokens in the previous batch, which is needed for committing
-  // keys/values to the key-value cache
-  // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit <<
-  // "\n";
-
-  commit_tokens<DT>(m, bc, stream);
-
-  // After commit we update m->num_active_tokens to be the number of active
-  // tokens for the current batch
-  m->num_active_tokens = bc->num_active_tokens();
-
-  // here because we need postion info in infernece 1
-  if (m->offload && m->biasSize > 0) {
-    cudaMemcpyAsync(
-        m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
-    bias_ptr = static_cast<DT *>(m->bias_ptr);
-  }
-  // phase 1: Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-
-  // phase 2: No need to update key/val cache
-  // IncMultiHeadSelfAttention::update_kv_cache_kernel(
-  //    m, bc, stream);
-  // use the new kernel
-  compute_attention_kernel_fused<DT>(
-      m, bc, static_cast<DT *>(m->attn_heads), stream);
-
-  // Debug output:
-  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
-  //   float *temp_output = new float[size];
-  //   cudaDeviceSynchronize();
-  //   cudaMemcpy(
-  //       temp_output, m->attn_heads, size * sizeof(float),
-  //       cudaMemcpyDeviceToHost);
-  //   printf("Output: ");
-  //   for (int i = 0; i < 1; ++i) {
-  //     float temp = 0;
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
-  //     }
-  //     printf("%.6f ", temp);
-  //   }
-  //   printf("\n");
-
-  //   delete[] temp_output;
-
-  int processed_tokens_in_batch = bc->num_active_tokens();
-
-  compute_o_prod_bias(m,
-                      bc,
-                      shard_id,
-                      output_ptr,
-                      weight_ptr,
-                      bias_ptr,
-                      processed_tokens_in_batch,
-                      stream);
-}
-
-} // namespace TreeIncMultiHeadAttention
-} // namespace Kernels
-
-/*static*/
-void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
-    TreeIncMultiHeadSelfAttentionMeta *m,
-    BatchConfig const *bc,
-    int shard_id,
-    GenericTensorAccessorR const &input,
-    GenericTensorAccessorR const &weight,
-    GenericTensorAccessorW const &output,
-    GenericTensorAccessorR const &bias) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  bool use_bias = *m->qkv_bias || *m->final_bias;
-
-  cudaEvent_t t_start, t_end;
-  if (m->profiling) {
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-  }
-
-  // assert(input.data_type == weight.data_type);
-  assert(input.data_type == output.data_type);
-  if (use_bias) {
-    assert(input.data_type == bias.data_type);
-  }
-
-  if (input.data_type == DT_HALF) {
-    if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
-    }
-
-    half const *bias_ptr =
-        use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
-    Kernels::TreeIncMultiHeadAttention::inference_kernel(
-        m,
-        bc,
-        shard_id,
-        input.get_half_ptr(),
-        m->offload ? static_cast<half *>(m->weight_ptr) : weight.get_half_ptr(),
-        output.get_half_ptr(),
-        bias_ptr,
-        stream);
-  } else if (input.data_type == DT_FLOAT) {
-    if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
-    }
-    float const *bias_ptr =
-        use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-    Kernels::TreeIncMultiHeadAttention::inference_kernel(
-        m,
-        bc,
-        shard_id,
-        input.get_float_ptr(),
-        m->offload ? static_cast<float *>(m->weight_ptr)
-                   : weight.get_float_ptr(),
-        output.get_float_ptr(),
-        bias_ptr,
-        stream);
-  } else {
-    assert(false && "Unspported data type");
-  }
-
-  if (m->profiling) {
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    float elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-  }
-}
-
-TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
-    FFHandler handler,
-    TreeIncMultiHeadSelfAttention const *attn,
-    GenericTensorAccessorR const &weight,
-    MemoryAllocator &gpu_mem_allocator,
-    int num_samples,
-    int _num_q_heads,
-    int _num_kv_heads)
-    : IncMultiHeadSelfAttentionMeta(handler,
-                                    TREE_VERIFY_MODE,
-                                    attn,
-                                    attn->qSize,
-                                    attn->kSize,
-                                    attn->vSize,
-                                    attn->qProjSize,
-                                    attn->kProjSize,
-                                    attn->vProjSize,
-                                    attn->oProjSize,
-                                    attn->apply_rotary_embedding,
-                                    attn->qkv_bias,
-                                    attn->scaling_query,
-                                    attn->qk_prod_scaling,
-                                    attn->position_bias,
-                                    attn->final_bias,
-                                    attn->scaling_factor,
-                                    weight,
-                                    gpu_mem_allocator,
-                                    num_samples,
-                                    attn->num_q_heads,
-                                    attn->num_kv_heads,
-                                    _num_q_heads,
-                                    _num_kv_heads,
-                                    attn->quantization_type,
-                                    attn->offload),
-      num_active_tokens(0) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkCUDNN(cudnnSetStream(handler.dnn, stream));
-
-  // allocate memory for the seqArray and reserve space
-  {
-    causalMask = reinterpret_cast<BatchConfig::BitMask *>(
-        reinterpret_cast<char *>(handler.batch_config_metadata) +
-        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available));
-    committed_token_infos =
-        reinterpret_cast<BatchConfig::CommittedTokensInfo *>(
-            reinterpret_cast<char *>(handler.batch_config_metadata) +
-            sizeof(BatchConfig::tokensInfo) +
-            sizeof(BatchConfig::requestsInfo) +
-            sizeof(BatchConfig::request_available) +
-            sizeof(BatchConfig::causalMask));
-  }
-
-  cudaStreamSynchronize(stream);
-}
-
-TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
-  if (committed_token_reserve_inst != Realm::RegionInstance::NO_INST) {
-    committed_token_reserve_inst.destroy();
-  }
-}
-
-}; // namespace FlexFlow
diff --git a/src/ops/tree_inc_multihead_self_attention_impl.cu b/src/ops/tree_inc_multihead_self_attention_impl.cu
deleted file mode 100644
index 0af0529af..000000000
--- a/src/ops/tree_inc_multihead_self_attention_impl.cu
+++ /dev/null
@@ -1,509 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-#include "cuComplex.h"
-#endif
-#include "flashinfer/attention_impl.cuh"
-
-// This is for instantiating the template attention kernels
-namespace flashinfer {
-
-// group_size[] = {1, 4, 8};
-// head_dim[] = {64, 128, 256};
-
-/********** prefill instantiations for half precision **********/
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCausal, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-
-/********** append instantiations for half precision **********/
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  1, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  4, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 64, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 128, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-template cudaError_t SinglePrefillWithKVCacheDispatched<
-  8, 256, QKVLayout::kNHD, PosEncodingMode::kNone,
-  false, MaskMode::kCustom, half, half>(
-    half* q, half* k, half* v, float* custom_mask, half* o,
-    float* tmp, float* lse, uint32_t num_kv_heads, uint32_t qo_len, uint32_t kv_len,
-    float sm_scale, float rope_scale,
-    float rope_theta, cudaStream_t stream);
-
-
-constexpr uint32_t kPagesize = 64;
-// num_frags_x[] = {1, 2};
-// group_size[] = {1, 4, 8};
-// head_dim[] = {64, 128, 256};
-
-/********** batch append instantiations for half precision **********/
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 64, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 128, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 256, PosEncodingMode::kNone, false, MaskMode::kCustom,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-
-/********** batch prefill instantiations for half precision **********/
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  1, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  4, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 1, kPagesize,
-  8, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  1, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  4, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 64, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 128, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<
-  PageStorage::kIndices, QKVLayout::kNHD, 2, kPagesize,
-  8, 256, PosEncodingMode::kNone, false, MaskMode::kCausal,
-  half, half, int32_t>(
-    half* q, int32_t* request_indices, int32_t* tile_indices, int32_t* qo_indptr, int32_t* q_offset,
-    paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, float* custom_mask,
-    int32_t* qk_indptr, half* o, float* tmp, float* lse, uint32_t num_qo_tiles, float sm_scale,
-    float rope_scale, float rope_theta, cudaStream_t stream);
-} // namespace flashinfer
diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp
index ad2b78156..37eda6e35 100644
--- a/src/runtime/model.cpp
+++ b/src/runtime/model.cpp
@@ -155,7 +155,7 @@ FFHandler
   } else {
     handle.offload_reserve_space = nullptr;
   }
-  if (handle.batch_config_metadata_size > 0) {
+  if (handle.batch_config_metadata_size + handle.attention_metadata->mem_size() > 0) {
     // allocate memory for offload reserve space
     Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                          .only_kind(Memory::GPU_FB_MEM)
@@ -163,7 +163,7 @@ FFHandler
                          .first();
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
-        Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1));
+        Realm::Point<1, coord_t>(handle.batch_config_metadata_size + handle.attention_metadata->mem_size() - 1));
     std::vector<size_t> field_sizes;
     field_sizes.push_back(sizeof(char));
     Realm::RegionInstance workspaceInst;
@@ -176,8 +176,13 @@ FFHandler
         .wait();
     handle.batch_config_metadata =
         workspaceInst.pointer_untyped(0, sizeof(char));
+    handle.attention_metadata->assign_address(
+        static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
+                            handle.batch_config_metadata_size),
+        handle.attention_metadata->mem_size());
   } else {
     handle.batch_config_metadata = nullptr;
+    handle.attention_metadata->assign_address(nullptr, 0);
   }
   // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize));
 #ifdef FF_USE_NCCL
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 23b7f0efb..ab42539cb 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -89,6 +89,10 @@ FFHandler
   handle.offload_reserve_space_size = info->offload_reserve_space_size;
   handle.quantization_type = info->quantization_type;
   handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion;
+  handle.tree_search_attention_metadata = new AttentionMetaData();
+  handle.tree_verify_attention_metadata = new AttentionMetaData();
+  assert(handle.tree_search_attention_metadata != nullptr && "Attention metadata must be allocated");
+  assert(handle.tree_verify_attention_metadata != nullptr && "Attention metadata must be allocated");
   checkCUDA(cublasCreate(&handle.blas));
   if (handle.allowTensorOpMathConversion) {
     checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH));
@@ -151,7 +155,7 @@ FFHandler
   } else {
     handle.offload_reserve_space = nullptr;
   }
-  if (handle.batch_config_metadata_size > 0) {
+  if (handle.batch_config_metadata_size + handle.tree_search_attention_metadata->mem_size() + handle.tree_verify_attention_metadata->mem_size() > 0) {
     // allocate memory for offload reserve space
     Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                          .only_kind(Memory::GPU_FB_MEM)
@@ -159,7 +163,7 @@ FFHandler
                          .first();
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
-        Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1));
+        Realm::Point<1, coord_t>(handle.batch_config_metadata_size + handle.tree_search_attention_metadata->mem_size() + handle.tree_verify_attention_metadata->mem_size() - 1));
     std::vector<size_t> field_sizes;
     field_sizes.push_back(sizeof(char));
     Realm::RegionInstance workspaceInst;
@@ -172,8 +176,19 @@ FFHandler
         .wait();
     handle.batch_config_metadata =
         workspaceInst.pointer_untyped(0, sizeof(char));
+    handle.tree_search_attention_metadata->assign_address(
+        static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
+                            handle.batch_config_metadata_size),
+        handle.tree_search_attention_metadata->mem_size());
+    handle.tree_verify_attention_metadata->assign_address(
+        static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
+                            handle.batch_config_metadata_size +
+                            handle.tree_search_attention_metadata->mem_size()),
+        handle.tree_verify_attention_metadata->mem_size());
   } else {
     handle.batch_config_metadata = nullptr;
+    handle.tree_search_attention_metadata->assign_address(nullptr, 0);
+    handle.tree_verify_attention_metadata->assign_address(nullptr, 0);
   }
 
   // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize));
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a904ef40a..e339168f3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1447,6 +1447,8 @@ bool RequestManager::update_ssm_inference_results(
         (Realm::Clock::current_time_in_microseconds() -
          profiling.ssm_step_start) *
         1e-3);
+    profiling.ssm_steps.push_back(current_ssm_step);
+    printf("SSM step finished\n");
   }
   return all_request_last_layer_empty;
 }
@@ -2221,6 +2223,15 @@ void RequestManager::terminate_background_server() {
       ssm_step_times_ms += ")";
       str += ssm_step_times_ms;
     }
+    if (profiling.ssm_steps.size() > 0) {
+      str += "\n ssm_steps(";
+      std::string ssm_steps = " ";
+      for (int nb : profiling.ssm_steps) {
+        ssm_steps += std::to_string(nb) + " ";
+      }
+      ssm_steps += ")";
+      str += ssm_steps;
+    }
     str += "\n generated_tokens_per_step(";
     std::string generated_tokens_per_step = " ";
     for (int nb : profiling.generated_tokens_per_step) {
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index ac5cc5e88..7e03a848e 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/request_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 
@@ -20,6 +21,8 @@ namespace FlexFlow {
 
 using namespace Legion;
 
+using flashinfer::BatchPrefillHandler;
+
 void RequestManager::load_tokens_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
@@ -67,6 +70,107 @@ void RequestManager::load_tokens_task(
   }
 }
 
+// NOTE: qk_indptr is accumulative `ceil(qk_len / 8)`
+__global__ void
+    prepare_inference_params_kernel(int const num_requests,
+                                    BatchConfig::PerRequestInfo *request_infos,
+                                    bool *request_available,
+                                    uint32_t const max_num_pages,
+                                    int32_t *q_indptr,
+                                    int32_t *kv_indptr,
+                                    int32_t *kv_indices,
+                                    int32_t *kv_last_page_len,
+                                    int32_t *qk_indptr) {
+  int const request_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (request_idx >= num_requests) {
+    return;
+  }
+
+  // request id in batch config
+  int requext_idx_in_batch = -1;
+  int cnt_1 = 0, q_lens = 0, qk_lens = 0;
+  int indices_offset = 0, indices_lens = 0, kv_len = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+      int q_len = request_infos[requext_idx_in_batch].num_tokens_in_batch;
+      q_lens += q_len;
+      kv_len = request_infos[requext_idx_in_batch].num_tokens_in_batch +
+               request_infos[requext_idx_in_batch].first_token_index_in_request;
+      qk_lens += (q_len * kv_len + 7) / 8;
+      indices_offset = indices_lens;
+      indices_lens += (kv_len + kPagesize - 1) / kPagesize;
+    }
+  }
+
+  if (request_idx == 0) {
+    q_indptr[0] = 0;
+    kv_indptr[0] = 0;
+    qk_indptr[0] = 0;
+  }
+  __syncthreads();
+  q_indptr[request_idx + 1] = q_lens;
+  kv_indptr[request_idx + 1] = indices_lens;
+  for (int i = indices_offset; i < indices_lens; i++) {
+    kv_indices[i] = max_num_pages * requext_idx_in_batch + (i - indices_offset);
+  }
+  kv_last_page_len[request_idx] = (kv_len - 1) % kPagesize + 1;
+  qk_indptr[request_idx + 1] = qk_lens;
+}
+
+#define test_bit_orig(bit_mask, idx, pos)                                           \
+  (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
+
+__global__ void
+    update_custom_mask_kernel(uint8_t *custom_mask,
+                              int32_t const *qk_indptr,
+                              BatchConfig::BitMask *causalMask,
+                              BatchConfig::PerRequestInfo *request_infos,
+                              bool *request_available,
+                              uint32_t const num_requests) {
+  int byte_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int request_idx = 0;
+  while (request_idx < num_requests) {
+    if (qk_indptr[request_idx + 1] > byte_idx) {
+      break;
+    }
+    request_idx++;
+  }
+
+  if (request_idx >= num_requests) {
+    return;
+  }
+  byte_idx -= qk_indptr[request_idx];
+
+  // request id in batch config
+  int requext_idx_in_batch = -1, cnt_1 = 0;
+  while (cnt_1 < request_idx + 1) {
+    requext_idx_in_batch++;
+    if (request_available[requext_idx_in_batch]) {
+      cnt_1++;
+    }
+  }
+
+  int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch,
+            q_start = request_infos[requext_idx_in_batch].first_token_index_in_request;
+
+  uint8_t packed_bits = 0;
+  for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
+    int const bit_offset = byte_idx * 8 + bit_idx,
+              q_idx = bit_offset / (q_start + q_length),
+              kv_idx = bit_offset % (q_start + q_length);
+    if (kv_idx < q_start || q_idx >= q_length) {
+      packed_bits |= 1 << bit_idx;
+    } else {
+      if (test_bit_orig(causalMask[requext_idx_in_batch].bit_mask, q_idx, kv_idx - q_start)) {
+        packed_bits |= 1 << bit_idx;
+      }
+    }
+  }
+  custom_mask[qk_indptr[request_idx] + byte_idx] = packed_bits;
+}
+
 void RequestManager::load_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
@@ -116,47 +220,267 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::request_available);
 
-  // load speculative metadata
+  // load attention metadata
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
-    for (int request_idx = 0;
-         request_idx < BatchConfig::max_requests_per_batch();
-         request_idx++) {
-      if (batch_config->request_available[request_idx]) {
-        checkCUDA(cudaMemcpyAsync(
-            static_cast<char *>(handle.batch_config_metadata) +
-                total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
-            &(batch_config->causalMask[request_idx]),
-            sizeof(BatchConfig::BitMask),
-            cudaMemcpyHostToDevice,
-            stream));
+    if (handle.tree_search_attention_metadata->enabled()) {
+      for (int request_idx = 0;
+          request_idx < BatchConfig::max_requests_per_batch();
+          request_idx++) {
+        if (batch_config->request_available[request_idx]) {
+          checkCUDA(cudaMemcpyAsync(
+              static_cast<char *>(handle.batch_config_metadata) +
+                  total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+              &(batch_config->causalMask[request_idx]),
+              sizeof(BatchConfig::BitMask),
+              cudaMemcpyHostToDevice,
+              stream));
+        }
+      }
+      total_copy_size += sizeof(BatchConfig::causalMask);
+
+      // calculate the attention meta data
+      {
+        BatchConfig::PerRequestInfo *request_infos = reinterpret_cast<BatchConfig::PerRequestInfo *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo));
+        bool *request_available = reinterpret_cast<bool *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo) +
+          sizeof(BatchConfig::requestsInfo));
+        BatchConfig::BitMask *causalMask = reinterpret_cast<BatchConfig::BitMask *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo) +
+          sizeof(BatchConfig::requestsInfo) +
+          sizeof(BatchConfig::request_available));
+        int batch_size = batch_config->num_active_requests();
+        uint32_t const max_num_pages = (BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+
+        int parallelism = batch_size;
+        prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
+                                          min(CUDA_NUM_THREADS, parallelism),
+                                          0,
+                                          stream>>>(batch_size,
+                                                    request_infos,
+                                                    request_available,
+                                                    max_num_pages,
+                                                    handle.tree_search_attention_metadata->q_indptr,
+                                                    handle.tree_search_attention_metadata->kv_indptr,
+                                                    handle.tree_search_attention_metadata->kv_indices,
+                                                    handle.tree_search_attention_metadata->kv_last_page_len,
+                                                    handle.tree_search_attention_metadata->qk_indptr);
+
+        // Update gpu-side custom mask referring from CaualMask
+        if (!batch_config->prompt_phase) {
+          int parallelism = 0;
+          for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+            if (batch_config->request_available[req_idx]) {
+              int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+              int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+              parallelism += (q_len * kv_len + 7) / 8;
+            }
+          }
+          update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(handle.tree_search_attention_metadata->custom_mask,
+                                                handle.tree_search_attention_metadata->qk_indptr,
+                                                causalMask,
+                                                request_infos,
+                                                request_available,
+                                                batch_size);
+        }
+      }
+
+      // prepare attention forward handler
+      {
+        int batch_size = batch_config->num_active_requests();
+        BatchPrefillHandler *handler = nullptr;
+
+        if (!batch_config->prompt_phase) {
+          if (handle.tree_search_attention_metadata->decode_handler_collections.count(batch_size) == 0) {
+            handle.tree_search_attention_metadata->decode_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
+          }
+          handler = static_cast<BatchPrefillHandler *>(
+            handle.tree_search_attention_metadata->decode_handler_collections[batch_size]);
+        } else {
+          if (handle.tree_search_attention_metadata->prompt_handler_collections.count(batch_size) == 0) {
+            handle.tree_search_attention_metadata->prompt_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
+          }
+          handler = static_cast<BatchPrefillHandler *>(
+            handle.tree_search_attention_metadata->prompt_handler_collections[batch_size]);
+        }
+
+        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+        q_indptr_h[0] = 0;
+        kv_indptr_h[0] = 0;
+        for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+          if (batch_config->request_available[req_idx]) {
+            int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+            int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                        batch_config->requestsInfo[req_idx].first_token_index_in_request;
+            q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+            kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+            indptr_idx++;
+          }
+        }
+
+        handler->SetCUDAStream(stream);
+        handler->BeginForward<half, int32_t>(static_cast<void*>(
+                                              static_cast<char*>(handle.tree_search_attention_metadata->workspace) +
+                                              handle.tree_search_attention_metadata->workspace_block * batch_size),
+                                            handle.tree_search_attention_metadata->workspace_block,
+                                            static_cast<int32_t *>(q_indptr_h),
+                                            static_cast<int32_t *>(kv_indptr_h),
+                                            batch_size,
+                                            handle.tree_search_attention_metadata->num_q_heads(),
+                                            handle.tree_search_attention_metadata->num_kv_heads(),
+                                            handle.tree_search_attention_metadata->head_dim(),
+                                            kPagesize);
       }
     }
-    total_copy_size += sizeof(BatchConfig::causalMask);
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
-    for (int request_idx = 0;
-         request_idx < BatchConfig::max_requests_per_batch();
-         request_idx++) {
-      if (batch_config->request_available[request_idx]) {
+    if (handle.tree_verify_attention_metadata->enabled()) {
+      for (int request_idx = 0;
+          request_idx < BatchConfig::max_requests_per_batch();
+          request_idx++) {
+        if (batch_config->request_available[request_idx]) {
+          checkCUDA(cudaMemcpyAsync(
+              static_cast<char *>(handle.batch_config_metadata) +
+                  total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+              &(batch_config->causalMask[request_idx]),
+              sizeof(BatchConfig::BitMask),
+              cudaMemcpyHostToDevice,
+              stream));
+        }
+      }
+      total_copy_size += sizeof(BatchConfig::causalMask);
+
+      if (batch_config->num_tokens_to_commit > 0) {
         checkCUDA(cudaMemcpyAsync(
-            static_cast<char *>(handle.batch_config_metadata) +
-                total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
-            &(batch_config->causalMask[request_idx]),
-            sizeof(BatchConfig::BitMask),
+            static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+            &(batch_config->committed_tokens),
+            batch_config->num_tokens_to_commit *
+                sizeof(BatchConfig::CommittedTokensInfo),
             cudaMemcpyHostToDevice,
             stream));
       }
-    }
-    total_copy_size += sizeof(BatchConfig::causalMask);
+      total_copy_size += sizeof(BatchConfig::committed_tokens);
 
-    if (batch_config->num_tokens_to_commit > 0) {
       checkCUDA(cudaMemcpyAsync(
           static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-          &(batch_config->committed_tokens),
-          batch_config->num_tokens_to_commit *
-              sizeof(BatchConfig::CommittedTokensInfo),
+          &(batch_config->num_tokens_to_commit),
+          sizeof(int),
           cudaMemcpyHostToDevice,
           stream));
-      total_copy_size += sizeof(BatchConfig::committed_tokens);
+      total_copy_size += sizeof(int);
+
+      // calculate the attention meta data
+      {
+        BatchConfig::PerRequestInfo *request_infos = reinterpret_cast<BatchConfig::PerRequestInfo *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo));
+        bool *request_available = reinterpret_cast<bool *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo) +
+          sizeof(BatchConfig::requestsInfo));
+        BatchConfig::BitMask *causalMask = reinterpret_cast<BatchConfig::BitMask *>(
+          static_cast<char *>(handle.batch_config_metadata) +
+          sizeof(BatchConfig::tokensInfo) +
+          sizeof(BatchConfig::requestsInfo) +
+          sizeof(BatchConfig::request_available));
+        int batch_size = batch_config->num_active_requests();
+        uint32_t const max_num_pages = (BatchConfig::max_sequence_length() +
+          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+
+        int parallelism = batch_size;
+        prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
+                                          min(CUDA_NUM_THREADS, parallelism),
+                                          0,
+                                          stream>>>(batch_size,
+                                                    request_infos,
+                                                    request_available,
+                                                    max_num_pages,
+                                                    handle.tree_verify_attention_metadata->q_indptr,
+                                                    handle.tree_verify_attention_metadata->kv_indptr,
+                                                    handle.tree_verify_attention_metadata->kv_indices,
+                                                    handle.tree_verify_attention_metadata->kv_last_page_len,
+                                                    handle.tree_verify_attention_metadata->qk_indptr);
+
+        // Update gpu-side custom mask referring from CaualMask
+        if (!batch_config->prompt_phase) {
+          int parallelism = 0;
+          for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+            if (batch_config->request_available[req_idx]) {
+              int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+              int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+              parallelism += (q_len * kv_len + 7) / 8;
+            }
+          }
+          update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(handle.tree_verify_attention_metadata->custom_mask,
+                                                handle.tree_verify_attention_metadata->qk_indptr,
+                                                causalMask,
+                                                request_infos,
+                                                request_available,
+                                                batch_size);
+        }
+      }
+
+      // prepare attention forward handler
+      {
+        int batch_size = batch_config->num_active_requests();
+        BatchPrefillHandler *handler = nullptr;
+
+        if (!batch_config->prompt_phase) {
+          if (handle.tree_verify_attention_metadata->decode_handler_collections.count(batch_size) == 0) {
+            handle.tree_verify_attention_metadata->decode_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
+          }
+          handler = static_cast<BatchPrefillHandler *>(
+            handle.tree_verify_attention_metadata->decode_handler_collections[batch_size]);
+        } else {
+          if (handle.tree_verify_attention_metadata->prompt_handler_collections.count(batch_size) == 0) {
+            handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
+          }
+          handler = static_cast<BatchPrefillHandler *>(
+            handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size]);
+        }
+
+        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+        q_indptr_h[0] = 0;
+        kv_indptr_h[0] = 0;
+        for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+          if (batch_config->request_available[req_idx]) {
+            int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+            int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                        batch_config->requestsInfo[req_idx].first_token_index_in_request;
+            q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+            kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+            indptr_idx++;
+          }
+        }
+
+        handler->SetCUDAStream(stream);
+        handler->BeginForward<half, int32_t>(static_cast<void*>(
+                                              static_cast<char*>(handle.tree_verify_attention_metadata->workspace) +
+                                              handle.tree_verify_attention_metadata->workspace_block * batch_size),
+                                            handle.tree_verify_attention_metadata->workspace_block,
+                                            static_cast<int32_t *>(q_indptr_h),
+                                            static_cast<int32_t *>(kv_indptr_h),
+                                            batch_size,
+                                            handle.tree_verify_attention_metadata->num_q_heads(),
+                                            handle.tree_verify_attention_metadata->num_kv_heads(),
+                                            handle.tree_verify_attention_metadata->head_dim(),
+                                            kPagesize);
+      }
     }
   }
 

From 2b026484e49a0b522d619101798ddc0ef2854e21 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com>
Date: Wed, 14 Aug 2024 21:04:09 -0400
Subject: [PATCH 382/667] Custom AllReduce (#1467)

* feat: update deps/flashinfer

* feat: update flashinfer

* fix: now can get correct result, but has performance problem

* fix: update_custom_mask performance

* chore: minor

* chore: add perf code

* feat: add attention metadata

* feat: add AttentionMetaData

* feat: tree_verify_attn use global attentionmetadata

* feat: move attentionmetasize to global computing

* chore: minor

* chore: remove unused

* feat: add spec_inc_attn backup

* feat: SSM use flashinfer kernel

* fix: SSM don't use cudaGraph

* chore: remove redundant code

* chore: comment out minor

* feat: attention adapt to cudaGraph

* fix: split handler_collections for prompt/decode phases

* chore: tree verify cannot use cudaGraph

* feat: move all flashinfer-related states to global (tree search attention)

* fix: use identical attention_meta instance across all FFHandlers

* feat: enable cudaGraph in tree search mode

* chore: minor

* feat: tree search & verify use separate attention_meta

* fix: attention_metadata should be distinct for each worker

* feat: tree verify attention use metadata

* feat: support llm cudaGraph

* chore: minor

* chore: temporally only enable SSM cudaGraph for performance issue

* chore: minor

* fix: llm cudaGraph, should ensure the kernel parameter be consistent

* feat: reduce cudaGraph number

* feat: reduce cudaGraph instances number

* feat: add tensorRT-LLM custom_allreduce

* feat: add tensorrt_llm custom_allreduce kernel into exeutable

* doc: add a README for acknowledgement

* feat: add device info in FFHandle

* feat: enable both cudaGraph

* feat: temporally add the ipc mem

* feat: enable only ssm cudaGraph

* feat: minor reconstruct

* feat: implementation of CommunicationBuffer

* feat: implement custom_allreduce

* feat: allocate memory  from legion, not cudaMalloc

* chore: some debug output

* feat: switch to use peer memory, rather than IPC memory

* chore: remove debug output

* fix: minor concurrent bug

* style: format code

* chore: remove unused backup code

* chore: more measurements

---------

Co-authored-by: zikun-li <lizikunzk@gmail.com>
---
 CMakeLists.txt                                |   6 +
 deps/tensorrt_llm/README.md                   |   5 +
 .../tensorrt_llm/custom_allreduce_kernels.cu  | 448 +++++++++++++
 .../tensorrt_llm/custom_allreduce_kernels.h   |  88 +++
 include/flexflow/config.h                     |  75 ++-
 include/flexflow/ops/fused.h                  |   8 +-
 include/flexflow/ops/graph_params.h           |  70 +-
 include/flexflow/ops/gumbel_topk.h            |  39 +-
 .../ops/spec_inc_multihead_self_attention.h   |  15 +-
 .../parallel_ops/kernels/allreduce_kernels.h  |  18 +-
 include/flexflow/request_manager.h            |   8 +-
 include/flexflow/utils/communication_buffer.h |  74 +++
 src/c/flexflow_c.cc                           |   4 +-
 src/ops/add_bias_residual_layer_norm.cc       |   2 +-
 src/ops/aggregate.cc                          |  11 +-
 src/ops/aggregate_spec.cc                     |  12 +-
 src/ops/attention.cc                          |   2 +-
 src/ops/cast.cc                               |  11 +-
 src/ops/element_binary.cc                     |  12 +-
 src/ops/element_unary.cc                      |  12 +-
 src/ops/embedding.cc                          |  11 +-
 src/ops/experts.cc                            |  11 +-
 src/ops/fused.cc                              |  11 +-
 src/ops/fused.cu                              | 154 +++--
 src/ops/group_by.cc                           |  11 +-
 src/ops/gumbel_topk.cu                        | 176 +++---
 src/ops/inc_multihead_self_attention.cc       |   2 +-
 src/ops/layer_norm.cc                         |  11 +-
 src/ops/linear.cc                             |  11 +-
 src/ops/multihead_self_attention_impl.cu      | 596 ++++++++++++++----
 src/ops/noop.cc                               |  11 +-
 src/ops/residual_layer_norm.cc                |   2 +-
 src/ops/residual_rms_norm.cc                  |  12 +-
 src/ops/rms_norm.cc                           |  11 +-
 src/ops/sigmoid_silu_multi.cc                 |   2 +-
 src/ops/spec_inc_multihead_self_attention.cpp |   3 +-
 src/ops/spec_inc_multihead_self_attention.cu  | 140 ++--
 src/ops/split.cc                              |  11 +-
 src/ops/topk.cc                               |  11 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 144 +++--
 src/parallel_ops/allreduce.cc                 |   9 +-
 src/parallel_ops/kernels/allreduce_kernels.cu | 171 ++++-
 src/runtime/batch_config.cc                   |   4 +-
 src/runtime/graph.cc                          |  28 +-
 src/runtime/model.cc                          |  30 +-
 src/runtime/model.cpp                         |   9 +-
 src/runtime/model.cu                          |  18 +-
 src/runtime/request_manager.cc                | 117 +++-
 src/runtime/request_manager.cu                | 285 +++++----
 src/utils/communication_buffer.cu             | 135 ++++
 50 files changed, 2269 insertions(+), 798 deletions(-)
 create mode 100644 deps/tensorrt_llm/README.md
 create mode 100644 deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
 create mode 100644 deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h
 create mode 100644 include/flexflow/utils/communication_buffer.h
 create mode 100644 src/utils/communication_buffer.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5618f315a..00f257b6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -301,6 +301,12 @@ if(NOT BUILD_LEGION_ONLY)
       LIST_DIRECTORIES False
       ${FLEXFLOW_ROOT}/src/*.cu)
 
+    # tensorrt_llm custom allreduce
+    if(FF_USE_NCCL)
+      list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm)
+      list(APPEND FLEXFLOW_GPU_SRC ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu)
+    endif()
+
     add_compile_definitions(FF_USE_CUDA)
 
     if(BUILD_SHARED_LIBS)
diff --git a/deps/tensorrt_llm/README.md b/deps/tensorrt_llm/README.md
new file mode 100644
index 000000000..39fcecdd7
--- /dev/null
+++ b/deps/tensorrt_llm/README.md
@@ -0,0 +1,5 @@
+## Custom AllReduce Implementation
+
+This is an adapted version of the custom AllReduce plugin from NVIDIA's [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository.
+
+To replace the NCCL AllReduce call, we should also add a CUDA IPC support to the custom AllReduce usage. Our IPC&AllReduce implementation is referenced from [mlc-ai/relax](https://github.com/mlc-ai/relax).
diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
new file mode 100644
index 000000000..27a266fa3
--- /dev/null
+++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_fp16.h>
+
+#include "custom_allreduce_kernels.h"
+
+namespace tensorrt_llm {
+
+static inline __device__ void st_flag_release(uint32_t &flag,
+                                              uint32_t *flag_addr) {
+#if __CUDA_ARCH__ >= 700
+  asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#else
+  __threadfence_system();
+  asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ void ld_flag_acquire(uint32_t &flag,
+                                              uint32_t *flag_addr) {
+#if __CUDA_ARCH__ >= 700
+  asm volatile("ld.global.acquire.sys.b32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#else
+  asm volatile("ld.global.volatile.b32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Type Converter that packs data format to 128 bits data type
+//
+using PackedFloat = union {
+  int4 packed;
+  float unpacked[4];
+};
+
+using PackedHalf = union {
+  int4 packed;
+  half2 unpacked[4];
+};
+
+template <typename T>
+struct PackedOn16Bytes {};
+
+template <>
+struct PackedOn16Bytes<float> {
+  using Type = PackedFloat;
+};
+
+template <>
+struct PackedOn16Bytes<half> {
+  using Type = PackedHalf;
+};
+
+#ifdef ENABLE_BF16
+using PackedBFloat16 = union {
+  int4 packed;
+  __nv_bfloat162 unpacked[4];
+};
+
+template <>
+struct PackedOn16Bytes<__nv_bfloat16> {
+  using Type = PackedBFloat16;
+};
+#endif
+
+// add two 128b data
+template <typename T>
+inline __device__ int4 add128b(T &a, T &b) {
+  T c;
+  c.unpacked[0] = a.unpacked[0] + b.unpacked[0];
+  c.unpacked[1] = a.unpacked[1] + b.unpacked[1];
+  c.unpacked[2] = a.unpacked[2] + b.unpacked[2];
+  c.unpacked[3] = a.unpacked[3] + b.unpacked[3];
+  return c.packed;
+}
+
+__inline__ __device__ void multi_gpu_barrier(uint32_t **signals,
+                                             const uint32_t flag,
+                                             const size_t rank,
+                                             const size_t world_size,
+                                             int const tidx,
+                                             int const bidx) {
+  // At the end of the function, we now that has least block 0 from all others
+  // GPUs have reached that point.
+  uint32_t volatile *my_signals = signals[rank];
+  if (tidx < world_size) {
+    // The 1st block notifies the other ranks.
+    if (bidx == 0) {
+      signals[tidx][rank] = flag;
+    }
+
+    // Busy-wait until all ranks are ready.
+    while (my_signals[tidx] != flag) {
+    }
+  }
+
+  // Make sure we can move on...
+  __syncthreads();
+}
+
+__global__ void multiGpuBarrierKernel(AllReduceParams params) {
+  multi_gpu_barrier(params.peer_barrier_ptrs_out,
+                    params.barrier_flag,
+                    params.local_rank,
+                    params.ranks_per_node,
+                    threadIdx.x,
+                    blockIdx.x);
+}
+
+template <typename T, int RANKS_PER_NODE>
+static __global__ void oneShotAllReduceKernel(AllReduceParams params) {
+  int const bidx = blockIdx.x;
+  int const tidx = threadIdx.x;
+
+  // The number of elements packed into one for comms
+  static constexpr int NUM_ELTS = 16 / sizeof(T);
+
+  // Packed data type for comms
+  using PackedStruct = typename PackedOn16Bytes<T>::Type;
+
+  multi_gpu_barrier(params.peer_barrier_ptrs_in,
+                    params.barrier_flag,
+                    params.local_rank,
+                    RANKS_PER_NODE,
+                    tidx,
+                    bidx);
+
+  // The source pointers. Distributed round-robin for the different warps.
+  T const *src_d[RANKS_PER_NODE];
+#pragma unroll
+  for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+    int rank = (params.local_rank + ii) % RANKS_PER_NODE;
+    src_d[ii] = reinterpret_cast<T *>(params.peer_comm_buffer_ptrs[rank]);
+  }
+
+  // The location in the destination array (load 8 fp16 or load 4 fp32 using
+  // LDG.128).
+  size_t offset = bidx * params.elts_per_block + tidx * NUM_ELTS;
+  // The end of the segment computed by that block.
+  size_t max_offset =
+      min((bidx + 1) * params.elts_per_block, params.elts_per_rank);
+
+  // Each block accumulates the values from the different GPUs on the same node.
+  for (size_t iter_offset = offset; iter_offset < max_offset;
+       iter_offset += blockDim.x * NUM_ELTS) {
+    // Iterate over the different ranks/devices on the node to load the values.
+    PackedStruct vals[RANKS_PER_NODE];
+#pragma unroll
+    for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+      vals[ii].packed =
+          *reinterpret_cast<int4 const *>(&src_d[ii][iter_offset]);
+    }
+
+    // Sum the values from the different ranks.
+    PackedStruct sums;
+    sums.packed = {0, 0, 0, 0};
+#pragma unroll
+    for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+      sums.packed = add128b(sums, vals[ii]);
+    }
+
+    // Store to the destination buffer.
+    *reinterpret_cast<int4 *>(&reinterpret_cast<T *>(
+        params.local_output_buffer_ptr)[iter_offset]) = sums.packed;
+  }
+}
+
+template <typename T, int RANKS_PER_NODE>
+static __global__ void twoShotAllReduceKernel(AllReduceParams params) {
+  // The block index.
+  int const bidx = blockIdx.x;
+  // The thread index with the block.
+  int const tidx = threadIdx.x;
+
+  // The number of elements packed into one for comms
+  static constexpr int NUM_ELTS = 16 / sizeof(T);
+
+  // Packed data type for comms
+  using PackedType = typename PackedOn16Bytes<T>::Type;
+
+  // The location in the destination array (load 8 fp16 or load 4 fp32 using
+  // LDG.128).
+  const size_t block_offset = bidx * params.elts_per_block + tidx * NUM_ELTS;
+  const size_t block_start = params.rank_offset + block_offset;
+  // The end of the segment computed by that block.
+  size_t max_offset = min(block_start + params.elts_per_block,
+                          params.rank_offset + params.elts_per_rank);
+
+  multi_gpu_barrier(params.peer_barrier_ptrs_in,
+                    params.barrier_flag,
+                    params.local_rank,
+                    RANKS_PER_NODE,
+                    tidx,
+                    bidx);
+
+  // The source pointers. Distributed round-robin for the different warps.
+  T *src_d[RANKS_PER_NODE];
+  // The destination ranks for round-robin gathering
+  size_t dst_rank[RANKS_PER_NODE];
+#pragma unroll
+  for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+    int rank = (params.local_rank + ii) % RANKS_PER_NODE;
+    src_d[ii] = reinterpret_cast<T *>(params.peer_comm_buffer_ptrs[rank]);
+    dst_rank[ii] = rank;
+  }
+
+  // Each block accumulates the values from the different GPUs on the same node.
+  for (size_t local_offset = block_start; local_offset < max_offset;
+       local_offset += blockDim.x * NUM_ELTS) {
+    // Iterate over the different ranks/devices on the node to load the values.
+    PackedType vals[RANKS_PER_NODE];
+#pragma unroll
+    for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+      vals[ii].packed =
+          *reinterpret_cast<int4 const *>(&src_d[ii][local_offset]);
+    }
+
+    // Sum the values from the different ranks.
+    PackedType sums;
+    sums.packed = {0, 0, 0, 0};
+#pragma unroll
+    for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+      sums.packed = add128b(sums, vals[ii]);
+    }
+
+    // Store to the local buffer.
+    *reinterpret_cast<int4 *>(&src_d[0][local_offset]) = sums.packed;
+  }
+
+  // sync threads to make sure all block threads have the sums
+  __syncthreads();
+
+  // barriers among the blocks with the same idx (release-acquire semantics)
+  if (tidx < RANKS_PER_NODE) {
+    // The all blocks notifies the other ranks.
+    uint32_t flag_block_offset = RANKS_PER_NODE + bidx * RANKS_PER_NODE;
+    st_flag_release(params.barrier_flag,
+                    params.peer_barrier_ptrs_in[tidx] + flag_block_offset +
+                        params.local_rank);
+
+    // Busy-wait until all ranks are ready.
+    uint32_t rank_barrier = 0;
+    uint32_t *peer_barrier_d = params.peer_barrier_ptrs_in[params.local_rank] +
+                               flag_block_offset + tidx;
+    do {
+      ld_flag_acquire(rank_barrier, peer_barrier_d);
+    } while (rank_barrier != params.barrier_flag);
+  }
+
+  // sync threads to make sure all other ranks has the final partial results
+  __syncthreads();
+
+  size_t max_block_offset =
+      min(block_offset + params.elts_per_block, params.elts_per_rank);
+  // Gather all needed elts from other intra-node ranks
+  for (size_t local_offset = block_offset; local_offset < max_block_offset;
+       local_offset += blockDim.x * NUM_ELTS) {
+#pragma unroll
+    for (int ii = 0; ii < RANKS_PER_NODE; ++ii) {
+      // use round-robin gathering from other ranks
+      size_t offset_rank = dst_rank[ii] * params.elts_per_rank + local_offset;
+      if (offset_rank >= params.elts_total) {
+        continue;
+      }
+      *reinterpret_cast<int4 *>(
+          &reinterpret_cast<T *>(params.local_output_buffer_ptr)[offset_rank]) =
+          *reinterpret_cast<int4 *>(&src_d[ii][offset_rank]);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline int divUp(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+std::tuple<int, int> kernelLaunchConfig(AllReduceStrategyType algo,
+                                        AllReduceParams &param,
+                                        size_t elts_per_thread) {
+  assert(param.elts_total % elts_per_thread == 0);
+
+  int blocks_per_grid = 1, threads_per_block = DEFAULT_BLOCK_SIZE;
+
+  const size_t total_threads = param.elts_total / elts_per_thread;
+  switch (algo) {
+    case AllReduceStrategyType::ONESHOT: {       // one stage all reduce algo
+      if (total_threads <= DEFAULT_BLOCK_SIZE) { // local reduce
+        threads_per_block = WARP_SIZE * divUp(total_threads, WARP_SIZE);
+        blocks_per_grid = 1;
+      } else { // local reduce
+        threads_per_block = DEFAULT_BLOCK_SIZE;
+        blocks_per_grid = divUp(total_threads, DEFAULT_BLOCK_SIZE);
+        blocks_per_grid =
+            std::min(static_cast<int>(MAX_ALL_REDUCE_BLOCKS), blocks_per_grid);
+      }
+      param.elts_per_rank = param.elts_total;
+      param.elts_per_block =
+          elts_per_thread *
+          divUp(param.elts_per_rank, elts_per_thread * blocks_per_grid);
+      break;
+    }
+    case AllReduceStrategyType::TWOSHOT: { // two stage all reduce algo
+      const size_t elts_per_rank = param.elts_total / param.ranks_per_node;
+      assert(elts_per_rank % elts_per_thread == 0);
+
+      size_t total_threads = elts_per_rank / elts_per_thread;
+      total_threads = WARP_SIZE * ((total_threads + WARP_SIZE - 1) / WARP_SIZE);
+      assert(total_threads % WARP_SIZE == 0);
+
+      while (total_threads % blocks_per_grid != 0 ||
+             total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) {
+        blocks_per_grid += 1;
+      }
+
+      threads_per_block = total_threads / blocks_per_grid;
+
+      // NOTE: need to adjust here
+      if (static_cast<size_t>(blocks_per_grid) > MAX_ALL_REDUCE_BLOCKS) {
+        size_t iter_factor = 1;
+        while (blocks_per_grid / iter_factor > MAX_ALL_REDUCE_BLOCKS ||
+               blocks_per_grid % iter_factor) {
+          iter_factor += 1;
+        }
+        blocks_per_grid /= iter_factor;
+      }
+      param.elts_per_rank = param.elts_total / param.ranks_per_node;
+      param.elts_per_block = param.elts_per_rank / blocks_per_grid;
+      param.elts_per_block =
+          elts_per_thread * divUp(param.elts_per_block, elts_per_thread);
+      param.rank_offset = param.rank * param.elts_per_rank;
+      break;
+    }
+    default:
+      assert(false && "Algorithm not supported here.");
+  }
+
+  return std::make_tuple(blocks_per_grid, threads_per_block);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, int RANKS_PER_NODE>
+void dispatchARKernels(AllReduceStrategyType algo,
+                       AllReduceParams &param,
+                       int blocks_per_grid,
+                       int threads_per_block,
+                       cudaStream_t stream) {
+  if (algo == AllReduceStrategyType::ONESHOT) {
+    oneShotAllReduceKernel<T, RANKS_PER_NODE>
+        <<<blocks_per_grid, threads_per_block, 0, stream>>>(param);
+  } else {
+    twoShotAllReduceKernel<T, RANKS_PER_NODE>
+        <<<blocks_per_grid, threads_per_block, 0, stream>>>(param);
+  }
+}
+
+template <typename T>
+void invokeOneOrTwoShotAllReduceKernel(AllReduceParams &param,
+                                       AllReduceStrategyType strat,
+                                       cudaStream_t stream) {
+  assert(strat == AllReduceStrategyType::ONESHOT ||
+         strat == AllReduceStrategyType::TWOSHOT);
+  auto last_error = cudaGetLastError();
+  if (last_error != cudaSuccess) {
+    printf("cuda error: %s\n", cudaGetErrorString(last_error));
+    assert(false && "Error before launching the kernel");
+  }
+
+  size_t elts_per_thread = 16 / sizeof(T);
+  auto [blocks_per_grid, threads_per_block] =
+      kernelLaunchConfig(strat, param, elts_per_thread);
+  switch (param.ranks_per_node) {
+    case 2:
+      dispatchARKernels<T, 2>(
+          strat, param, blocks_per_grid, threads_per_block, stream);
+      break;
+    case 4:
+      dispatchARKernels<T, 4>(
+          strat, param, blocks_per_grid, threads_per_block, stream);
+      break;
+    case 6:
+      dispatchARKernels<T, 6>(
+          strat, param, blocks_per_grid, threads_per_block, stream);
+      break;
+    case 8:
+      dispatchARKernels<T, 8>(
+          strat, param, blocks_per_grid, threads_per_block, stream);
+      break;
+    default:
+      break;
+  }
+  last_error = cudaGetLastError();
+  if (last_error != cudaSuccess) {
+    printf("cuda error: %s\n", cudaGetErrorString(last_error));
+    assert(false && "Error after launching the kernel");
+  }
+}
+
+void invokeMultiGpuBarrier(AllReduceParams &param, cudaStream_t stream) {
+  multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param);
+}
+
+void customAllReduce(AllReduceParams &params,
+                     void *data,
+                     size_t elts,
+                     DataType dataType,
+                     AllReduceStrategyType strat,
+                     cudaStream_t stream) {
+  params.local_output_buffer_ptr = data;
+  params.elts_total = elts;
+
+  if (dataType == DT_FLOAT) {
+    invokeOneOrTwoShotAllReduceKernel<float>(params, strat, stream);
+  } else if (dataType == DT_HALF) {
+    invokeOneOrTwoShotAllReduceKernel<half>(params, strat, stream);
+  } else {
+    assert(false && "Unspported data type");
+  }
+}
+
+} // namespace tensorrt_llm
diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h
new file mode 100644
index 000000000..e56795047
--- /dev/null
+++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ffconst.h"
+
+#include <cuda_fp16.h>
+#include <stdint.h>
+
+namespace tensorrt_llm {
+
+constexpr size_t WARP_SIZE = 32;
+constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24;
+constexpr size_t MAX_RANKS_PER_NODE = 8;
+constexpr size_t DEFAULT_BLOCK_SIZE = 1024;
+
+enum class AllReduceStrategyType : int8_t {
+  RING = 0,
+  ONESHOT = 1,
+  TWOSHOT = 2,
+  AUTO = 3,
+};
+
+struct AllReduceParams {
+  size_t elts_total;
+  size_t elts_per_rank;
+  size_t elts_per_block;
+  size_t rank_offset;
+  size_t ranks_per_node, rank, local_rank;
+  uint32_t barrier_flag;
+  uint32_t *peer_barrier_ptrs_in[MAX_RANKS_PER_NODE];
+  uint32_t *peer_barrier_ptrs_out[MAX_RANKS_PER_NODE];
+  void *peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE];
+  void *local_output_buffer_ptr;
+};
+
+inline size_t GetMaxRequiredWorkspaceSize(int world_size) {
+  if (world_size <= 2) {
+    return 16 * 1000 * 1000;
+  }
+  return 8 * 1000 * 1000;
+}
+
+inline AllReduceStrategyType SelectImplementation(size_t message_size,
+                                                  int world_size) {
+  const size_t maxWorkspaceSize = GetMaxRequiredWorkspaceSize(world_size);
+
+  if (message_size > maxWorkspaceSize) {
+    return AllReduceStrategyType::RING;
+  }
+
+  if (world_size <= 2) {
+    return AllReduceStrategyType::ONESHOT;
+  }
+
+  if (world_size <= 4) {
+    if (message_size < 1 * 1000 * 1000) {
+      return AllReduceStrategyType::ONESHOT;
+    }
+    return AllReduceStrategyType::TWOSHOT;
+  }
+
+  if (message_size < 500 * 1000) {
+    return AllReduceStrategyType::ONESHOT;
+  }
+  return AllReduceStrategyType::TWOSHOT;
+}
+
+void customAllReduce(AllReduceParams &params,
+                     void *data,
+                     size_t elts,
+                     DataType dataType,
+                     AllReduceStrategyType strat,
+                     cudaStream_t stream);
+
+} // namespace tensorrt_llm
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index b244563c4..6b3a5c08c 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -84,7 +84,7 @@ class AttentionMetaData {
     mem_size_ = 0;
     enabled_ = false;
   }
-  AttentionMetaData(const AttentionMetaData &rhs) {
+  AttentionMetaData(AttentionMetaData const &rhs) {
     num_q_heads_ = rhs.num_q_heads_;
     num_kv_heads_ = rhs.num_kv_heads_;
     head_dim_ = rhs.head_dim_;
@@ -115,15 +115,19 @@ class AttentionMetaData {
         (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               ((BatchConfig::max_spec_tree_token_num() *
-                                (BatchConfig::max_spec_tree_token_num() +
-                                BatchConfig::max_sequence_length()) + 7) / 8);
+                                    (BatchConfig::max_spec_tree_token_num() +
+                                     BatchConfig::max_sequence_length()) +
+                                7) /
+                               8);
     workspace_block = 16 * 1024 * 1024; // 16MB
 
-    mem_size_ = sizeof(int32_t) * indices_size + sizeof(uint8_t) * custom_mask_size + workspace_block * BatchConfig::max_requests_per_batch();
+    mem_size_ = sizeof(int32_t) * indices_size +
+                sizeof(uint8_t) * custom_mask_size +
+                workspace_block * BatchConfig::max_requests_per_batch();
     return mem_size_;
   }
 
-  void assign_address(void* ptr, int size) {
+  void assign_address(void *ptr, int size) {
     if (ptr == nullptr) {
       q_indptr = nullptr;
       kv_indptr = nullptr;
@@ -134,7 +138,8 @@ class AttentionMetaData {
       workspace = nullptr;
       return;
     }
-    assert(size >= mem_size() && "Insufficient memory size for attention metadata");
+    assert(size >= mem_size() &&
+           "Insufficient memory size for attention metadata");
     size_t batch_size = BatchConfig::max_requests_per_batch();
     size_t max_num_pages =
         (BatchConfig::max_spec_tree_token_num() +
@@ -144,27 +149,47 @@ class AttentionMetaData {
         (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
                               ((BatchConfig::max_spec_tree_token_num() *
-                                (BatchConfig::max_spec_tree_token_num() +
-                                BatchConfig::max_sequence_length()) + 7) / 8);
+                                    (BatchConfig::max_spec_tree_token_num() +
+                                     BatchConfig::max_sequence_length()) +
+                                7) /
+                               8);
 
-    q_indptr = static_cast<int32_t*>(ptr);
+    q_indptr = static_cast<int32_t *>(ptr);
     kv_indptr = q_indptr + batch_size + 1;
     kv_indices = kv_indptr + batch_size + 1;
     kv_last_page_len = kv_indices + max_num_pages * batch_size;
     qk_indptr = kv_last_page_len + batch_size + 1;
-    custom_mask = static_cast<uint8_t*>(ptr) + sizeof(int32_t) * indices_size;
-    workspace = static_cast<void*>(static_cast<uint8_t*>(ptr) + sizeof(int32_t) * indices_size + sizeof(uint8_t) * custom_mask_size);
+    custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
+    workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
+                                    sizeof(int32_t) * indices_size +
+                                    sizeof(uint8_t) * custom_mask_size);
   }
 
-  void set_num_q_heads(uint32_t const num_q_heads) { num_q_heads_ = num_q_heads; }
-  void set_num_kv_heads(uint32_t const num_kv_heads) { num_kv_heads_ = num_kv_heads; }
-  void set_head_dim(uint32_t const head_dim) { head_dim_ = head_dim; }
-  uint32_t num_q_heads() const { return num_q_heads_; }
-  uint32_t num_kv_heads() const { return num_kv_heads_; }
-  uint32_t head_dim() const { return head_dim_; }
+  void set_num_q_heads(uint32_t const num_q_heads) {
+    num_q_heads_ = num_q_heads;
+  }
+  void set_num_kv_heads(uint32_t const num_kv_heads) {
+    num_kv_heads_ = num_kv_heads;
+  }
+  void set_head_dim(uint32_t const head_dim) {
+    head_dim_ = head_dim;
+  }
+  uint32_t num_q_heads() const {
+    return num_q_heads_;
+  }
+  uint32_t num_kv_heads() const {
+    return num_kv_heads_;
+  }
+  uint32_t head_dim() const {
+    return head_dim_;
+  }
 
-  void set_enabled(bool const enabled) { enabled_ = enabled; }
-  bool enabled() const { return enabled_; }
+  void set_enabled(bool const enabled) {
+    enabled_ = enabled;
+  }
+  bool enabled() const {
+    return enabled_;
+  }
 
   uint32_t num_q_heads_;
   uint32_t num_kv_heads_;
@@ -180,11 +205,11 @@ class AttentionMetaData {
   size_t workspace_block;
 
   size_t mem_size_;
-  
+
   // batchsize -> handler
   bool enabled_;
-  std::unordered_map<int, void*> decode_handler_collections;
-  std::unordered_map<int, void*> prompt_handler_collections;
+  std::unordered_map<int, void *> decode_handler_collections;
+  std::unordered_map<int, void *> prompt_handler_collections;
 };
 
 struct FFHandler {
@@ -198,8 +223,8 @@ struct FFHandler {
   void *workSpace;
   size_t workSpaceSize;
   void *batch_config_metadata;
-  AttentionMetaData* tree_search_attention_metadata;
-  AttentionMetaData* tree_verify_attention_metadata;
+  AttentionMetaData *tree_search_attention_metadata;
+  AttentionMetaData *tree_verify_attention_metadata;
 
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
@@ -212,6 +237,8 @@ struct FFHandler {
   bool allowTensorOpMathConversion;
 #ifdef FF_USE_NCCL
   ncclComm_t ncclComm;
+  int num_devices;
+  int device_id;
 #endif
 };
 
diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h
index e0481302e..b8e417ddc 100644
--- a/include/flexflow/ops/fused.h
+++ b/include/flexflow/ops/fused.h
@@ -30,13 +30,11 @@ class FusedOpMeta {
   OpMeta *meta[MAX_NUM_FUSED_OPERATORS];
   FusedOp *fused_op;
   int numOperators;
-  bool graphCaptured=false;
+  bool graphCaptured = false;
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  std::unordered_map<GraphParams, cudaGraphExec_t>
-      graph_collections;
+  std::unordered_map<GraphParams, cudaGraphExec_t> graph_collections;
 #else
-  std::unordered_map<GraphParams, hipGraphExec_t>
-      graph_collections;
+  std::unordered_map<GraphParams, hipGraphExec_t> graph_collections;
 #endif
 };
 
diff --git a/include/flexflow/ops/graph_params.h b/include/flexflow/ops/graph_params.h
index 72bdf8369..0362801c8 100644
--- a/include/flexflow/ops/graph_params.h
+++ b/include/flexflow/ops/graph_params.h
@@ -5,41 +5,47 @@
 #include <string>
 
 namespace FlexFlow {
-  struct GraphParams {
-    int num_active_requests;
-    int num_active_tokens;
-    bool prompt_phase;
-
-    GraphParams(int num_active_requests, int num_active_tokens, bool prompt_phase)
-      : num_active_requests(num_active_requests), num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {}
-
-    void Print() const {
-      printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, prompt_phase: %d\n\n", num_active_requests, num_active_tokens, prompt_phase);
-    }
-  };
-
-}
+struct GraphParams {
+  int num_active_requests;
+  int num_active_tokens;
+  bool prompt_phase;
+
+  GraphParams(int num_active_requests, int num_active_tokens, bool prompt_phase)
+      : num_active_requests(num_active_requests),
+        num_active_tokens(num_active_tokens), prompt_phase(prompt_phase) {}
+
+  void Print() const {
+    printf("GraphParams, num_active_requests: %d, num_active_tokens: %d, "
+           "prompt_phase: %d\n\n",
+           num_active_requests,
+           num_active_tokens,
+           prompt_phase);
+  }
+};
+
+} // namespace FlexFlow
 
 namespace std {
-  template <>
-  struct hash<FlexFlow::GraphParams> {
-    size_t operator()(const FlexFlow::GraphParams& gp) const {
-      return std::hash<int>()(gp.num_active_requests) ^
-             std::hash<int>()(gp.num_active_tokens) ^
-             std::hash<bool>()(gp.prompt_phase);
-    }
-  };
-}
+template <>
+struct hash<FlexFlow::GraphParams> {
+  size_t operator()(FlexFlow::GraphParams const &gp) const {
+    return std::hash<int>()(gp.num_active_requests) ^
+           std::hash<int>()(gp.num_active_tokens) ^
+           std::hash<bool>()(gp.prompt_phase);
+  }
+};
+} // namespace std
 
 namespace std {
-  template <>
-  struct equal_to<FlexFlow::GraphParams> {
-    bool operator()(const FlexFlow::GraphParams& lhs, const FlexFlow::GraphParams& rhs) const {
-      return lhs.num_active_requests == rhs.num_active_requests &&
-             lhs.num_active_tokens == rhs.num_active_tokens && 
-             lhs.prompt_phase == rhs.prompt_phase;
-    }
-  };
-}
+template <>
+struct equal_to<FlexFlow::GraphParams> {
+  bool operator()(FlexFlow::GraphParams const &lhs,
+                  FlexFlow::GraphParams const &rhs) const {
+    return lhs.num_active_requests == rhs.num_active_requests &&
+           lhs.num_active_tokens == rhs.num_active_tokens &&
+           lhs.prompt_phase == rhs.prompt_phase;
+  }
+};
+} // namespace std
 
 #endif
diff --git a/include/flexflow/ops/gumbel_topk.h b/include/flexflow/ops/gumbel_topk.h
index 454c8e4fd..b74361fb2 100644
--- a/include/flexflow/ops/gumbel_topk.h
+++ b/include/flexflow/ops/gumbel_topk.h
@@ -39,20 +39,20 @@ class GumbelTopK : public Op {
   using Params = GumbelTopKParams;
   using Input = ParallelTensor;
   GumbelTopK(FFModel &model,
-          LayerID const &layer_guid,
-          ParallelTensor const input,
-          int k,
-          bool sorted,
-          bool speculative_decoding,
-          char const *name);
+             LayerID const &layer_guid,
+             ParallelTensor const input,
+             int k,
+             bool sorted,
+             bool speculative_decoding,
+             char const *name);
   GumbelTopK(FFModel &model,
-          LayerID const &layer_guid,
-          GumbelTopK const &other,
-          ParallelTensor const input);
+             LayerID const &layer_guid,
+             GumbelTopK const &other,
+             ParallelTensor const input);
   GumbelTopK(FFModel &model,
-          Params const &params,
-          Input const input,
-          char const *name = nullptr);
+             Params const &params,
+             Input const input,
+             char const *name = nullptr);
   void init(FFModel const &) override;
   void init_inference(FFModel const &,
                       std::vector<ParallelTensor> const &,
@@ -110,13 +110,14 @@ class GumbelTopK : public Op {
                              bool sorted,
                              BatchConfig const *bc,
                              ffStream_t stream);
-  static void forward_kernel_wrapper(GumbelTopKMeta const *m,
-                                     GenericTensorAccessorR const &input,
-                                     GenericTensorAccessorW const &log_probs,
-                                     GenericTensorAccessorW const &perturbed_log_probs,
-                                     GenericTensorAccessorW const &indices,
-                                     int batch_size,
-                                     BatchConfig const *bc);
+  static void
+      forward_kernel_wrapper(GumbelTopKMeta const *m,
+                             GenericTensorAccessorR const &input,
+                             GenericTensorAccessorW const &log_probs,
+                             GenericTensorAccessorW const &perturbed_log_probs,
+                             GenericTensorAccessorW const &indices,
+                             int batch_size,
+                             BatchConfig const *bc);
   Params get_params() const;
 
 public:
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index fdf0a8729..0e97239eb 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -107,14 +107,13 @@ class SpecIncMultiHeadSelfAttention : public Op {
                              MachineView const &mv,
                              CostMetrics &cost_metrics) const override;
 
-  static void
-      inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta *m,
-                               BatchConfig const *bc,
-                               int shard_id,
-                               GenericTensorAccessorR const &input,
-                               GenericTensorAccessorR const &weight,
-                               GenericTensorAccessorW const &output,
-                               GenericTensorAccessorR const &bias);
+  static void inference_kernel_wrapper(SpecIncMultiHeadSelfAttentionMeta *m,
+                                       BatchConfig const *bc,
+                                       int shard_id,
+                                       GenericTensorAccessorR const &input,
+                                       GenericTensorAccessorR const &weight,
+                                       GenericTensorAccessorW const &output,
+                                       GenericTensorAccessorR const &bias);
   Params get_params() const;
 
 public:
diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
index bdf7aae50..676429f8b 100644
--- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
+++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
@@ -6,18 +6,32 @@
 #include "flexflow/fftype.h"
 #include "flexflow/op_meta.h"
 #include "flexflow/parallel_ops/allreduce.h"
+#include "flexflow/utils/communication_buffer.h"
+#include "flexflow/utils/memory_allocator.h"
+#include <unordered_map>
 
 namespace FlexFlow {
 
 class AllReduceMeta : public OpMeta {
 public:
-  AllReduceMeta(FFHandler handle, AllReduce const *reduct);
+  AllReduceMeta(FFHandler handle,
+                AllReduce const *reduct,
+                MemoryAllocator &gpu_mem_allocator);
+  ~AllReduceMeta(void);
+
+public:
+  std::unordered_map<void *, CommunicationBuffer *> comm_bufs;
+  Realm::RegionInstance reserveInst;
+  void *allgather_src, *allgather_dst;
+  // reuse for communication buffer
+  void *barrier_in_ptr, *barrier_out_ptr;
+  int barrier_ptr_size, barrier_flag;
 };
 
 namespace Kernels {
 namespace AllReduce {
 
-void inference_kernel_wrapper(AllReduceMeta const *m,
+void inference_kernel_wrapper(AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output);
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 81e7c0f9b..5c6f6b6e0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -380,13 +380,16 @@ class RequestManager {
     long long llm_step_start = 0, ssm_step_start = 0;
     // Times for each LLM verification phase (in ms)
     std::vector<double> llm_step_times;
+    // Number of requests in batch at each step
+    std::vector<int> requests_per_step;
     // Times for each SSM speculation phase (in ms)
     std::vector<double> ssm_step_times;
     // Number of requests getting decoded at each step
     std::vector<int> ssm_steps;
-    std::vector<int> requests_per_step;
     // Number of generated tokens at each step
     std::vector<int> generated_tokens_per_step;
+    // To calculate the E2E time of serving
+    long long server_start_time = 0;
   };
 
   ProfileInfo profiling;
@@ -445,6 +448,9 @@ class RequestManager {
                       int k);
   void gumbel_conditioned_on_max(float target_max,
                                  std::vector<std::pair<float, int>> &logits);
+
+  // Profiling related functions
+  void reset_profiling_statistics();
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h
new file mode 100644
index 000000000..3c14284d6
--- /dev/null
+++ b/include/flexflow/utils/communication_buffer.h
@@ -0,0 +1,74 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _COMMUNICATION_BUFFER_H
+#define _COMMUNICATION_BUFFER_H
+
+#include <vector>
+#ifdef FF_USE_NCCL
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include <nccl.h>
+#else
+#include <rccl/rccl.h>
+#endif
+#endif
+
+// adapted from https://github.com/mlc-ai/relax
+
+// The CUDA interdevice communication memory object,
+// which internally contains data pointers to other device's peer memory.
+// It is be useful for efficient all-reduce implementation.
+// Right now the class members are closely tied with customized
+// all-reduce kernel. They may also be extended for other uses in
+// the future.
+class CommunicationBuffer {
+public:
+  // The device information for CUDA CommunicationBuffer.
+  int num_devices;
+  int device_id;
+  void *local_ptr;
+
+  // The data pointers of all all-reduce inputs.
+  // It has "num_devices" pointers. The i-th pointer is the data pointer on
+  // worker i. If "i != device_id", the pointer is an peer data pointer of other
+  // device. Otherwise, the pointer is a local CUDA data pointer.
+  std::vector<void *> comm_ptrs;
+
+  // The barrier helper datas per CommunicationBuffer, which can be used
+  // by custom collective operations and allow fine-grained synchronization on
+  // each buffer. They have "num_devices" pointers, and the pointer arrangement
+  // is the same as "comm_ptrs".
+  std::vector<void *> barrier_in;
+  std::vector<void *> barrier_out;
+
+  // The integer buffer flag for all-reduce.
+  // It will self increment by 1 after each all-reduce operation.
+  int *barrier_flag;
+};
+
+CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+                                                    int device_id,
+                                                    ncclComm_t ncclComm,
+                                                    void *allgather_src,
+                                                    void *allgather_dst,
+                                                    void *local_ptr,
+                                                    void *barrier_in_ptr,
+                                                    void *barrier_out_ptr,
+                                                    int *barrier_flag,
+                                                    cudaStream_t stream);
+
+void release_comm_buf(CommunicationBuffer *comm_buf);
+
+#endif // _COMMUNICATION_BUFFER_H
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 455cb131c..d6a34fa8f 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -35,9 +35,7 @@ class FFCObjectWrapper {
     t_.impl = const_cast<void *>(static_cast<void const *>(t));                \
     return t_;                                                                 \
   }                                                                            \
-  static T unwrap(T_ t_) {                                                     \
-    return static_cast<T>(t_.impl);                                            \
-  }                                                                            \
+  static T unwrap(T_ t_) { return static_cast<T>(t_.impl); }                   \
   static const T unwrap_const(const T_ t_) {                                   \
     return static_cast<const T>(t_.impl);                                      \
   }
diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc
index badb41f40..b2039e30c 100644
--- a/src/ops/add_bias_residual_layer_norm.cc
+++ b/src/ops/add_bias_residual_layer_norm.cc
@@ -526,7 +526,7 @@ void AddBiasResidualLayerNorm::backward(FFModel const &ff) {
 
 FutureMap AddBiasResidualLayerNorm::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc
index 8047b0aee..3baf469d1 100644
--- a/src/ops/aggregate.cc
+++ b/src/ops/aggregate.cc
@@ -296,11 +296,12 @@ void Aggregate::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Aggregate::inference(FFModel const &ff,
-                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                               std::vector<ParallelTensor> const &batch_inputs,
-                               std::vector<ParallelTensor> const &batch_outputs,
-                               MachineView const *mv) {
+FutureMap Aggregate::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc
index 75c084721..70f5508e0 100644
--- a/src/ops/aggregate_spec.cc
+++ b/src/ops/aggregate_spec.cc
@@ -264,12 +264,12 @@ void AggregateSpec::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap
-    AggregateSpec::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap AggregateSpec::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/attention.cc b/src/ops/attention.cc
index a2672c297..37f971668 100644
--- a/src/ops/attention.cc
+++ b/src/ops/attention.cc
@@ -577,7 +577,7 @@ void MultiHeadAttention::forward(FFModel const &ff) {
 
 FutureMap MultiHeadAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/cast.cc b/src/ops/cast.cc
index 8cff9f741..701f407de 100644
--- a/src/ops/cast.cc
+++ b/src/ops/cast.cc
@@ -226,11 +226,12 @@ void Cast::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Cast::inference(FFModel const &ff,
-                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                          std::vector<ParallelTensor> const &batch_inputs,
-                          std::vector<ParallelTensor> const &batch_outputs,
-                          MachineView const *mv) {
+FutureMap Cast::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc
index 485162dd0..921800da1 100644
--- a/src/ops/element_binary.cc
+++ b/src/ops/element_binary.cc
@@ -540,12 +540,12 @@ void ElementBinary::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap
-    ElementBinary::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap ElementBinary::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc
index a166978e4..b657cf657 100644
--- a/src/ops/element_unary.cc
+++ b/src/ops/element_unary.cc
@@ -420,12 +420,12 @@ void ElementUnary::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap
-    ElementUnary::inference(FFModel const &ff,
-                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                            std::vector<ParallelTensor> const &batch_inputs,
-                            std::vector<ParallelTensor> const &batch_outputs,
-                            MachineView const *mv) {
+FutureMap ElementUnary::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index 674aaf63d..644d79efe 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -455,11 +455,12 @@ void Embedding::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Embedding::inference(FFModel const &ff,
-                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                               std::vector<ParallelTensor> const &batch_inputs,
-                               std::vector<ParallelTensor> const &batch_outputs,
-                               MachineView const *mv) {
+FutureMap Embedding::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/experts.cc b/src/ops/experts.cc
index a5b00e6cb..bbcbcda91 100644
--- a/src/ops/experts.cc
+++ b/src/ops/experts.cc
@@ -668,11 +668,12 @@ void Experts::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Experts::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap Experts::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index a6d2bb2f9..2ba98bc09 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -513,11 +513,12 @@ void FusedOp::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap FusedOp::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap FusedOp::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   // Set iter_config
   iter_config = ff.iter_config;
   ArgumentMap argmap;
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index cc844096d..bb0fdff8c 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -15,6 +15,7 @@
 
 #include "cuda.h"
 #include "flexflow/accessor.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/model.h"
 #include "flexflow/ops/add_bias_residual_layer_norm.h"
 #include "flexflow/ops/batch_norm.h"
@@ -44,7 +45,6 @@
 #include "flexflow/ops/tree_inc_multihead_self_attention.h"
 #include "flexflow/parallel_ops/kernels/allreduce_kernels.h"
 #include "flexflow/utils/cuda_helper.h"
-#include "flexflow/ffconst_utils.h"
 
 namespace FlexFlow {
 
@@ -604,25 +604,25 @@ __host__ void
   // create new cuda graph
   cudaGraphExec_t instance;
 
-  GraphParams graph_params = {bc->num_active_requests(),
-                      bc->num_active_tokens(),
-                      bc->prompt_phase};
-  int shard_id = task->index_point.point_data[0];
+  GraphParams graph_params = {
+      bc->num_active_requests(), bc->num_active_tokens(), bc->prompt_phase};
+  // int shard_id = task->index_point.point_data[0];
 
-  // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode() == TREE_VERIFY_MODE);
-  // bool use_cuda_graph = bc->get_mode() == TREE_VERIFY_MODE;
-  bool use_cuda_graph = bc->get_mode() == TREE_SEARCH_MODE;
+  // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode()
+  // == TREE_VERIFY_MODE);
+  bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE);
+  // bool use_cuda_graph = (bc->get_mode() == TREE_VERIFY_MODE);
   // bool use_cuda_graph = false;
   bool captured = false;
 
-  if(use_cuda_graph && metas->graph_collections.count(graph_params)  != 0) {
+  if (use_cuda_graph && metas->graph_collections.count(graph_params) != 0) {
     captured = true;
     instance = metas->graph_collections[graph_params];
   }
 
   if (!captured) {
     cudaGraph_t graph;
-    {    
+    {
       if (use_cuda_graph) {
         cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
       }
@@ -653,7 +653,8 @@ __host__ void
           // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]];
           // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]];
           assert(fused->op_weight_idx[i + woff] < fused->numWeights);
-          my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]];
+          my_weight_accessor[i] =
+              weight_accessor[fused->op_weight_idx[i + woff]];
         }
         for (int i = 0; i < fused->op_num_outputs[op]; i++) {
           int my_off = fused->op_output_idx[i + ooff];
@@ -699,13 +700,15 @@ __host__ void
             int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1;
             int batch_size = my_input_accessor[0].domain.get_volume() / in_dim;
             assert(my_output_accessor[0].domain.get_volume() ==
-                  out_dim * batch_size);
-            assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size);
+                   out_dim * batch_size);
+            assert(my_input_accessor[0].domain.get_volume() ==
+                   in_dim * batch_size);
             void const *bias_ptr = nullptr;
             LinearMeta *m = (LinearMeta *)metas->meta[op];
             if (fused->op_num_weights[op] == 2) {
               assert(my_weight_accessor[1].domain.get_volume() == out_dim);
-              if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) {
+              if (!m->add_bias_only_once ||
+                  task->index_point.point_data[0] == 0) {
                 bias_ptr = my_weight_accessor[1].ptr;
               }
             } else {
@@ -774,10 +777,11 @@ __host__ void
             assert(my_input_accessor[0].domain == my_input_accessor[1].domain);
             assert(my_input_accessor[0].domain == my_output_accessor[0].domain);
             ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op];
-            Kernels::ElementBinary::forward_kernel_wrapper(m,
-                                                          my_input_accessor[0],
-                                                          my_input_accessor[1],
-                                                          my_output_accessor[0]);
+            Kernels::ElementBinary::forward_kernel_wrapper(
+                m,
+                my_input_accessor[0],
+                my_input_accessor[1],
+                my_output_accessor[0]);
             break;
           }
           case OP_EMBEDDING: {
@@ -788,30 +792,32 @@ __host__ void
             if (m->aggr == AGGR_MODE_NONE) {
               // assert(kernel_domain.get_dim() == 2);
               assert(my_input_accessor[0].domain.get_dim() + 1 ==
-                    my_output_accessor[0].domain.get_dim());
-              for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) {
+                     my_output_accessor[0].domain.get_dim());
+              for (size_t i = 0; i < my_input_accessor[0].domain.get_dim();
+                   i++) {
                 assert(my_input_accessor[0].domain.hi()[i] ==
-                      my_output_accessor[0].domain.hi()[i + 1]);
+                       my_output_accessor[0].domain.hi()[i + 1]);
                 assert(my_input_accessor[0].domain.lo()[i] ==
-                      my_output_accessor[0].domain.lo()[i + 1]);
+                       my_output_accessor[0].domain.lo()[i + 1]);
               }
               assert(my_weight_accessor[0].domain.hi()[0] -
-                        my_weight_accessor[0].domain.lo()[0] ==
-                    my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0]);
+                         my_weight_accessor[0].domain.lo()[0] ==
+                     my_output_accessor[0].domain.hi()[0] -
+                         my_output_accessor[0].domain.lo()[0]);
             } else {
               assert(my_input_accessor[0].domain.get_dim() ==
-                    my_output_accessor[0].domain.get_dim());
-              for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) {
+                     my_output_accessor[0].domain.get_dim());
+              for (size_t i = 1; i < my_input_accessor[0].domain.get_dim();
+                   i++) {
                 assert(my_input_accessor[0].domain.hi()[i] ==
-                      my_output_accessor[0].domain.hi()[i]);
+                       my_output_accessor[0].domain.hi()[i]);
                 assert(my_input_accessor[0].domain.lo()[i] ==
-                      my_output_accessor[0].domain.lo()[i]);
+                       my_output_accessor[0].domain.lo()[i]);
               }
               assert(my_weight_accessor[0].domain.hi()[0] -
-                        my_weight_accessor[0].domain.lo()[0] ==
-                    my_output_accessor[0].domain.hi()[0] -
-                        my_output_accessor[0].domain.lo()[0]);
+                         my_weight_accessor[0].domain.lo()[0] ==
+                     my_output_accessor[0].domain.hi()[0] -
+                         my_output_accessor[0].domain.lo()[0]);
             }
             int in_dim, out_dim, effective_batch_size;
             if (m->aggr == AGGR_MODE_NONE) {
@@ -821,28 +827,28 @@ __host__ void
               effective_batch_size =
                   my_output_accessor[0].domain.get_volume() / out_dim;
               assert(effective_batch_size * in_dim ==
-                    my_input_accessor[0].domain.get_volume());
+                     my_input_accessor[0].domain.get_volume());
             } else {
               assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM);
               in_dim = my_input_accessor[0].domain.hi()[0] -
-                      my_input_accessor[0].domain.lo()[0] + 1;
+                       my_input_accessor[0].domain.lo()[0] + 1;
               out_dim = my_output_accessor[0].domain.hi()[0] -
                         my_output_accessor[0].domain.lo()[0] + 1;
               effective_batch_size =
                   my_output_accessor[0].domain.get_volume() / out_dim;
               assert(effective_batch_size * in_dim ==
-                    my_input_accessor[0].domain.get_volume());
+                     my_input_accessor[0].domain.get_volume());
             }
 
             assert(my_input_accessor[0].data_type == DT_INT32 ||
-                  my_input_accessor[0].data_type == DT_INT64);
+                   my_input_accessor[0].data_type == DT_INT64);
             Kernels::Embedding::forward_kernel_wrapper(m,
-                                                      my_input_accessor[0],
-                                                      my_output_accessor[0],
-                                                      my_weight_accessor[0],
-                                                      in_dim,
-                                                      out_dim,
-                                                      effective_batch_size);
+                                                       my_input_accessor[0],
+                                                       my_output_accessor[0],
+                                                       my_weight_accessor[0],
+                                                       in_dim,
+                                                       out_dim,
+                                                       effective_batch_size);
             break;
           }
           case OP_GELU:
@@ -879,22 +885,24 @@ __host__ void
             assert(fused->op_num_outputs[op] == 1);
             RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op];
             Kernels::RMSNorm::forward_kernel_wrapper(m,
-                                                    my_input_accessor[0],
-                                                    my_weight_accessor[0],
-                                                    my_output_accessor[0]);
+                                                     my_input_accessor[0],
+                                                     my_weight_accessor[0],
+                                                     my_output_accessor[0]);
             break;
           }
           case OP_RESIDUAL_RMS_NORM: {
             assert(fused->op_num_inputs[op] == 2);
             assert(fused->op_num_weights[op] == 1);
             assert(fused->op_num_outputs[op] == 2);
-            ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op];
-            Kernels::ResidualRMSNorm::forward_kernel_wrapper(m,
-                                                            my_input_accessor[0],
-                                                            my_input_accessor[1],
-                                                            my_weight_accessor[0],
-                                                            my_output_accessor[0],
-                                                            my_output_accessor[1]);
+            ResidualRMSNormMeta const *m =
+                (ResidualRMSNormMeta *)metas->meta[op];
+            Kernels::ResidualRMSNorm::forward_kernel_wrapper(
+                m,
+                my_input_accessor[0],
+                my_input_accessor[1],
+                my_weight_accessor[0],
+                my_output_accessor[0],
+                my_output_accessor[1]);
             break;
           }
           case OP_INC_MULTIHEAD_SELF_ATTENTION: {
@@ -903,7 +911,7 @@ __host__ void
             IncMultiHeadSelfAttentionMeta const *m =
                 (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
             assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+                   (1 + (int)(*m->qkv_bias || *m->final_bias)));
             GenericTensorAccessorR biases;
             if (*m->qkv_bias || *m->final_bias) {
               assert(fused->op_num_weights[op] == 2);
@@ -927,7 +935,7 @@ __host__ void
             BatchConfig const *verify_bc =
                 BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+                   (1 + (int)(*m->qkv_bias || *m->final_bias)));
             GenericTensorAccessorR biases;
             if (*m->qkv_bias || *m->final_bias) {
               assert(fused->op_num_weights[op] == 2);
@@ -951,7 +959,7 @@ __host__ void
             BatchConfig const *search_bc =
                 BatchConfig::from_future(task->futures[0]);
             assert(fused->op_num_weights[op] ==
-                  (1 + (int)(*m->qkv_bias || *m->final_bias)));
+                   (1 + (int)(*m->qkv_bias || *m->final_bias)));
             GenericTensorAccessorR biases;
             if (*m->qkv_bias || *m->final_bias) {
               assert(fused->op_num_weights[op] == 2);
@@ -1035,7 +1043,8 @@ __host__ void
               if (!m->use_bias) {
                 assert(fused->op_num_weights[op] == 2); // attn bias + weight
               } else {
-                assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias
+                assert(fused->op_num_weights[op] ==
+                       3); // attn bias + weight + bias
               }
             }
             GenericTensorAccessorR gamma, beta;
@@ -1066,11 +1075,12 @@ __host__ void
           case OP_SIGMOID_SILU_MULTI: {
             assert(fused->op_num_inputs[op] == 2);
             assert(fused->op_num_outputs[op] == 1);
-            SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op];
+            SigmoidSiluMultiMeta const *m =
+                (SigmoidSiluMultiMeta *)metas->meta[op];
             SigmoidSiluMulti::inference_kernel_wrapper(m,
-                                                      my_input_accessor[0],
-                                                      my_input_accessor[1],
-                                                      my_output_accessor[0]);
+                                                       my_input_accessor[0],
+                                                       my_input_accessor[1],
+                                                       my_output_accessor[0]);
             break;
           }
           case OP_SOFTMAX: {
@@ -1078,7 +1088,7 @@ __host__ void
             assert(fused->op_num_weights[op] == 0);
             assert(fused->op_num_outputs[op] == 1);
             assert(my_input_accessor[0].domain.get_volume() ==
-                  my_output_accessor[0].domain.get_volume());
+                   my_output_accessor[0].domain.get_volume());
             SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op];
             if (m->input_type == DT_HALF) {
               Kernels::Softmax::forward_kernel_wrapper(
@@ -1096,7 +1106,7 @@ __host__ void
           case OP_ALLREDUCE: {
             assert(fused->op_num_inputs[op] == 1);
             assert(fused->op_num_outputs[op] == 1);
-            AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
+            AllReduceMeta *m = (AllReduceMeta *)metas->meta[op];
             Kernels::AllReduce::inference_kernel_wrapper(
                 m, bc, my_input_accessor[0], my_output_accessor[0]);
             break;
@@ -1150,7 +1160,7 @@ __host__ void
         cudaStreamEndCapture(stream, &graph);
       }
     }
-    if (use_cuda_graph) { 
+    if (use_cuda_graph) {
       cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
       metas->graph_collections[graph_params] = instance;
       cudaGraphDestroy(graph);
@@ -1159,24 +1169,8 @@ __host__ void
 
   if (use_cuda_graph) {
     assert(metas->graph_collections.find(graph_params) !=
-          metas->graph_collections.end());
-    cudaEvent_t t_start, t_end;
-    float elapsed;
-    cudaEventCreate(&t_start);
-    cudaEventCreate(&t_end);
-    cudaEventRecord(t_start, stream);
-
+           metas->graph_collections.end());
     cudaGraphLaunch(instance, stream);
-
-    cudaEventRecord(t_end, stream);
-    checkCUDA(cudaEventSynchronize(t_end));
-    elapsed = 0;
-    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-    cudaEventDestroy(t_start);
-    cudaEventDestroy(t_end);
-    if(shard_id == 0 && bc->get_mode() == TREE_SEARCH_MODE) {
-      printf("cudaGraphLaunch time: %f ms, captured: %d\n", elapsed, captured);
-    }
   }
 }
 
diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc
index 715ad14f0..bb254a714 100644
--- a/src/ops/group_by.cc
+++ b/src/ops/group_by.cc
@@ -321,11 +321,12 @@ void Group_by::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Group_by::inference(FFModel const &ff,
-                              /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                              std::vector<ParallelTensor> const &batch_inputs,
-                              std::vector<ParallelTensor> const &batch_outputs,
-                              MachineView const *mv) {
+FutureMap Group_by::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 7ccd4ade0..0878eb6fe 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -222,29 +222,29 @@ __global__ void
 
 // Unified log function for float
 __device__ inline float unified_log(float x) {
-    return logf(x);
+  return logf(x);
 }
 
 // Unified log function for half
 __device__ inline __half unified_log(__half x) {
-    return hlog(x);
+  return hlog(x);
 }
 
-// heapGumbelTopK walks over [input, input+length) with `step_size` stride starting
-// at `start_index`. It builds a top-`k` heap that is stored in `heap_entries`
-// using `Accessor` to access elements in `heap_entries`. If sorted=true, the
-// elements will be sorted at the end.
-// NOTE that it applies Gumbel trick on `input`, which is,
-// input -> log(input) - log(-log(U)), where U is a uniform random number in (0, 1).
+// heapGumbelTopK walks over [input, input+length) with `step_size` stride
+// starting at `start_index`. It builds a top-`k` heap that is stored in
+// `heap_entries` using `Accessor` to access elements in `heap_entries`. If
+// sorted=true, the elements will be sorted at the end. NOTE that it applies
+// Gumbel trick on `input`, which is, input -> log(input) - log(-log(U)), where
+// U is a uniform random number in (0, 1).
 template <typename T, template <typename> class Data = LinearData>
 __device__ void heapGumbelTopK(curandState state,
-                            T const *__restrict__ input,
-                            int length,
-                            int k,
-                            GumbelEntry<T> *__restrict__ heap_entries,
-                            bool sorted = false,
-                            int start_index = 0,
-                            int step_size = 1) {
+                               T const *__restrict__ input,
+                               int length,
+                               int k,
+                               GumbelEntry<T> *__restrict__ heap_entries,
+                               bool sorted = false,
+                               int start_index = 0,
+                               int step_size = 1) {
   assert(k <= length);
 
   auto heap =
@@ -259,7 +259,8 @@ __device__ void heapGumbelTopK(curandState state,
   for (int index = start_index, slot = 0; index < heap_end_index;
        index += step_size, slot++) {
     T value = unified_log(input[index]);
-    T perturbed_value = value - unified_log(-unified_log((T)curand_uniform(&state)));
+    T perturbed_value =
+        value - unified_log(-unified_log((T)curand_uniform(&state)));
     heap.assign(slot, {index, value, perturbed_value});
   }
 
@@ -272,7 +273,8 @@ __device__ void heapGumbelTopK(curandState state,
     // We prefer elements with lower indices. This is given here.
     // Later elements automatically have higher indices, so can be discarded.
     T value = unified_log(input[index]);
-    T perturbed_value = value - unified_log(-unified_log((T)curand_uniform(&state)));
+    T perturbed_value =
+        value - unified_log(-unified_log((T)curand_uniform(&state)));
     if (perturbed_value > heap.root().perturbed_value) {
       // This element should replace the min.
       heap.replace_root({index, value, perturbed_value}, k);
@@ -288,9 +290,9 @@ __device__ void heapGumbelTopK(curandState state,
 // mergeShards performs a top-k merge on `num_shards` many sorted streams that
 // are sorted and stored in `entries` in a strided way:
 // |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
-// The overall top k elements are written to `top_k_values` and `top_k_perturbed_values`,
-// and their indices to `top_k_indices`.
-// `top_k_heap` is used as temporary storage for the merge heap.
+// The overall top k elements are written to `top_k_values` and
+// `top_k_perturbed_values`, and their indices to `top_k_indices`. `top_k_heap`
+// is used as temporary storage for the merge heap.
 template <typename T>
 __device__ void mergeShards(int num_shards,
                             int k,
@@ -314,7 +316,8 @@ __device__ void mergeShards(int num_shards,
                                 T>{IndirectLinearData<T>{top_k_heap, entries}};
     // Initialize the heap as a min-heap.
     for (int slot = 0; slot < heap_size; slot++) {
-      min_heap.assign(slot, {slot, entries[slot].value, entries[slot].perturbed_value});
+      min_heap.assign(
+          slot, {slot, entries[slot].value, entries[slot].perturbed_value});
     }
     min_heap.build(heap_size);
 
@@ -330,7 +333,8 @@ __device__ void mergeShards(int num_shards,
         continue;
       }
       // This element should replace the min.
-      min_heap.replace_root({shard, entry.value, entry.perturbed_value}, heap_size);
+      min_heap.replace_root({shard, entry.value, entry.perturbed_value},
+                            heap_size);
     }
   }
 
@@ -354,12 +358,15 @@ __device__ void mergeShards(int num_shards,
       if (speculative_decoding) {
         assert(top_k_values != nullptr);
         top_k_values[rank] = static_cast<float>(max_element.value);
-        top_k_perturbed_values[rank] = static_cast<float>(max_element.perturbed_value);
+        top_k_perturbed_values[rank] =
+            static_cast<float>(max_element.perturbed_value);
       }
       int next_shard_index = shard_index + num_shards;
       // For rank < k-1, each top k heap still contains at least 1 element,
       // so we can draw a replacement.
-      max_heap.replace_root({next_shard_index, entries[next_shard_index].value, entries[next_shard_index].perturbed_value},
+      max_heap.replace_root({next_shard_index,
+                             entries[next_shard_index].value,
+                             entries[next_shard_index].perturbed_value},
                             heap_size);
     }
 
@@ -370,22 +377,24 @@ __device__ void mergeShards(int num_shards,
     if (speculative_decoding) {
       assert(top_k_values != nullptr);
       top_k_values[last_k] = static_cast<float>(max_element.value);
-      top_k_perturbed_values[last_k] = static_cast<float>(max_element.perturbed_value);
+      top_k_perturbed_values[last_k] =
+          static_cast<float>(max_element.perturbed_value);
     }
   }
 }
 
 template <typename T>
-__global__ void gumbel_topk_forward_kernel(curandState *state,
-                                        T const *__restrict__ input,
-                                        size_t shared_memory_size,
-                                        int length,
-                                        int k,
-                                        bool sorted,
-                                        float *__restrict__ log_probs_ptr,
-                                        float *__restrict__ perturbed_log_probs_ptr,
-                                        int *__restrict__ indices,
-                                        bool speculative_decoding) {
+__global__ void
+    gumbel_topk_forward_kernel(curandState *state,
+                               T const *__restrict__ input,
+                               size_t shared_memory_size,
+                               int length,
+                               int k,
+                               bool sorted,
+                               float *__restrict__ log_probs_ptr,
+                               float *__restrict__ perturbed_log_probs_ptr,
+                               int *__restrict__ indices,
+                               bool speculative_decoding) {
   __shared__ char shared_memory[48 << 10]; // block-wise shared memory
   int const batch_index = blockIdx.x;
   T const *batch_input = input + batch_index * length;
@@ -393,7 +402,14 @@ __global__ void gumbel_topk_forward_kernel(curandState *state,
   int const thread_count = blockDim.x;
   GumbelEntry<T> *shared_entries = (GumbelEntry<T> *)shared_memory;
   heapGumbelTopK<T, StridedData>(
-      state[thread_index + batch_index * thread_count], batch_input, length, k, shared_entries, true, thread_index, thread_count);
+      state[thread_index + batch_index * thread_count],
+      batch_input,
+      length,
+      k,
+      shared_entries,
+      true,
+      thread_index,
+      thread_count);
   __syncthreads();
   if (thread_index == 0) {
     int const offset = batch_index * k;
@@ -452,9 +468,9 @@ void GumbelTopK::forward_kernel(
 
     int state_length = batch_size * num_shards;
     init_random_state_kernel<<<GET_BLOCKS(state_length),
-                        min((int)CUDA_NUM_THREADS, state_length),
-                        0,
-                      stream>>>(m->state, state_length, rand());
+                               min((int)CUDA_NUM_THREADS, state_length),
+                               0,
+                               stream>>>(m->state, state_length, rand());
 
     gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         m->state,
@@ -470,12 +486,12 @@ void GumbelTopK::forward_kernel(
   } else {
     assert(num_shards >= (size_t)k);
     num_shards = k;
-    
+
     int state_length = batch_size * num_shards;
     init_random_state_kernel<<<GET_BLOCKS(state_length),
-                        min((int)CUDA_NUM_THREADS, state_length),
-                        0,
-                      stream>>>(m->state, state_length, rand());
+                               min((int)CUDA_NUM_THREADS, state_length),
+                               0,
+                               stream>>>(m->state, state_length, rand());
 
     gumbel_topk_forward_kernel<<<num_blocks, num_shards, 0, stream>>>(
         m->state,
@@ -492,14 +508,15 @@ void GumbelTopK::forward_kernel(
 }
 
 /*static*/
-void GumbelTopK::forward_kernel_wrapper(GumbelTopKMeta const *m,
-                                     GenericTensorAccessorR const &input,
-                                     // float *output_ptr,
-                                     GenericTensorAccessorW const &log_probs,
-                                     GenericTensorAccessorW const &perturbed_log_probs,
-                                     GenericTensorAccessorW const &indices,
-                                     int batch_size,
-                                     BatchConfig const *bc) {
+void GumbelTopK::forward_kernel_wrapper(
+    GumbelTopKMeta const *m,
+    GenericTensorAccessorR const &input,
+    // float *output_ptr,
+    GenericTensorAccessorW const &log_probs,
+    GenericTensorAccessorW const &perturbed_log_probs,
+    GenericTensorAccessorW const &indices,
+    int batch_size,
+    BatchConfig const *bc) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
@@ -542,33 +559,31 @@ void GumbelTopK::forward_kernel_wrapper(GumbelTopKMeta const *m,
   }
 
   if (input.data_type == DT_HALF) {
-    GumbelTopK::forward_kernel(m,
-                            input.get_half_ptr(),
-                            m->speculative_decoding ? log_probs.get_float_ptr()
-                                                    : nullptr,
-                            m->speculative_decoding ? perturbed_log_probs.get_float_ptr()
-                                                    : nullptr,
-                            indices.get_int32_ptr(),
-                            batch_size,
-                            length,
-                            k,
-                            m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
-                            stream);
+    GumbelTopK::forward_kernel(
+        m,
+        input.get_half_ptr(),
+        m->speculative_decoding ? log_probs.get_float_ptr() : nullptr,
+        m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr,
+        indices.get_int32_ptr(),
+        batch_size,
+        length,
+        k,
+        m->sorted,
+        m->speculative_decoding ? bc : nullptr,
+        stream);
   } else if (input.data_type == DT_FLOAT) {
-    GumbelTopK::forward_kernel(m,
-                            input.get_float_ptr(),
-                            m->speculative_decoding ? log_probs.get_float_ptr()
-                                                    : nullptr,
-                            m->speculative_decoding ? perturbed_log_probs.get_float_ptr()
-                                                    : nullptr,
-                            indices.get_int32_ptr(),
-                            batch_size,
-                            length,
-                            k,
-                            m->sorted,
-                            m->speculative_decoding ? bc : nullptr,
-                            stream);
+    GumbelTopK::forward_kernel(
+        m,
+        input.get_float_ptr(),
+        m->speculative_decoding ? log_probs.get_float_ptr() : nullptr,
+        m->speculative_decoding ? perturbed_log_probs.get_float_ptr() : nullptr,
+        indices.get_int32_ptr(),
+        batch_size,
+        length,
+        k,
+        m->sorted,
+        m->speculative_decoding ? bc : nullptr,
+        stream);
   } else {
     assert(false && "Unsupported data type");
   }
@@ -588,8 +603,11 @@ GumbelTopKMeta::GumbelTopKMeta(FFHandler handler,
                                Op const *op,
                                MemoryAllocator &gpu_mem_allocator)
     : OpMeta(handler, op) {
-  state_max_length = BatchConfig::MAX_NUM_TOKENS * max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS);
-  gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(curandState) * state_max_length);
+  state_max_length =
+      BatchConfig::MAX_NUM_TOKENS *
+      max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, sizeof(curandState) * state_max_length);
   state = gpu_mem_allocator.allocate_instance<curandState>(state_max_length);
 }
 
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 9220e9c38..d7dd96eb0 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -741,7 +741,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap IncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc
index 158af9322..745e6806c 100644
--- a/src/ops/layer_norm.cc
+++ b/src/ops/layer_norm.cc
@@ -438,11 +438,12 @@ void LayerNorm::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap LayerNorm::inference(FFModel const &ff,
-                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                               std::vector<ParallelTensor> const &batch_inputs,
-                               std::vector<ParallelTensor> const &batch_outputs,
-                               MachineView const *mv) {
+FutureMap LayerNorm::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/linear.cc b/src/ops/linear.cc
index 981df5dca..6f01cf431 100644
--- a/src/ops/linear.cc
+++ b/src/ops/linear.cc
@@ -555,11 +555,12 @@ void Linear::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Linear::inference(FFModel const &ff,
-                            /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                            std::vector<ParallelTensor> const &batch_inputs,
-                            std::vector<ParallelTensor> const &batch_outputs,
-                            MachineView const *mv) {
+FutureMap Linear::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/multihead_self_attention_impl.cu b/src/ops/multihead_self_attention_impl.cu
index 08de141a6..33be22e50 100644
--- a/src/ops/multihead_self_attention_impl.cu
+++ b/src/ops/multihead_self_attention_impl.cu
@@ -26,141 +26,475 @@ namespace flashinfer {
 // }
 // head_dim[] = {64, 128, 256};
 
-
 /********** batch append instantiations for half precision **********/
 
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 64,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 128,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 256,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 64,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 128,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 256,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCustom, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
 
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
 
 /********** batch prefill instantiations for half precision **********/
 
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 64,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 128,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x2, 256,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 64,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 128,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
-
-template cudaError_t BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices, WarpLayout::k4x1x1, 256,
-          LogitsPostHook::kNone, QKVLayout::kNHD, PosEncodingMode::kNone,
-          false, MaskMode::kCausal, half, half, int32_t>(
-  half* q, int32_t* request_indices, int32_t* q_tile_indices, int32_t* kv_tile_indices,
-  int32_t* q_indptr, int32_t* q_offset,
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv, uint8_t* custom_mask,
-  int32_t* qk_indptr, int32_t* o_indptr, half* o, half* tmp_v, float* tmp_s, float* lse,
-  int32_t* merge_indptr, bool* block_valid_mask, int32_t* kv_chunk_size_ptr,
-  uint32_t total_num_rows, uint32_t num_qo_heads, uint32_t padded_batch_size,
-  float logits_soft_cap, float sm_scale, float rope_scale, float rope_theta, cudaStream_t stream);
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x2,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k4x1x1,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           QKVLayout::kNHD,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
 } // namespace flashinfer
diff --git a/src/ops/noop.cc b/src/ops/noop.cc
index a5561bcd1..a4b3222e7 100644
--- a/src/ops/noop.cc
+++ b/src/ops/noop.cc
@@ -258,11 +258,12 @@ void NoOp::init(FFModel const &ff) {
 
 void NoOp::forward(FFModel const &ff) {}
 
-FutureMap NoOp::inference(FFModel const &ff,
-                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                          std::vector<ParallelTensor> const &batch_inputs,
-                          std::vector<ParallelTensor> const &batch_outputs,
-                          MachineView const *mv) {
+FutureMap NoOp::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   FutureMap empty;
   return empty;
 }
diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc
index b85ecdd13..be4ac4833 100644
--- a/src/ops/residual_layer_norm.cc
+++ b/src/ops/residual_layer_norm.cc
@@ -536,7 +536,7 @@ Op *ResidualLayerNorm::materialize(FFModel &ff,
 
 FutureMap ResidualLayerNorm::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index 2723cb06b..78973a165 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -363,12 +363,12 @@ void ResidualRMSNorm::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap
-    ResidualRMSNorm::inference(FFModel const &ff,
-                               /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                               std::vector<ParallelTensor> const &batch_inputs,
-                               std::vector<ParallelTensor> const &batch_outputs,
-                               MachineView const *mv) {
+FutureMap ResidualRMSNorm::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc
index 56cf08147..3070368ff 100644
--- a/src/ops/rms_norm.cc
+++ b/src/ops/rms_norm.cc
@@ -339,11 +339,12 @@ void RMSNorm::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap RMSNorm::inference(FFModel const &ff,
-                             /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                             std::vector<ParallelTensor> const &batch_inputs,
-                             std::vector<ParallelTensor> const &batch_outputs,
-                             MachineView const *mv) {
+FutureMap RMSNorm::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index 2495f86bd..3b3d75146 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -262,7 +262,7 @@ void SigmoidSiluMulti::backward(FFModel const &ff) {
 
 FutureMap SigmoidSiluMulti::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index cf605f8fd..76991bd52 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -36,7 +36,8 @@ __global__ void spec_store_kv_cache(
     DT const *devQKVProjArray,
     DT *kCache_ptr,
     DT *vCache_ptr,
-    /* Reserved: BatchConfig Updated, leave HIP code to be updated */BatchConfig::PerTokenInfo *tokenInfos,
+    /* Reserved: BatchConfig Updated, leave HIP code to be updated */
+    BatchConfig::PerTokenInfo *tokenInfos,
     BatchConfig::PerRequestInfo *requestInfo,
     TreeSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos,
     TreeSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 706ea4f79..0e489fa07 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -218,13 +218,19 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   BatchPrefillHandler *handler = nullptr;
 
   if (!bc->prompt_phase) {
-    assert(m->handle.tree_search_attention_metadata->decode_handler_collections.count(batch_size) != 0 &&
+    assert(m->handle.tree_search_attention_metadata->decode_handler_collections
+                   .count(batch_size) != 0 &&
            "Handler is not initialized");
-    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_search_attention_metadata->decode_handler_collections[batch_size]);
+    handler = static_cast<BatchPrefillHandler *>(
+        m->handle.tree_search_attention_metadata
+            ->decode_handler_collections[batch_size]);
   } else {
-    assert(m->handle.tree_search_attention_metadata->prompt_handler_collections.count(batch_size) != 0 &&
+    assert(m->handle.tree_search_attention_metadata->prompt_handler_collections
+                   .count(batch_size) != 0 &&
            "Handler is not initialized");
-    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_search_attention_metadata->prompt_handler_collections[batch_size]);
+    handler = static_cast<BatchPrefillHandler *>(
+        m->handle.tree_search_attention_metadata
+            ->prompt_handler_collections[batch_size]);
   }
 
   //   cudaEventRecord(t_end, stream);
@@ -241,67 +247,67 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  DISPATCH_HEADDIM(
-    head_dim, HEAD_DIM, {
-      cudaError_t result;
-      if (bc->prompt_phase) {
-        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-            PageStorage::kIndices,
-            HEAD_DIM,
-            LogitsPostHook::kNone,
-            QKVLayout::kNHD,
-            PosEncodingMode::kNone,
-            false,
-            MaskMode::kCausal,
-            half,
-            half,
-            int32_t>(handler,
-                      q,
-                      m->handle.tree_search_attention_metadata->q_indptr,
-                      /*q_offset=*/nullptr,
-                      paged_kv,
-                      /*custom_mask=*/nullptr,
-                      /*qk_indptr=*/nullptr,
-                      o,
-                      /*lse=*/nullptr,
-                      num_q_heads,
-                      /*logits_soft_cap=*/0.f,
-                      sm_scale,
-                      /*rope_scale=*/1.f,
-                      /*rope_theta=*/static_cast<float>(1e4),
-                      stream);
-      } else {
-        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-            PageStorage::kIndices,
-            HEAD_DIM,
-            LogitsPostHook::kNone,
-            QKVLayout::kNHD,
-            PosEncodingMode::kNone,
-            false,
-            MaskMode::kCustom,
-            half,
-            half,
-            int32_t>(handler,
-                      q,
-                      m->handle.tree_search_attention_metadata->q_indptr,
-                      /*q_offset=*/nullptr,
-                      paged_kv,
-                      m->handle.tree_search_attention_metadata->custom_mask,
-                      m->handle.tree_search_attention_metadata->qk_indptr,
-                      o,
-                      /*lse=*/nullptr,
-                      num_q_heads,
-                      /*logits_soft_cap=*/0.f,
-                      sm_scale,
-                      /*rope_scale=*/1.f,
-                      /*rope_theta=*/static_cast<float>(1e4),
-                      stream);
-      }
+  DISPATCH_HEADDIM(head_dim, HEAD_DIM, {
+    cudaError_t result;
+    if (bc->prompt_phase) {
+      result =
+          BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                        HEAD_DIM,
+                                                        LogitsPostHook::kNone,
+                                                        QKVLayout::kNHD,
+                                                        PosEncodingMode::kNone,
+                                                        false,
+                                                        MaskMode::kCausal,
+                                                        half,
+                                                        half,
+                                                        int32_t>(
+              handler,
+              q,
+              m->handle.tree_search_attention_metadata->q_indptr,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              /*custom_mask=*/nullptr,
+              /*qk_indptr=*/nullptr,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    } else {
+      result =
+          BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                        HEAD_DIM,
+                                                        LogitsPostHook::kNone,
+                                                        QKVLayout::kNHD,
+                                                        PosEncodingMode::kNone,
+                                                        false,
+                                                        MaskMode::kCustom,
+                                                        half,
+                                                        half,
+                                                        int32_t>(
+              handler,
+              q,
+              m->handle.tree_search_attention_metadata->q_indptr,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              m->handle.tree_search_attention_metadata->custom_mask,
+              m->handle.tree_search_attention_metadata->qk_indptr,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    }
     if (result != cudaSuccess) {
-      throw std::runtime_error(
-          "Failed to run "
-          "BatchPrefillWithPagedKVCacheWrapperDispatched" +
-          std::string(cudaGetErrorString(result)));
+      throw std::runtime_error("Failed to run "
+                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               std::string(cudaGetErrorString(result)));
     }
   });
 
@@ -515,10 +521,12 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
 
 SpecIncMultiHeadSelfAttentionMeta::~SpecIncMultiHeadSelfAttentionMeta(void) {
   // for (auto &decode_handler: decode_handler_collections) {
-  //   delete static_cast<flashinfer::BatchPrefillHandler *>(decode_handler.second);
+  //   delete static_cast<flashinfer::BatchPrefillHandler
+  //   *>(decode_handler.second);
   // }
   // for (auto &prompt_handler: prompt_handler_collections) {
-  //   delete static_cast<flashinfer::BatchPrefillHandler *>(prompt_handler.second);
+  //   delete static_cast<flashinfer::BatchPrefillHandler
+  //   *>(prompt_handler.second);
   // }
 }
 
diff --git a/src/ops/split.cc b/src/ops/split.cc
index a9b01dc21..f3deadd80 100644
--- a/src/ops/split.cc
+++ b/src/ops/split.cc
@@ -249,11 +249,12 @@ void Split::forward(FFModel const &ff) {
   }
   runtime->execute_index_space(ctx, launcher);
 }
-FutureMap Split::inference(FFModel const &ff,
-                           /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                           std::vector<ParallelTensor> const &batch_inputs,
-                           std::vector<ParallelTensor> const &batch_outputs,
-                           MachineView const *mv) {
+FutureMap Split::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/topk.cc b/src/ops/topk.cc
index 17512328e..b8d53fb24 100644
--- a/src/ops/topk.cc
+++ b/src/ops/topk.cc
@@ -269,11 +269,12 @@ void TopK::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap TopK::inference(FFModel const &ff,
-                          /* Reserved: BatchConfig Updated */BatchConfigFuture const &bc,
-                          std::vector<ParallelTensor> const &batch_inputs,
-                          std::vector<ParallelTensor> const &batch_outputs,
-                          MachineView const *mv) {
+FutureMap TopK::inference(
+    FFModel const &ff,
+    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    std::vector<ParallelTensor> const &batch_inputs,
+    std::vector<ParallelTensor> const &batch_outputs,
+    MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 23e71c699..1ca94692a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -301,13 +301,19 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   BatchPrefillHandler *handler = nullptr;
 
   if (!bc->prompt_phase) {
-    assert(m->handle.tree_verify_attention_metadata->decode_handler_collections.count(batch_size) != 0 &&
+    assert(m->handle.tree_verify_attention_metadata->decode_handler_collections
+                   .count(batch_size) != 0 &&
            "Handler is not initialized");
-    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_verify_attention_metadata->decode_handler_collections[batch_size]);
+    handler = static_cast<BatchPrefillHandler *>(
+        m->handle.tree_verify_attention_metadata
+            ->decode_handler_collections[batch_size]);
   } else {
-    assert(m->handle.tree_verify_attention_metadata->prompt_handler_collections.count(batch_size) != 0 &&
+    assert(m->handle.tree_verify_attention_metadata->prompt_handler_collections
+                   .count(batch_size) != 0 &&
            "Handler is not initialized");
-    handler = static_cast<BatchPrefillHandler *>(m->handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size]);
+    handler = static_cast<BatchPrefillHandler *>(
+        m->handle.tree_verify_attention_metadata
+            ->prompt_handler_collections[batch_size]);
   }
 
   //   cudaEventRecord(t_end, stream);
@@ -324,67 +330,67 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  DISPATCH_HEADDIM(
-    head_dim, HEAD_DIM, {
-      cudaError_t result;
-      if (bc->prompt_phase) {
-        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-            PageStorage::kIndices,
-            HEAD_DIM,
-            LogitsPostHook::kNone,
-            QKVLayout::kNHD,
-            PosEncodingMode::kNone,
-            false,
-            MaskMode::kCausal,
-            half,
-            half,
-            int32_t>(handler,
-                      q,
-                      m->handle.tree_verify_attention_metadata->q_indptr,
-                      /*q_offset=*/nullptr,
-                      paged_kv,
-                      /*custom_mask=*/nullptr,
-                      /*qk_indptr=*/nullptr,
-                      o,
-                      /*lse=*/nullptr,
-                      num_q_heads,
-                      /*logits_soft_cap=*/0.f,
-                      sm_scale,
-                      /*rope_scale=*/1.f,
-                      /*rope_theta=*/static_cast<float>(1e4),
-                      stream);
-      } else {
-        result = BatchPrefillWithPagedKVCacheWrapperDispatched<
-            PageStorage::kIndices,
-            HEAD_DIM,
-            LogitsPostHook::kNone,
-            QKVLayout::kNHD,
-            PosEncodingMode::kNone,
-            false,
-            MaskMode::kCustom,
-            half,
-            half,
-            int32_t>(handler,
-                      q,
-                      m->handle.tree_verify_attention_metadata->q_indptr,
-                      /*q_offset=*/nullptr,
-                      paged_kv,
-                      m->handle.tree_verify_attention_metadata->custom_mask,
-                      m->handle.tree_verify_attention_metadata->qk_indptr,
-                      o,
-                      /*lse=*/nullptr,
-                      num_q_heads,
-                      /*logits_soft_cap=*/0.f,
-                      sm_scale,
-                      /*rope_scale=*/1.f,
-                      /*rope_theta=*/static_cast<float>(1e4),
-                      stream);
-      }
+  DISPATCH_HEADDIM(head_dim, HEAD_DIM, {
+    cudaError_t result;
+    if (bc->prompt_phase) {
+      result =
+          BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                        HEAD_DIM,
+                                                        LogitsPostHook::kNone,
+                                                        QKVLayout::kNHD,
+                                                        PosEncodingMode::kNone,
+                                                        false,
+                                                        MaskMode::kCausal,
+                                                        half,
+                                                        half,
+                                                        int32_t>(
+              handler,
+              q,
+              m->handle.tree_verify_attention_metadata->q_indptr,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              /*custom_mask=*/nullptr,
+              /*qk_indptr=*/nullptr,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    } else {
+      result =
+          BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                        HEAD_DIM,
+                                                        LogitsPostHook::kNone,
+                                                        QKVLayout::kNHD,
+                                                        PosEncodingMode::kNone,
+                                                        false,
+                                                        MaskMode::kCustom,
+                                                        half,
+                                                        half,
+                                                        int32_t>(
+              handler,
+              q,
+              m->handle.tree_verify_attention_metadata->q_indptr,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              m->handle.tree_verify_attention_metadata->custom_mask,
+              m->handle.tree_verify_attention_metadata->qk_indptr,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    }
     if (result != cudaSuccess) {
-      throw std::runtime_error(
-          "Failed to run "
-          "BatchPrefillWithPagedKVCacheWrapperDispatched" +
-          std::string(cudaGetErrorString(result)));
+      throw std::runtime_error("Failed to run "
+                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               std::string(cudaGetErrorString(result)));
     }
   });
 
@@ -736,17 +742,17 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             sizeof(BatchConfig::requestsInfo) +
             sizeof(BatchConfig::request_available) +
             sizeof(BatchConfig::causalMask));
-    num_tokens_to_commit = 
-        reinterpret_cast<int *>(
-            reinterpret_cast<char *>(committed_token_infos) +
-            sizeof(BatchConfig::committed_tokens));
+    num_tokens_to_commit = reinterpret_cast<int *>(
+        reinterpret_cast<char *>(committed_token_infos) +
+        sizeof(BatchConfig::committed_tokens));
   }
 
   cudaStreamSynchronize(stream);
 }
 
 TreeIncMultiHeadSelfAttentionMeta::~TreeIncMultiHeadSelfAttentionMeta(void) {
-  // delete static_cast<flashinfer::BatchPrefillHandler *>(batch_prefill_handler);
+  // delete static_cast<flashinfer::BatchPrefillHandler
+  // *>(batch_prefill_handler);
 }
 
 }; // namespace FlexFlow
diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc
index 5d38e2890..f3b1a7eed 100644
--- a/src/parallel_ops/allreduce.cc
+++ b/src/parallel_ops/allreduce.cc
@@ -106,7 +106,12 @@ OpMeta *AllReduce::init_task(Task const *task,
                              Runtime *runtime) {
   AllReduce *ar = (AllReduce *)task->args;
   FFHandler handle = *((FFHandler const *)task->local_args);
-  AllReduceMeta *meta = new AllReduceMeta(handle, ar);
+  Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
+                       .only_kind(Memory::GPU_FB_MEM)
+                       .best_affinity_to(task->target_proc)
+                       .first();
+  MemoryAllocator gpu_mem_allocator(gpu_mem);
+  AllReduceMeta *meta = new AllReduceMeta(handle, ar, gpu_mem_allocator);
   meta->input_type[0] = ar->inputs[0]->data_type;
   meta->output_type[0] = ar->outputs[0]->data_type;
   assert(meta->input_type[0] == meta->output_type[0]);
@@ -326,7 +331,7 @@ void AllReduce::inference_task(Task const *task,
   assert(regions.size() == 2);
   assert(task->regions.size() == 2);
 
-  AllReduceMeta const *m = *((AllReduceMeta **)task->local_args);
+  AllReduceMeta *m = *((AllReduceMeta **)task->local_args);
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
 
   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 2c000137a..02fb760fd 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -15,37 +15,178 @@
 
 #include "flexflow/parallel_ops/kernels/allreduce_kernels.h"
 #include "flexflow/utils/cuda_helper.h"
+#include "tensorrt_llm/custom_allreduce_kernels.h"
+#include <cuda_runtime.h>
 
 namespace FlexFlow {
 
-AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct)
-    : OpMeta(handle) {}
+AllReduceMeta::AllReduceMeta(FFHandler handle,
+                             AllReduce const *reduct,
+                             MemoryAllocator &gpu_mem_allocator)
+    : OpMeta(handle) {
+  barrier_ptr_size = sizeof(uint32_t) *
+                     (tensorrt_llm::MAX_ALL_REDUCE_BLOCKS + 2) *
+                     tensorrt_llm::MAX_RANKS_PER_NODE;
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst,
+      sizeof(void *) * (handle.num_devices + 1) + barrier_ptr_size * 2);
+  allgather_src = gpu_mem_allocator.allocate_instance_untyped(sizeof(void *));
+  allgather_dst = gpu_mem_allocator.allocate_instance_untyped(
+      sizeof(void *) * handle.num_devices);
+  // Create barrier helpers for all-reduce.
+  barrier_in_ptr =
+      gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size);
+  barrier_out_ptr =
+      gpu_mem_allocator.allocate_instance_untyped(barrier_ptr_size);
+  checkCUDA(cudaMemset(barrier_in_ptr, 0, barrier_ptr_size));
+  checkCUDA(cudaMemset(barrier_out_ptr, 0, barrier_ptr_size));
+  // Reset allocated memory to zero.
+  // We explicitly synchronize after memset, to make sure memset finishes
+  // before using all-gather to exchange peer pointers.
+  // This is important to ensure the memory reset get ordered
+  // before any other peers read the memory.
+  checkCUDA(cudaDeviceSynchronize());
+}
+
+AllReduceMeta::~AllReduceMeta() {
+  if (reserveInst != Realm::RegionInstance::NO_INST) {
+    reserveInst.destroy();
+  }
+}
 
 namespace Kernels {
 namespace AllReduce {
 
-void inference_kernel_wrapper(AllReduceMeta const *m,
+CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
+                                               int num_devices,
+                                               int device_id,
+                                               ncclComm_t ncclComm,
+                                               void *local_ptr,
+                                               cudaStream_t stream) {
+  auto iter = m->comm_bufs.find(local_ptr);
+  if (iter != m->comm_bufs.end()) {
+    return iter->second;
+  } else {
+    CommunicationBuffer *comm_buffer =
+        create_comm_buf_with_local_ptr(num_devices,
+                                       device_id,
+                                       ncclComm,
+                                       m->allgather_src,
+                                       m->allgather_dst,
+                                       local_ptr,
+                                       m->barrier_in_ptr,
+                                       m->barrier_out_ptr,
+                                       &(m->barrier_flag),
+                                       stream);
+    m->comm_bufs[local_ptr] = comm_buffer;
+    return comm_buffer;
+  }
+}
+
+// Get the number of bits for a given data type.
+inline int get_bits(DataType dtype) {
+  switch (dtype) {
+    case DataType::DT_INT64:
+    case DataType::DT_DOUBLE:
+      return 64;
+    case DataType::DT_INT32:
+    case DataType::DT_FLOAT:
+      return 32;
+    case DataType::DT_HALF:
+      return 16;
+    case DataType::DT_INT8:
+      return 8;
+    case DataType::DT_INT4:
+      return 4;
+    default:
+      assert(false && "Unsupported data type");
+  }
+}
+
+// Check if customized all-reduce kernels can be applied.
+inline bool CanApplyCustomAllReduce(int64_t num_elements, DataType dtype) {
+  // The customized all-reduce kernel has the following requirement(s).
+  return num_elements % (16 / ((get_bits(dtype) + 7) / 8)) == 0;
+}
+
+// Check if the two-shot customized all-reduce kernel can be applied.
+inline bool CanApplyTwoShotAllReduce(int64_t num_elements,
+                                     DataType dtype,
+                                     int num_workers) {
+  // The two-shot customized all-reduce kernel has the following requirement(s).
+  return (num_elements / num_workers) % (16 / ((get_bits(dtype) + 7) / 8)) == 0;
+}
+
+// Customized all-reduce kernel backed by CUDA Peer memory.
+void inference_kernel_wrapper(AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output) {
+#ifndef FF_USE_NCCL
+  assert(false && "Must enable FF_USE_NCCL to use AllReduce operators");
+#endif
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   assert(input.data_type == output.data_type);
   assert(input.domain == output.domain);
   size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1;
   size_t num_elements = bc->num_tokens * hidden_dim_size;
-#ifdef FF_USE_NCCL
-  ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type);
-  checkNCCL(ncclAllReduce(input.ptr,
-                          output.ptr,
-                          num_elements,
-                          nccl_data_type,
-                          ncclSum,
-                          m->handle.ncclComm,
-                          stream));
-#else
-  assert(false && "Must enable FF_USE_NCCL to use AllReduce operators");
-#endif
+  int num_devices = m->handle.num_devices;
+  int device_id = m->handle.device_id;
+  ncclComm_t ncclComm = m->handle.ncclComm;
+  DataType dtype = input.data_type;
+
+  tensorrt_llm::AllReduceStrategyType strategy =
+      tensorrt_llm::SelectImplementation(
+          num_elements * ((get_bits(dtype) + 7) / 8), num_devices);
+
+  if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
+      !CanApplyCustomAllReduce(num_elements, dtype)) {
+    // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
+    ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
+    checkNCCL(ncclAllReduce(input.ptr,
+                            output.ptr,
+                            num_elements,
+                            nccl_data_type,
+                            ncclSum,
+                            ncclComm,
+                            stream));
+    return;
+  }
+
+  // Initialize the all-reduce kernel arguments.
+  tensorrt_llm::AllReduceParams params;
+  params.ranks_per_node = num_devices;
+  params.rank = device_id;
+  params.local_rank = device_id;
+  CommunicationBuffer *comm_buffer =
+      get_or_create_comm_buffer(m,
+                                num_devices,
+                                device_id,
+                                ncclComm,
+                                const_cast<void *>(input.ptr),
+                                stream);
+  params.barrier_flag = (*comm_buffer->barrier_flag)++;
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
+  }
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_barrier_ptrs_in[i] =
+        reinterpret_cast<uint32_t *>(comm_buffer->barrier_in[i]);
+  }
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_barrier_ptrs_out[i] =
+        reinterpret_cast<uint32_t *>(comm_buffer->barrier_out[i]);
+  }
+
+  if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) {
+    // Two-shot all-reduce does not support this case.
+    // So we fallback to the one-shot strategy.
+    strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
+  }
+
+  tensorrt_llm::customAllReduce(
+      params, output.ptr, num_elements, dtype, strategy, stream);
 }
 
 void forward_kernel_wrapper(AllReduceMeta const *m,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 1099033b6..d74f8084c 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -171,8 +171,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
     os << "Committed tokens info:\n";
     for (int i = 0; i < bc.num_tokens_to_commit; i++) {
       os << "  Token " << i << ":\n";
-      os << "    Index in kv cache: " << bc.committed_tokens[i].index_in_kv_cache
-         << std::endl;
+      os << "    Index in kv cache: "
+         << bc.committed_tokens[i].index_in_kv_cache << std::endl;
       os << "    Request index: " << bc.committed_tokens[i].request_index
          << std::endl;
       os << "    Token depth: " << bc.committed_tokens[i].token_depth
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 0c41091e5..3543f88d2 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -1605,15 +1605,17 @@ T SearchHelper::graph_cost(Graph const *graph,
                            bool include_sink_compute_time) const {
   TAG_ENTER(this->logger);
   this->logger->debug() << "PCG::SearchHelper::graph_cost: sink("
-                        << sink.node.guid << ") " << "sink.view("
-                        << sink.view.ndims << " " << sink.view.start_device_id
-                        << " " << sink.view.dim[0] << ") " << "source("
-                        << source.node.guid << ") " << "source.view("
-                        << source.view.ndims << " "
+                        << sink.node.guid << ") "
+                        << "sink.view(" << sink.view.ndims << " "
+                        << sink.view.start_device_id << " " << sink.view.dim[0]
+                        << ") "
+                        << "source(" << source.node.guid << ") "
+                        << "source.view(" << source.view.ndims << " "
                         << source.view.start_device_id << " "
-                        << source.view.dim[0] << ") " << "resources("
-                        << resources.num_nodes << " " << resources.start_gpu_id
-                        << " " << resources.available_gpus_per_node << ")";
+                        << source.view.dim[0] << ") "
+                        << "resources(" << resources.num_nodes << " "
+                        << resources.start_gpu_id << " "
+                        << resources.available_gpus_per_node << ")";
   if (this->model->config.profiling) {
     graph->print_dot();
   }
@@ -1736,11 +1738,11 @@ T SearchHelper::graph_cost(Graph const *graph,
     this->logger->spew() << "  op_total_mem: " << metrics.op_total_mem;
     float op_total_mem_mb = (float)((metrics.op_total_mem) / 1e4) / 1e2;
     this->logger->debug() << "[PCG::SearchHelper::graph_cost] Sink node cost ["
-                          << sink.node.to_string() << "]: " << "forward("
-                          << metrics.forward_time << ") " << "backward("
-                          << metrics.backward_time << ") " << "sync("
-                          << metrics.sync_time << ") " << "memory("
-                          << op_total_mem_mb << " MB)";
+                          << sink.node.to_string() << "]: "
+                          << "forward(" << metrics.forward_time << ") "
+                          << "backward(" << metrics.backward_time << ") "
+                          << "sync(" << metrics.sync_time << ") "
+                          << "memory(" << op_total_mem_mb << " MB)";
     this->add_sink_node_costs<T>(sink, metrics, &result);
   }
 
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index a27c2c0f9..fe4a8d4af 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -605,6 +605,29 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task,
   checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank));
   // fprintf(stderr, "ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n",
   //     ncclComm, allRanks, myRank, ncclId);
+
+  // Double check that we already enabled P2P access between all GPUs
+  for (int i = 0; i < allRanks; i++) {
+    if (i == myRank) {
+      continue;
+    }
+    cudaError_t err = cudaDeviceEnablePeerAccess(i, 0);
+    if (err == cudaSuccess) {
+      printf("P2P access successfully enabled between GPU %d and GPU %d\n",
+             myRank,
+             i);
+    } else if (err == cudaErrorPeerAccessAlreadyEnabled) {
+      printf("P2P access is already enabled between GPU %d and GPU %d\n",
+             myRank,
+             i);
+    } else {
+      printf("Failed to enable P2P access between GPU %d and GPU %d: %s\n",
+             myRank,
+             i,
+             cudaGetErrorString(err));
+      assert(false && "Failed to enable P2P access");
+    }
+  }
   return ncclComm;
 }
 
@@ -1243,12 +1266,15 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff,
 #define DIMFUNC(DIM)                                                           \
   case DIM: {                                                                  \
     Rect<DIM> rect = domain;                                                   \
-    int idx = 0;                                                               \
+    int idx = 0, num_devices = rect.volume();                                  \
     for (PointInRectIterator<DIM> it(rect); it(); it++) {                      \
       FFHandler handle = ff.handlers[view.get_device_id(*it)];                 \
       if (op_type == OP_ALLREDUCE) {                                           \
         ncclComm_t *nccl_comms = ff.find_nccl_comms(view);                     \
-        handle.ncclComm = nccl_comms[idx++];                                   \
+        handle.ncclComm = nccl_comms[idx];                                     \
+        handle.num_devices = num_devices;                                      \
+        handle.device_id = idx;                                                \
+        idx++;                                                                 \
       }                                                                        \
       argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler)));         \
     }                                                                          \
diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp
index 37eda6e35..2f8631b24 100644
--- a/src/runtime/model.cpp
+++ b/src/runtime/model.cpp
@@ -155,7 +155,9 @@ FFHandler
   } else {
     handle.offload_reserve_space = nullptr;
   }
-  if (handle.batch_config_metadata_size + handle.attention_metadata->mem_size() > 0) {
+  if (handle.batch_config_metadata_size +
+          handle.attention_metadata->mem_size() >
+      0) {
     // allocate memory for offload reserve space
     Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                          .only_kind(Memory::GPU_FB_MEM)
@@ -163,7 +165,8 @@ FFHandler
                          .first();
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
-        Realm::Point<1, coord_t>(handle.batch_config_metadata_size + handle.attention_metadata->mem_size() - 1));
+        Realm::Point<1, coord_t>(handle.batch_config_metadata_size +
+                                 handle.attention_metadata->mem_size() - 1));
     std::vector<size_t> field_sizes;
     field_sizes.push_back(sizeof(char));
     Realm::RegionInstance workspaceInst;
@@ -187,6 +190,8 @@ FFHandler
   // checkCUDA(hipMalloc(&handle.workSpace, handle.workSpaceSize));
 #ifdef FF_USE_NCCL
   handle.ncclComm = NULL;
+  handle.num_devices = 0;
+  handle.device_id = 0;
 #endif
   return handle;
 }
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index ab42539cb..67c13b201 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -91,8 +91,10 @@ FFHandler
   handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion;
   handle.tree_search_attention_metadata = new AttentionMetaData();
   handle.tree_verify_attention_metadata = new AttentionMetaData();
-  assert(handle.tree_search_attention_metadata != nullptr && "Attention metadata must be allocated");
-  assert(handle.tree_verify_attention_metadata != nullptr && "Attention metadata must be allocated");
+  assert(handle.tree_search_attention_metadata != nullptr &&
+         "Attention metadata must be allocated");
+  assert(handle.tree_verify_attention_metadata != nullptr &&
+         "Attention metadata must be allocated");
   checkCUDA(cublasCreate(&handle.blas));
   if (handle.allowTensorOpMathConversion) {
     checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH));
@@ -155,7 +157,10 @@ FFHandler
   } else {
     handle.offload_reserve_space = nullptr;
   }
-  if (handle.batch_config_metadata_size + handle.tree_search_attention_metadata->mem_size() + handle.tree_verify_attention_metadata->mem_size() > 0) {
+  if (handle.batch_config_metadata_size +
+          handle.tree_search_attention_metadata->mem_size() +
+          handle.tree_verify_attention_metadata->mem_size() >
+      0) {
     // allocate memory for offload reserve space
     Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                          .only_kind(Memory::GPU_FB_MEM)
@@ -163,7 +168,10 @@ FFHandler
                          .first();
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
-        Realm::Point<1, coord_t>(handle.batch_config_metadata_size + handle.tree_search_attention_metadata->mem_size() + handle.tree_verify_attention_metadata->mem_size() - 1));
+        Realm::Point<1, coord_t>(
+            handle.batch_config_metadata_size +
+            handle.tree_search_attention_metadata->mem_size() +
+            handle.tree_verify_attention_metadata->mem_size() - 1));
     std::vector<size_t> field_sizes;
     field_sizes.push_back(sizeof(char));
     Realm::RegionInstance workspaceInst;
@@ -194,6 +202,8 @@ FFHandler
   // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize));
 #ifdef FF_USE_NCCL
   handle.ncclComm = NULL;
+  handle.num_devices = 0;
+  handle.device_id = 0;
 #endif
   return handle;
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e339168f3..3a6619b37 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -567,23 +567,23 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       output_file.close();
     }
   }
-  RequestProfileInfo profile_info = profiling_requests[guid];
-  std::string str =
-      "[" + std::to_string(guid) +
-      "] Request completed:" + " decoding_time_ms(" +
-      std::to_string(
-          (profile_info.finish_time - profile_info.start_decoding_time) *
-          1e-3) +
-      ")" + " total_time_ms(" +
-      std::to_string((profile_info.finish_time - profile_info.start_time) *
-                     1e-3) +
-      ")" + " LLM_decoding_steps(" +
-      std::to_string(profile_info.llm_decoding_steps) + ")";
-  if (decoding_mode == SPECULATIVE_DECODING) {
-    str = str + " SSM_decoding_steps(" +
-          std::to_string(profile_info.ssm_decoding_steps) + ")";
-  }
-  write_to_output_file("", str);
+  // RequestProfileInfo profile_info = profiling_requests[guid];
+  // std::string str =
+  //     "[" + std::to_string(guid) +
+  //     "] Request completed:" + " decoding_time_ms(" +
+  //     std::to_string(
+  //         (profile_info.finish_time - profile_info.start_decoding_time) *
+  //         1e-3) +
+  //     ")" + " total_time_ms(" +
+  //     std::to_string((profile_info.finish_time - profile_info.start_time) *
+  //                    1e-3) +
+  //     ")" + " LLM_decoding_steps(" +
+  //     std::to_string(profile_info.llm_decoding_steps) + ")";
+  // if (decoding_mode == SPECULATIVE_DECODING) {
+  //   str = str + " SSM_decoding_steps(" +
+  //         std::to_string(profile_info.ssm_decoding_steps) + ")";
+  // }
+  // write_to_output_file("", str);
 
   trigger_request_completion_future(guid);
 }
@@ -1441,14 +1441,14 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  if (all_request_last_layer_empty or current_ssm_step == get_max_tree_depth()) {
+  if (all_request_last_layer_empty or
+      current_ssm_step == get_max_tree_depth()) {
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
         (Realm::Clock::current_time_in_microseconds() -
          profiling.ssm_step_start) *
         1e-3);
     profiling.ssm_steps.push_back(current_ssm_step);
-    printf("SSM step finished\n");
   }
   return all_request_last_layer_empty;
 }
@@ -2022,6 +2022,7 @@ void RequestManager::serve_decoding(FFModel *llm) {
   std::queue<InferenceResultFuture> batch_pipeline;
   { batch_pipeline.push(last_irf); }
 
+  reset_profiling_statistics();
   while (!is_background_server_terminated()) {
 
     if (batch_pipeline.size() >= 4) {
@@ -2089,6 +2090,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   std::queue<InferenceResultFuture> infer_result_future_pipeline;
   infer_result_future_pipeline.push(irf_0);
 
+  reset_profiling_statistics();
   while (!is_background_server_terminated()) {
     if (infer_result_future_pipeline.size() >= 4) {
       // Block here to avoid launching too many batches
@@ -2198,47 +2200,81 @@ void RequestManager::terminate_background_server() {
     assert(profiling.llm_step_times.size() ==
            profiling.requests_per_step.size());
     // Write the last profiling statistics to output file
-    std::string str = "[Profiling Statistics]\n llm_step_times_ms(";
-    std::string llm_step_times_ms = " ";
-    for (double time : profiling.llm_step_times) {
-      llm_step_times_ms += std::to_string(time) + " ";
-    }
-    llm_step_times_ms += ")";
-    str += llm_step_times_ms;
-    str += "\n requests_per_step(";
-    std::string req_per_step = " ";
+    std::string str = "[Profiling Statistics]";
+
+    long long total_time = Realm::Clock::current_time_in_microseconds() -
+                           profiling.server_start_time;
+    int total_requests = profiling_requests.size();
+    int total_tokens = 0;
+    for (int num_tokens : profiling.generated_tokens_per_step) {
+      total_tokens += num_tokens;
+    }
+    str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
+    str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
+    // throughput
+    str += "\n throughput_requests_per_sec(" +
+           std::to_string(total_requests / (total_time / 1e6)) + ")";
+    str += "\n throughput_tokens_per_sec(" +
+           std::to_string(total_tokens / (total_time / 1e6)) + ")";
+
+    double average_latency_per_request = 0;
+    std::string latency_per_request_ms = "\n latency_per_request_ms( ";
+    for (auto const &profiling_info : profiling_requests) {
+      double latency_ms = (profiling_info.second.finish_time -
+                           profiling_info.second.start_time) /
+                          1000.0;
+      // latency_per_request_ms += "[" + std::to_string(profiling_info.first) +
+      // ","; latency_per_request_ms += std::to_string(latency_ms) + "] ";
+      latency_per_request_ms += std::to_string(latency_ms) + " ";
+      average_latency_per_request += latency_ms;
+    }
+    latency_per_request_ms += ")";
+    str += latency_per_request_ms;
+    average_latency_per_request /= total_requests;
+    str += "\n average_latency_per_request_ms(" +
+           std::to_string(average_latency_per_request) + ")";
+
+    std::string req_per_step = "\n requests_per_step( ";
     for (int nb : profiling.requests_per_step) {
       req_per_step += std::to_string(nb) + " ";
     }
     req_per_step += ")";
     str += req_per_step;
+
     if (profiling.ssm_step_times.size() > 0) {
       // assert(profiling.ssm_step_times.size() ==
       //        profiling.llm_step_times.size());
-      str += "\n ssm_step_times_ms(";
-      std::string ssm_step_times_ms = " ";
+      std::string ssm_step_times_ms = "\n ssm_step_times_ms( ";
       for (double time : profiling.ssm_step_times) {
         ssm_step_times_ms += std::to_string(time) + " ";
       }
       ssm_step_times_ms += ")";
       str += ssm_step_times_ms;
     }
+
     if (profiling.ssm_steps.size() > 0) {
-      str += "\n ssm_steps(";
-      std::string ssm_steps = " ";
+      std::string ssm_steps = "\n ssm_steps( ";
       for (int nb : profiling.ssm_steps) {
         ssm_steps += std::to_string(nb) + " ";
       }
       ssm_steps += ")";
       str += ssm_steps;
     }
-    str += "\n generated_tokens_per_step(";
-    std::string generated_tokens_per_step = " ";
+
+    std::string llm_step_times_ms = "\n llm_step_times_ms( ";
+    for (double time : profiling.llm_step_times) {
+      llm_step_times_ms += std::to_string(time) + " ";
+    }
+    llm_step_times_ms += ")";
+    str += llm_step_times_ms;
+
+    std::string generated_tokens_per_step = "\n generated_tokens_per_step( ";
     for (int nb : profiling.generated_tokens_per_step) {
       generated_tokens_per_step += std::to_string(nb) + " ";
     }
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate
@@ -2540,4 +2576,17 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
 }
 
 /* --------- Request Token Tree Related Functions --------- */
+
+/* --------- Profiling Related Functions --------- */
+void RequestManager::reset_profiling_statistics() {
+  profiling.llm_step_times.clear();
+  profiling.requests_per_step.clear();
+  profiling.ssm_step_times.clear();
+  profiling.ssm_steps.clear();
+  profiling.generated_tokens_per_step.clear();
+  profiling.llm_step_start = 0;
+  profiling.ssm_step_start = 0;
+  profiling.server_start_time = Realm::Clock::current_time_in_microseconds();
+}
+/* --------- Profiling Related Functions --------- */
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 7e03a848e..75c2051b5 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -119,7 +119,7 @@ __global__ void
   qk_indptr[request_idx + 1] = qk_lens;
 }
 
-#define test_bit_orig(bit_mask, idx, pos)                                           \
+#define test_bit_orig(bit_mask, idx, pos)                                      \
   (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
 __global__ void
@@ -153,7 +153,8 @@ __global__ void
   }
 
   int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch,
-            q_start = request_infos[requext_idx_in_batch].first_token_index_in_request;
+            q_start = request_infos[requext_idx_in_batch]
+                          .first_token_index_in_request;
 
   uint8_t packed_bits = 0;
   for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
@@ -163,7 +164,9 @@ __global__ void
     if (kv_idx < q_start || q_idx >= q_length) {
       packed_bits |= 1 << bit_idx;
     } else {
-      if (test_bit_orig(causalMask[requext_idx_in_batch].bit_mask, q_idx, kv_idx - q_start)) {
+      if (test_bit_orig(causalMask[requext_idx_in_batch].bit_mask,
+                        q_idx,
+                        kv_idx - q_start)) {
         packed_bits |= 1 << bit_idx;
       }
     }
@@ -224,8 +227,8 @@ void RequestManager::load_batch_config_task(
   if (batch_config->get_mode() == TREE_SEARCH_MODE) {
     if (handle.tree_search_attention_metadata->enabled()) {
       for (int request_idx = 0;
-          request_idx < BatchConfig::max_requests_per_batch();
-          request_idx++) {
+           request_idx < BatchConfig::max_requests_per_batch();
+           request_idx++) {
         if (batch_config->request_available[request_idx]) {
           checkCUDA(cudaMemcpyAsync(
               static_cast<char *>(handle.batch_config_metadata) +
@@ -240,56 +243,67 @@ void RequestManager::load_batch_config_task(
 
       // calculate the attention meta data
       {
-        BatchConfig::PerRequestInfo *request_infos = reinterpret_cast<BatchConfig::PerRequestInfo *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo));
+        BatchConfig::PerRequestInfo *request_infos =
+            reinterpret_cast<BatchConfig::PerRequestInfo *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo));
         bool *request_available = reinterpret_cast<bool *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo) +
-          sizeof(BatchConfig::requestsInfo));
-        BatchConfig::BitMask *causalMask = reinterpret_cast<BatchConfig::BitMask *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo) +
-          sizeof(BatchConfig::requestsInfo) +
-          sizeof(BatchConfig::request_available));
+            static_cast<char *>(handle.batch_config_metadata) +
+            sizeof(BatchConfig::tokensInfo) +
+            sizeof(BatchConfig::requestsInfo));
+        BatchConfig::BitMask *causalMask =
+            reinterpret_cast<BatchConfig::BitMask *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo) +
+                sizeof(BatchConfig::requestsInfo) +
+                sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
-        uint32_t const max_num_pages = (BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+        uint32_t const max_num_pages =
+            (BatchConfig::max_sequence_length() +
+             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+            kPagesize;
 
         int parallelism = batch_size;
         prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
-                                          stream>>>(batch_size,
-                                                    request_infos,
-                                                    request_available,
-                                                    max_num_pages,
-                                                    handle.tree_search_attention_metadata->q_indptr,
-                                                    handle.tree_search_attention_metadata->kv_indptr,
-                                                    handle.tree_search_attention_metadata->kv_indices,
-                                                    handle.tree_search_attention_metadata->kv_last_page_len,
-                                                    handle.tree_search_attention_metadata->qk_indptr);
+                                          stream>>>(
+            batch_size,
+            request_infos,
+            request_available,
+            max_num_pages,
+            handle.tree_search_attention_metadata->q_indptr,
+            handle.tree_search_attention_metadata->kv_indptr,
+            handle.tree_search_attention_metadata->kv_indices,
+            handle.tree_search_attention_metadata->kv_last_page_len,
+            handle.tree_search_attention_metadata->qk_indptr);
 
         // Update gpu-side custom mask referring from CaualMask
         if (!batch_config->prompt_phase) {
           int parallelism = 0;
-          for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+          for (int req_idx = 0;
+               req_idx < batch_config->max_requests_per_batch();
+               req_idx++) {
             if (batch_config->request_available[req_idx]) {
-              int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-              int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+              int q_len =
+                  batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+              int kv_len =
+                  batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                  batch_config->requestsInfo[req_idx]
+                      .first_token_index_in_request;
               parallelism += (q_len * kv_len + 7) / 8;
             }
           }
           update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
-                                      stream>>>(handle.tree_search_attention_metadata->custom_mask,
-                                                handle.tree_search_attention_metadata->qk_indptr,
-                                                causalMask,
-                                                request_infos,
-                                                request_available,
-                                                batch_size);
+                                      stream>>>(
+              handle.tree_search_attention_metadata->custom_mask,
+              handle.tree_search_attention_metadata->qk_indptr,
+              causalMask,
+              request_infos,
+              request_available,
+              batch_size);
         }
       }
 
@@ -299,54 +313,69 @@ void RequestManager::load_batch_config_task(
         BatchPrefillHandler *handler = nullptr;
 
         if (!batch_config->prompt_phase) {
-          if (handle.tree_search_attention_metadata->decode_handler_collections.count(batch_size) == 0) {
-            handle.tree_search_attention_metadata->decode_handler_collections[batch_size] =
+          if (handle.tree_search_attention_metadata->decode_handler_collections
+                  .count(batch_size) == 0) {
+            handle.tree_search_attention_metadata
+                ->decode_handler_collections[batch_size] =
                 static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
           }
           handler = static_cast<BatchPrefillHandler *>(
-            handle.tree_search_attention_metadata->decode_handler_collections[batch_size]);
+              handle.tree_search_attention_metadata
+                  ->decode_handler_collections[batch_size]);
         } else {
-          if (handle.tree_search_attention_metadata->prompt_handler_collections.count(batch_size) == 0) {
-            handle.tree_search_attention_metadata->prompt_handler_collections[batch_size] =
+          if (handle.tree_search_attention_metadata->prompt_handler_collections
+                  .count(batch_size) == 0) {
+            handle.tree_search_attention_metadata
+                ->prompt_handler_collections[batch_size] =
                 static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
           }
           handler = static_cast<BatchPrefillHandler *>(
-            handle.tree_search_attention_metadata->prompt_handler_collections[batch_size]);
+              handle.tree_search_attention_metadata
+                  ->prompt_handler_collections[batch_size]);
         }
 
-        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+            kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
         q_indptr_h[0] = 0;
         kv_indptr_h[0] = 0;
-        for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+        for (int req_idx = 0, indptr_idx = 0;
+             req_idx < batch_config->max_requests_per_batch();
+             req_idx++) {
           if (batch_config->request_available[req_idx]) {
             int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-            int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                        batch_config->requestsInfo[req_idx].first_token_index_in_request;
+            int kv_len =
+                batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                batch_config->requestsInfo[req_idx]
+                    .first_token_index_in_request;
             q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-            kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+            kv_indptr_h[indptr_idx + 1] =
+                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
             indptr_idx++;
           }
         }
 
         handler->SetCUDAStream(stream);
-        handler->BeginForward<half, int32_t>(static_cast<void*>(
-                                              static_cast<char*>(handle.tree_search_attention_metadata->workspace) +
-                                              handle.tree_search_attention_metadata->workspace_block * batch_size),
-                                            handle.tree_search_attention_metadata->workspace_block,
-                                            static_cast<int32_t *>(q_indptr_h),
-                                            static_cast<int32_t *>(kv_indptr_h),
-                                            batch_size,
-                                            handle.tree_search_attention_metadata->num_q_heads(),
-                                            handle.tree_search_attention_metadata->num_kv_heads(),
-                                            handle.tree_search_attention_metadata->head_dim(),
-                                            kPagesize);
+        handler->BeginForward<half, int32_t>(
+            static_cast<void *>(
+                static_cast<char *>(
+                    handle.tree_search_attention_metadata->workspace) +
+                handle.tree_search_attention_metadata->workspace_block *
+                    batch_size),
+            handle.tree_search_attention_metadata->workspace_block,
+            static_cast<int32_t *>(q_indptr_h),
+            static_cast<int32_t *>(kv_indptr_h),
+            batch_size,
+            handle.tree_search_attention_metadata->num_q_heads(),
+            handle.tree_search_attention_metadata->num_kv_heads(),
+            handle.tree_search_attention_metadata->head_dim(),
+            kPagesize);
       }
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     if (handle.tree_verify_attention_metadata->enabled()) {
       for (int request_idx = 0;
-          request_idx < BatchConfig::max_requests_per_batch();
-          request_idx++) {
+           request_idx < BatchConfig::max_requests_per_batch();
+           request_idx++) {
         if (batch_config->request_available[request_idx]) {
           checkCUDA(cudaMemcpyAsync(
               static_cast<char *>(handle.batch_config_metadata) +
@@ -380,56 +409,67 @@ void RequestManager::load_batch_config_task(
 
       // calculate the attention meta data
       {
-        BatchConfig::PerRequestInfo *request_infos = reinterpret_cast<BatchConfig::PerRequestInfo *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo));
+        BatchConfig::PerRequestInfo *request_infos =
+            reinterpret_cast<BatchConfig::PerRequestInfo *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo));
         bool *request_available = reinterpret_cast<bool *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo) +
-          sizeof(BatchConfig::requestsInfo));
-        BatchConfig::BitMask *causalMask = reinterpret_cast<BatchConfig::BitMask *>(
-          static_cast<char *>(handle.batch_config_metadata) +
-          sizeof(BatchConfig::tokensInfo) +
-          sizeof(BatchConfig::requestsInfo) +
-          sizeof(BatchConfig::request_available));
+            static_cast<char *>(handle.batch_config_metadata) +
+            sizeof(BatchConfig::tokensInfo) +
+            sizeof(BatchConfig::requestsInfo));
+        BatchConfig::BitMask *causalMask =
+            reinterpret_cast<BatchConfig::BitMask *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo) +
+                sizeof(BatchConfig::requestsInfo) +
+                sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
-        uint32_t const max_num_pages = (BatchConfig::max_sequence_length() +
-          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) / kPagesize;
+        uint32_t const max_num_pages =
+            (BatchConfig::max_sequence_length() +
+             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+            kPagesize;
 
         int parallelism = batch_size;
         prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
-                                          stream>>>(batch_size,
-                                                    request_infos,
-                                                    request_available,
-                                                    max_num_pages,
-                                                    handle.tree_verify_attention_metadata->q_indptr,
-                                                    handle.tree_verify_attention_metadata->kv_indptr,
-                                                    handle.tree_verify_attention_metadata->kv_indices,
-                                                    handle.tree_verify_attention_metadata->kv_last_page_len,
-                                                    handle.tree_verify_attention_metadata->qk_indptr);
+                                          stream>>>(
+            batch_size,
+            request_infos,
+            request_available,
+            max_num_pages,
+            handle.tree_verify_attention_metadata->q_indptr,
+            handle.tree_verify_attention_metadata->kv_indptr,
+            handle.tree_verify_attention_metadata->kv_indices,
+            handle.tree_verify_attention_metadata->kv_last_page_len,
+            handle.tree_verify_attention_metadata->qk_indptr);
 
         // Update gpu-side custom mask referring from CaualMask
         if (!batch_config->prompt_phase) {
           int parallelism = 0;
-          for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+          for (int req_idx = 0;
+               req_idx < batch_config->max_requests_per_batch();
+               req_idx++) {
             if (batch_config->request_available[req_idx]) {
-              int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-              int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+              int q_len =
+                  batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+              int kv_len =
+                  batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                  batch_config->requestsInfo[req_idx]
+                      .first_token_index_in_request;
               parallelism += (q_len * kv_len + 7) / 8;
             }
           }
           update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
-                                      stream>>>(handle.tree_verify_attention_metadata->custom_mask,
-                                                handle.tree_verify_attention_metadata->qk_indptr,
-                                                causalMask,
-                                                request_infos,
-                                                request_available,
-                                                batch_size);
+                                      stream>>>(
+              handle.tree_verify_attention_metadata->custom_mask,
+              handle.tree_verify_attention_metadata->qk_indptr,
+              causalMask,
+              request_infos,
+              request_available,
+              batch_size);
         }
       }
 
@@ -439,47 +479,62 @@ void RequestManager::load_batch_config_task(
         BatchPrefillHandler *handler = nullptr;
 
         if (!batch_config->prompt_phase) {
-          if (handle.tree_verify_attention_metadata->decode_handler_collections.count(batch_size) == 0) {
-            handle.tree_verify_attention_metadata->decode_handler_collections[batch_size] =
+          if (handle.tree_verify_attention_metadata->decode_handler_collections
+                  .count(batch_size) == 0) {
+            handle.tree_verify_attention_metadata
+                ->decode_handler_collections[batch_size] =
                 static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
           }
           handler = static_cast<BatchPrefillHandler *>(
-            handle.tree_verify_attention_metadata->decode_handler_collections[batch_size]);
+              handle.tree_verify_attention_metadata
+                  ->decode_handler_collections[batch_size]);
         } else {
-          if (handle.tree_verify_attention_metadata->prompt_handler_collections.count(batch_size) == 0) {
-            handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size] =
+          if (handle.tree_verify_attention_metadata->prompt_handler_collections
+                  .count(batch_size) == 0) {
+            handle.tree_verify_attention_metadata
+                ->prompt_handler_collections[batch_size] =
                 static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
           }
           handler = static_cast<BatchPrefillHandler *>(
-            handle.tree_verify_attention_metadata->prompt_handler_collections[batch_size]);
+              handle.tree_verify_attention_metadata
+                  ->prompt_handler_collections[batch_size]);
         }
 
-        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+            kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
         q_indptr_h[0] = 0;
         kv_indptr_h[0] = 0;
-        for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+        for (int req_idx = 0, indptr_idx = 0;
+             req_idx < batch_config->max_requests_per_batch();
+             req_idx++) {
           if (batch_config->request_available[req_idx]) {
             int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-            int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                        batch_config->requestsInfo[req_idx].first_token_index_in_request;
+            int kv_len =
+                batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                batch_config->requestsInfo[req_idx]
+                    .first_token_index_in_request;
             q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-            kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+            kv_indptr_h[indptr_idx + 1] =
+                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
             indptr_idx++;
           }
         }
 
         handler->SetCUDAStream(stream);
-        handler->BeginForward<half, int32_t>(static_cast<void*>(
-                                              static_cast<char*>(handle.tree_verify_attention_metadata->workspace) +
-                                              handle.tree_verify_attention_metadata->workspace_block * batch_size),
-                                            handle.tree_verify_attention_metadata->workspace_block,
-                                            static_cast<int32_t *>(q_indptr_h),
-                                            static_cast<int32_t *>(kv_indptr_h),
-                                            batch_size,
-                                            handle.tree_verify_attention_metadata->num_q_heads(),
-                                            handle.tree_verify_attention_metadata->num_kv_heads(),
-                                            handle.tree_verify_attention_metadata->head_dim(),
-                                            kPagesize);
+        handler->BeginForward<half, int32_t>(
+            static_cast<void *>(
+                static_cast<char *>(
+                    handle.tree_verify_attention_metadata->workspace) +
+                handle.tree_verify_attention_metadata->workspace_block *
+                    batch_size),
+            handle.tree_verify_attention_metadata->workspace_block,
+            static_cast<int32_t *>(q_indptr_h),
+            static_cast<int32_t *>(kv_indptr_h),
+            batch_size,
+            handle.tree_verify_attention_metadata->num_q_heads(),
+            handle.tree_verify_attention_metadata->num_kv_heads(),
+            handle.tree_verify_attention_metadata->head_dim(),
+            kPagesize);
       }
     }
   }
diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu
new file mode 100644
index 000000000..cd6cc0db4
--- /dev/null
+++ b/src/utils/communication_buffer.cu
@@ -0,0 +1,135 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/utils/communication_buffer.h"
+#include "flexflow/utils/cuda_helper.h"
+#include "tensorrt_llm/custom_allreduce_kernels.h"
+#include <cuda_runtime.h>
+#include <string>
+
+// Given a local CUDA data pointer, return the peer memory pointers group.
+// For the i-th pointer, if i is the worker id of the given device,
+// then the returned i-th ptr_group is the local pointer,
+// or otherwise it is an peer memory pointer from the remote device.
+std::vector<void *> create_peer_ptr_group(int num_devices,
+                                          int device_id,
+                                          ncclComm_t ncclComm,
+                                          void *allgather_src,
+                                          void *allgather_dst,
+                                          void *local_ptr,
+                                          cudaStream_t stream) {
+  // Ensure we are on the correct device
+  int device = 0;
+  checkCUDA(cudaGetDevice(&device));
+  assert(device == device_id && "Device ID does not match current device.");
+
+  // Next we all-gather the peer memory pointers across all distributed workers.
+  // On each worker, we copy the peer pointers to GPU memory. And nccl AllGather
+  // is used to all-gather the pointers. Finally the all-gathered pointers
+  // on each worker are copied from GPU to CPU.
+
+  checkCUDA(cudaMemcpyAsync(allgather_src,
+                            &local_ptr,
+                            sizeof(void *),
+                            cudaMemcpyHostToDevice,
+                            stream));
+
+  checkNCCL(ncclAllGather(allgather_src,
+                          allgather_dst,
+                          sizeof(void *),
+                          ncclChar,
+                          ncclComm,
+                          stream));
+
+  std::vector<void *> peer_pointers(num_devices);
+  checkCUDA(cudaMemcpyAsync(peer_pointers.data(),
+                            allgather_dst,
+                            sizeof(void *) * num_devices,
+                            cudaMemcpyDeviceToHost,
+                            stream));
+  checkCUDA(cudaStreamSynchronize(stream));
+
+  return peer_pointers;
+}
+
+// Free the peer memory pointers group.
+void free_peer_ptr_group(std::vector<void *> ptr_group,
+                         int device_id,
+                         bool free_local) {
+  for (int i = 0; i < static_cast<int>(ptr_group.size()); ++i) {
+    if (i == device_id && free_local) {
+      // Free the local buffer.
+      checkCUDA(cudaFree(ptr_group[i]));
+    }
+    // No need to do anything for other devices.
+  }
+}
+
+// Given a local CUDA data pointer, return the CommunicationBuffer of the
+// pointer. The CommunicationBuffer contains the local pointer and the peer
+// memory pointers group. It contains the barrier helpers for synchronization
+// across distributed workers, which should also be peer-based. The
+// allgather_src and allgather_dst are device buffers, which are used for
+// all-gathering peer pointers across devices. The size of allgather_src should
+// be sizeof(void*), and the size of allgather_dst should be sizeof(void*) *
+// num_devices.
+CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+                                                    int device_id,
+                                                    ncclComm_t ncclComm,
+                                                    void *allgather_src,
+                                                    void *allgather_dst,
+                                                    void *local_ptr,
+                                                    void *barrier_in_ptr,
+                                                    void *barrier_out_ptr,
+                                                    int *barrier_flag,
+                                                    cudaStream_t stream) {
+  assert(local_ptr != nullptr && "Local pointer is nullptr.");
+  CommunicationBuffer *comm_buf = new CommunicationBuffer();
+  comm_buf->num_devices = num_devices;
+  comm_buf->device_id = device_id;
+  comm_buf->local_ptr = local_ptr;
+  comm_buf->comm_ptrs = create_peer_ptr_group(num_devices,
+                                              device_id,
+                                              ncclComm,
+                                              allgather_src,
+                                              allgather_dst,
+                                              local_ptr,
+                                              stream);
+  comm_buf->barrier_in = create_peer_ptr_group(num_devices,
+                                               device_id,
+                                               ncclComm,
+                                               allgather_src,
+                                               allgather_dst,
+                                               barrier_in_ptr,
+                                               stream);
+  comm_buf->barrier_out = create_peer_ptr_group(num_devices,
+                                                device_id,
+                                                ncclComm,
+                                                allgather_src,
+                                                allgather_dst,
+                                                barrier_out_ptr,
+                                                stream);
+  comm_buf->barrier_flag = barrier_flag;
+
+  return comm_buf;
+}
+
+// Release the CommunicationBuffer.
+void release_comm_buf(CommunicationBuffer *comm_buf) {
+  free_peer_ptr_group(comm_buf->comm_ptrs, comm_buf->device_id, false);
+  free_peer_ptr_group(comm_buf->barrier_in, comm_buf->device_id, false);
+  free_peer_ptr_group(comm_buf->barrier_out, comm_buf->device_id, false);
+  delete comm_buf;
+}

From b0e6da286ab30f8f9f9aa2332ace93c5d2e5b8fa Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 17 Aug 2024 12:56:45 -0700
Subject: [PATCH 383/667] feat: support laama-2 architechture

---
 inference/models/llama.cc | 77 ++++++++++++++++++++-------------------
 inference/models/llama.h  | 11 +++++-
 2 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 94f58b0ef..3165b3920 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -94,10 +94,11 @@ void LLAMA::create_llama_model(FFModel &ff,
     Tensor mha;
     switch (mode) {
       case TREE_SEARCH_MODE: {
-        mha = ff.spec_inc_multihead_self_attention(
+        mha = ff.spec_inc_multiquery_self_attention(
             att_norm,
             llama_config.hidden_size,
             llama_config.num_attention_heads,
+            llama_config.num_key_value_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             0.0f,    /*dropout*/
@@ -117,10 +118,11 @@ void LLAMA::create_llama_model(FFModel &ff,
         break;
       }
       case TREE_VERIFY_MODE: {
-        mha = ff.inc_multihead_self_attention_verify(
+        mha = ff.inc_multiquery_self_attention_verify(
             att_norm,
             llama_config.hidden_size,
             llama_config.num_attention_heads,
+            llama_config.num_key_value_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             0.0f,    /*dropout*/
@@ -140,10 +142,11 @@ void LLAMA::create_llama_model(FFModel &ff,
         break;
       }
       case INC_DECODING_MODE: {
-        mha = ff.inc_multihead_self_attention(
+        mha = ff.inc_multiquery_self_attention(
             att_norm,
             llama_config.hidden_size,
             llama_config.num_attention_heads,
+            llama_config.num_key_value_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             0.0f,    /*dropout*/
@@ -180,47 +183,47 @@ void LLAMA::create_llama_model(FFModel &ff,
     token = token_ff_norm[0];
     Tensor ff_norm = token_ff_norm[1];
 
-    Tensor w1 =
-        ff.dense(ff_norm,
-                 llama_config.intermediate_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
+    Tensor w1 = ff.dense(
+        ff_norm,
+        llama_config.intermediate_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w1")
                      .c_str());
 
-    Tensor w3 =
-        ff.dense(ff_norm,
-                 llama_config.intermediate_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
+    Tensor w3 = ff.dense(
+        ff_norm,
+        llama_config.intermediate_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w3")
                      .c_str());
 
     Tensor multi = ff.sigmoid_silu_multi(w1, w3);
 
-    w2 =
-        ff.dense(multi,
-                 llama_config.hidden_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
+    w2 = ff.dense(
+        multi,
+        llama_config.hidden_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w2")
                      .c_str());
   }
@@ -275,7 +278,7 @@ void LLAMA::create_llama_model(FFModel &ff,
       "",
       weight_file_path,
       llama_config.num_attention_heads,
-      llama_config.num_attention_heads,
+      llama_config.num_key_value_heads,
       llama_config.hidden_size,
       llama_config.hidden_size / llama_config.num_attention_heads,
       ff.config.tensor_parallelism_degree,
diff --git a/inference/models/llama.h b/inference/models/llama.h
index 7a7440982..1a6a9114e 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -36,6 +36,11 @@ class LLAMA {
           num_hidden_layers = model_config["num_hidden_layers"];
           vocab_size = model_config["vocab_size"];
           num_attention_heads = model_config["num_attention_heads"];
+          if (model_config.find("num_key_value_heads") != model_config.end()) {
+            num_key_value_heads = model_config["num_key_value_heads"];
+          } else {
+            num_key_value_heads = num_attention_heads;
+          }
           hidden_size = model_config["hidden_size"];
           rms_norm_eps = model_config["rms_norm_eps"];
           intermediate_size = model_config["intermediate_size"];
@@ -58,6 +63,8 @@ class LLAMA {
       std::cout << "\tvocab_size: " << vocab_size << std::endl;
       std::cout << "\tnum_attention_heads: " << num_attention_heads
                 << std::endl;
+      std::cout << "\tnum_key_value_heads: " << num_key_value_heads
+                << std::endl;
       std::cout << "\thidden_size: " << hidden_size << std::endl;
       std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl;
       std::cout << "\tintermediate_size: " << intermediate_size << std::endl;
@@ -69,8 +76,8 @@ class LLAMA {
 
     // int max_seq_len, max_num_tokens;
     int k_of_arg_topk;
-    int num_hidden_layers, vocab_size, num_attention_heads, hidden_size,
-        intermediate_size;
+    int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads,
+        hidden_size, intermediate_size;
     float rms_norm_eps;
   };
 

From fdd6a6119972a0af01059bdbbe4623acd5a72fc9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 19 Aug 2024 18:50:29 -0700
Subject: [PATCH 384/667] chore: minor rename

---
 include/flexflow/flexflow_c.h             | 2 +-
 include/flexflow/model.h                  | 2 +-
 inference/models/falcon.cc                | 2 +-
 inference/models/llama.cc                 | 2 +-
 inference/models/starcoder.cc             | 2 +-
 python/flexflow/core/flexflow_cffi.py     | 4 ++--
 python/flexflow/serve/models/falcon.py    | 2 +-
 python/flexflow/serve/models/llama.py     | 2 +-
 python/flexflow/serve/models/starcoder.py | 2 +-
 src/c/flexflow_c.cc                       | 4 ++--
 src/ops/inc_multihead_self_attention.cc   | 4 ++--
 11 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 0a6eebb18..e7e9f87a5 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -490,7 +490,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
     bool position_bias,
     char const *name);
 
-flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
+flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     flexflow_model_t handle_,
     flexflow_tensor_t const input_,
     int embed_dim,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 2f2706693..8c8b90ef8 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -761,7 +761,7 @@ class FFModel {
       bool qk_prod_scaling = true,
       bool position_bias = false,
       char const *name = NULL);
-  Tensor inc_multiquery_self_attention(Tensor const input,
+  Tensor groupquery_self_attention(Tensor const input,
                                        int embed_dim,
                                        int num_q_heads,
                                        int num_kv_heads,
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index d59274727..a9805bf8e 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -148,7 +148,7 @@ void FALCON::create_falcon_model(FFModel &ff,
       }
 
       case INC_DECODING_MODE: {
-        mha = ff.inc_multiquery_self_attention(
+        mha = ff.groupquery_self_attention(
             att_norm,
             falcon_config.hidden_size,
             falcon_config.n_head,
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 3165b3920..a18a909f9 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -142,7 +142,7 @@ void LLAMA::create_llama_model(FFModel &ff,
         break;
       }
       case INC_DECODING_MODE: {
-        mha = ff.inc_multiquery_self_attention(
+        mha = ff.groupquery_self_attention(
             att_norm,
             llama_config.hidden_size,
             llama_config.num_attention_heads,
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 3bc150865..8251ef71c 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -104,7 +104,7 @@ void STARCODER::create_starcoder_model(
     Tensor mha;
     switch (mode) {
       case INC_DECODING_MODE: {
-        mha = ff.inc_multiquery_self_attention(
+        mha = ff.groupquery_self_attention(
             ln_1,
             startcoder_config.hidden_size,
             startcoder_config.num_attention_heads,
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 14cf4eebf..5a09241d9 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -2972,7 +2972,7 @@ def inc_multihead_self_attention_verify(
         self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name)
         return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION)
 
-    def inc_multiquery_self_attention(
+    def groupquery_self_attention(
         self,
         input,
         embed_dim,
@@ -3056,7 +3056,7 @@ def inc_multiquery_self_attention(
         c_name = get_c_name(name)
         kernel_init_handle = self.__get_initializer_handle(kernel_initializer)
         c_data_type = enum_to_int(DataType, data_type)
-        handle = ffc().flexflow_model_add_inc_multiquery_self_attention(
+        handle = ffc().flexflow_model_add_groupquery_self_attention(
             self.handle,
             input.handle,
             embed_dim,
diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py
index 7a55da26e..7d8091726 100644
--- a/python/flexflow/serve/models/falcon.py
+++ b/python/flexflow/serve/models/falcon.py
@@ -173,7 +173,7 @@ def build_model(self, max_tokens_per_batch):
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
-                mha = ffmodel.inc_multiquery_self_attention(
+                mha = ffmodel.groupquery_self_attention(
                     att_norm,
                     self.falcon_config.hidden_size,
                     self.falcon_config.n_head,
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index 6b33030f6..f1a5d4abf 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -167,7 +167,7 @@ def build_model(self, max_tokens_per_batch):
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
-                mha = ffmodel.inc_multiquery_self_attention(
+                mha = ffmodel.groupquery_self_attention(
                     attn_norm,
                     self.llama_config.hidden_size,
                     self.llama_config.num_attention_heads,
diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py
index 37edaa4c4..dee5a5a2d 100644
--- a/python/flexflow/serve/models/starcoder.py
+++ b/python/flexflow/serve/models/starcoder.py
@@ -143,7 +143,7 @@ def build_model(self, max_tokens_per_batch):
             )
 
             assert self.mode == InferenceMode.INC_DECODING_MODE
-            mha = ffmodel.inc_multiquery_self_attention(
+            mha = ffmodel.groupquery_self_attention(
                 ln_1,
                 self.starcoder_config.hidden_size,
                 self.starcoder_config.num_attention_heads,
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index d6a34fa8f..078a2afbd 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1314,7 +1314,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
   return FFCObjectWrapper::wrap(tensor);
 }
 
-flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
+flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     flexflow_model_t handle_,
     flexflow_tensor_t const input_,
     int embed_dim,
@@ -1338,7 +1338,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
-  Tensor tensor = handle->inc_multiquery_self_attention(input,
+  Tensor tensor = handle->groupquery_self_attention(input,
                                                         embed_dim,
                                                         num_q_heads,
                                                         num_kv_heads,
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index d7dd96eb0..425d9618f 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -71,7 +71,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                              bool qk_prod_scaling,
                                              bool position_bias,
                                              char const *name) {
-  return inc_multiquery_self_attention(input,
+  return groupquery_self_attention(input,
                                        embed_dim,
                                        num_heads,
                                        num_heads,
@@ -91,7 +91,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                        name);
 }
 
-Tensor FFModel::inc_multiquery_self_attention(const Tensor input,
+Tensor FFModel::groupquery_self_attention(const Tensor input,
                                               int embed_dim,
                                               int num_q_heads,
                                               int num_kv_heads,

From 90c0d40f1f482d2ca5c1367a956519c18a0382e6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 19 Aug 2024 18:50:47 -0700
Subject: [PATCH 385/667] chore: comment out debug output

---
 src/ops/inc_multihead_self_attention.cu | 64 ++++++++++++-------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 7f247e18c..35fc961e8 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -858,10 +858,10 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
 
-  cudaEvent_t t_start, t_end;
-  cudaEventCreate(&t_start);
-  cudaEventCreate(&t_end);
-  cudaEventRecord(t_start, stream);
+  // cudaEvent_t t_start, t_end;
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
 
   if (m->offload && m->biasSize > 0) {
     cudaMemcpyAsync(
@@ -880,17 +880,17 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      stream);
   update_kv_cache_kernel<DT>(m, bc, stream);
 
-  cudaEventRecord(t_end, stream);
-  checkCUDA(cudaEventSynchronize(t_end));
-  float elapsed = 0;
-  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  cudaEventDestroy(t_start);
-  cudaEventDestroy(t_end);
-  std::cout << "Prepare attn time: " << elapsed << " ms\n";
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // float elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+  // std::cout << "Prepare attn time: " << elapsed << " ms\n";
 
-  cudaEventCreate(&t_start);
-  cudaEventCreate(&t_end);
-  cudaEventRecord(t_start, stream);
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
 
   if (bc->prompt_phase) {
     // phase 3: Compute attention score for prompt tokens;
@@ -902,13 +902,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
         m, bc, static_cast<DT *>(m->attn_heads), stream);
   }
 
-  cudaEventRecord(t_end, stream);
-  checkCUDA(cudaEventSynchronize(t_end));
-  elapsed = 0;
-  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  cudaEventDestroy(t_start);
-  cudaEventDestroy(t_end);
-  std::cout << "Attn time: " << elapsed << " ms\n";
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+  // std::cout << "Attn time: " << elapsed << " ms\n";
 
   // Debug output:
   //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
@@ -1242,10 +1242,10 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
   checkCUDA(get_legion_stream(&stream));
   bool use_bias = *m->qkv_bias || *m->final_bias;
 
-  cudaEvent_t t_start, t_end;
-  cudaEventCreate(&t_start);
-  cudaEventCreate(&t_end);
-  cudaEventRecord(t_start, stream);
+  // cudaEvent_t t_start, t_end;
+  // cudaEventCreate(&t_start);
+  // cudaEventCreate(&t_end);
+  // cudaEventRecord(t_start, stream);
 
   // assert(input.data_type == weight.data_type);
   assert(input.data_type == output.data_type);
@@ -1288,13 +1288,13 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
     assert(false && "Unspported data type");
   }
 
-  cudaEventRecord(t_end, stream);
-  checkCUDA(cudaEventSynchronize(t_end));
-  float elapsed = 0;
-  checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  cudaEventDestroy(t_start);
-  cudaEventDestroy(t_end);
-  printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed);
+  // cudaEventRecord(t_end, stream);
+  // checkCUDA(cudaEventSynchronize(t_end));
+  // float elapsed = 0;
+  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  // cudaEventDestroy(t_start);
+  // cudaEventDestroy(t_end);
+  // printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed);
 }
 
 IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(

From f5a2b1a22899bfc708869caf4ee732dd1c348703 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 19 Aug 2024 23:00:50 -0700
Subject: [PATCH 386/667] fix: temporarily support GQA (which is downgrade into
 MHA)

---
 src/ops/spec_inc_multihead_self_attention.cu | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 src/runtime/request_manager.cu               | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 0e489fa07..21d791f83 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -192,7 +192,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-      num_kv_heads,
+      num_q_heads,
       kPagesize,
       head_dim,
       batch_size,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1ca94692a..31451201b 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -275,7 +275,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-      num_kv_heads,
+      num_q_heads,
       kPagesize,
       head_dim,
       batch_size,
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 75c2051b5..9a6394eca 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -366,7 +366,7 @@ void RequestManager::load_batch_config_task(
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
             handle.tree_search_attention_metadata->num_q_heads(),
-            handle.tree_search_attention_metadata->num_kv_heads(),
+            handle.tree_search_attention_metadata->num_q_heads(),
             handle.tree_search_attention_metadata->head_dim(),
             kPagesize);
       }
@@ -532,7 +532,7 @@ void RequestManager::load_batch_config_task(
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
             handle.tree_verify_attention_metadata->num_q_heads(),
-            handle.tree_verify_attention_metadata->num_kv_heads(),
+            handle.tree_verify_attention_metadata->num_q_heads(),
             handle.tree_verify_attention_metadata->head_dim(),
             kPagesize);
       }

From 16db39c0bb0c9e938c1d15a09081ed975fe8f443 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 11:54:10 -0700
Subject: [PATCH 387/667] chore: minor rename

---
 include/flexflow/flexflow_c.h         |  2 +-
 include/flexflow/utils/file_loader.h  |  4 ++--
 python/flexflow/core/flexflow_cffi.py |  4 ++--
 src/c/flexflow_c.cc                   |  4 ++--
 src/runtime/file_loader.cc            | 30 +++++++++++++--------------
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index e7e9f87a5..ddf9c7e8a 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -1027,7 +1027,7 @@ flexflow_file_data_loader_t
                                      int num_q_heads,
                                      int num_kv_heads,
                                      int hidden_dim,
-                                     int qkv_inner_dim,
+                                     int head_dim,
                                      int tensor_parallelism_degree,
                                      bool use_full_precision);
 
diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h
index 646eb18da..8861cfc48 100644
--- a/include/flexflow/utils/file_loader.h
+++ b/include/flexflow/utils/file_loader.h
@@ -29,7 +29,7 @@ class FileDataLoader {
                  int _num_heads,
                  int _num_kv_heads,
                  size_t _hidden_dim,
-                 size_t _qkv_inner_dim,
+                 size_t _head_dim,
                  int _tensor_parallelism_degree,
                  bool _use_full_precision);
 
@@ -49,7 +49,7 @@ class FileDataLoader {
 
 private:
   int num_heads, num_kv_heads, tensor_parallelism_degree;
-  size_t hidden_dim, qkv_inner_dim;
+  size_t hidden_dim, head_dim;
   std::string prompts_filepath;
   std::string weights_folder;
   bool use_full_precision;
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 5a09241d9..403f2cba5 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -4257,7 +4257,7 @@ def __init__(
         num_q_heads,
         num_kv_heads,
         hidden_dim,
-        qkv_inner_dim,
+        head_dim,
         tensor_parallelism_degree,
         use_full_precision
     ):
@@ -4267,7 +4267,7 @@ def __init__(
             num_q_heads,
             num_kv_heads,
             hidden_dim,
-            qkv_inner_dim,
+            head_dim,
             tensor_parallelism_degree,
             use_full_precision
         )
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 078a2afbd..e371a0cdf 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2698,7 +2698,7 @@ flexflow_file_data_loader_t
                                      int num_q_heads,
                                      int num_kv_heads,
                                      int hidden_dim,
-                                     int qkv_inner_dim,
+                                     int head_dim,
                                      int tensor_parallelism_degree,
                                      bool use_full_precision) {
   assert(weight_file_path != nullptr &&
@@ -2709,7 +2709,7 @@ flexflow_file_data_loader_t
                                               num_q_heads,
                                               num_kv_heads,
                                               hidden_dim,
-                                              qkv_inner_dim,
+                                              head_dim,
                                               tensor_parallelism_degree,
                                               use_full_precision);
   DEBUG_PRINT("[FileDataLoader] new %p", handle);
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 43ce9d700..9074ac3df 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
-#include "flexflow/utils/file_loader.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
+#include "flexflow/utils/file_loader.h"
 
 #include <vector>
 using namespace std;
@@ -27,12 +27,12 @@ FileDataLoader::FileDataLoader(std::string _prompts_filepath,
                                int _num_heads,
                                int _num_kv_heads,
                                size_t _hidden_dim,
-                               size_t _qkv_inner_dim,
+                               size_t _head_dim,
                                int _tensor_parallelism_degree,
                                bool _use_full_precision)
     : prompts_filepath(_prompts_filepath), weights_folder(_weights_folder),
       num_heads(_num_heads), num_kv_heads(_num_kv_heads),
-      hidden_dim(_hidden_dim), qkv_inner_dim(_qkv_inner_dim),
+      hidden_dim(_hidden_dim), head_dim(_head_dim),
       tensor_parallelism_degree(_tensor_parallelism_degree),
       use_full_precision(_use_full_precision){};
 
@@ -132,7 +132,7 @@ void load_attention_bias_v2(DT *ptr,
                             int num_heads,
                             int num_kv_heads,
                             size_t hidden_dim,
-                            size_t qkv_inner_dim,
+                            size_t head_dim,
                             bool final_bias,
                             std::string layer_name,
                             std::string weights_folder) {
@@ -159,8 +159,8 @@ void load_attention_bias_v2(DT *ptr,
 
     int replicate_num = num_heads / num_kv_heads;
 
-    size_t qkv_partial_size = qkv_inner_dim * n_heads;
-    size_t qkv_replicate_size = qkv_inner_dim * num_heads;
+    size_t qkv_partial_size = head_dim * n_heads;
+    size_t qkv_replicate_size = head_dim * num_heads;
     size_t out_partial_size = hidden_dim;
     size_t partial_size =
         (file_index < 3) ? qkv_partial_size : out_partial_size;
@@ -212,7 +212,7 @@ void load_attention_weights_v2(DT *ptr,
                                int num_heads,
                                int num_kv_heads,
                                size_t hidden_dim,
-                               size_t qkv_inner_dim,
+                               size_t head_dim,
                                std::string layer_name,
                                std::string weights_folder,
                                size_t volume,
@@ -229,7 +229,7 @@ void load_attention_weights_v2(DT *ptr,
   int base_index = 0;
   size_t single_proj_size =
       hidden_dim *
-      qkv_inner_dim; // size of each of Q,K,V,O weights for a single head
+      head_dim; // size of each of Q,K,V,O weights for a single head
   size_t one_weight_file_size =
       num_heads * single_proj_size; // size of each of Q/K/V/O for all heads
 
@@ -324,7 +324,7 @@ void load_attention_weights_v2(DT *ptr,
     int data_index = 0;
 
     int one_partition_size =
-        qkv_inner_dim * (num_heads / tensor_parallelism_degree);
+        head_dim * (num_heads / tensor_parallelism_degree);
     for (int i = 0; i < one_weight_file_size; i++) {
       int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
       int block_num = (i / one_partition_size);
@@ -402,7 +402,7 @@ void FileDataLoader::load_positions(FFModel *ff,
 void load_attention_weights_quantized(char *ptr,
                                       int num_heads,
                                       size_t hidden_dim,
-                                      size_t qkv_inner_dim,
+                                      size_t head_dim,
                                       std::string layer_name,
                                       std::string weights_folder,
                                       DataType data_type,
@@ -419,7 +419,7 @@ void load_attention_weights_quantized(char *ptr,
 
   size_t single_proj_size =
       hidden_dim *
-      qkv_inner_dim; // size of each of Q,K,V,O weights for a single head
+      head_dim; // size of each of Q,K,V,O weights for a single head
   size_t one_weight_file_size =
       num_heads * single_proj_size; // size of each of Q/K/V/O for all heads
 
@@ -671,7 +671,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
       load_attention_weights_quantized(data,
                                        num_heads,
                                        hidden_dim,
-                                       qkv_inner_dim,
+                                       head_dim,
                                        weight_filename,
                                        weights_folder,
                                        weight->data_type,
@@ -681,7 +681,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
     //   load_attention_bias_quantized(data,
     //                                 num_heads,
     //                                 hidden_dim,
-    //                                 qkv_inner_dim,
+    //                                 head_dim,
     //                                 weight_filename,
     //                                 weights_folder);
     // }
@@ -745,7 +745,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
                                     num_heads,
                                     num_kv_heads,
                                     hidden_dim,
-                                    qkv_inner_dim,
+                                    head_dim,
                                     weight_filename,
                                     weights_folder,
                                     volume,
@@ -758,7 +758,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
                                  num_heads,
                                  num_kv_heads,
                                  hidden_dim,
-                                 qkv_inner_dim,
+                                 head_dim,
                                  final_bias,
                                  weight_filename,
                                  weights_folder);

From f78dfd722860aa4ed80584f513d9617bbcb72894 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 13:08:26 -0700
Subject: [PATCH 388/667] chore: minor update

---
 src/runtime/file_loader.cc | 99 ++++++++++++--------------------------
 1 file changed, 32 insertions(+), 67 deletions(-)

diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 9074ac3df..b11f13b35 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -223,65 +223,72 @@ void load_attention_weights_v2(DT *ptr,
   std::string k_file = layer_name + "_wk_weight";
   std::string v_file = layer_name + "_wv_weight";
   std::string o_file = layer_name + "_wo_weight";
-  std::vector<std::string> weight_filenames = {q_file, k_file, v_file};
+  std::vector<std::string> weight_filenames = {q_file, k_file, v_file, o_file};
   int file_index = 0;
 
   int base_index = 0;
   size_t single_proj_size =
       hidden_dim *
       head_dim; // size of each of Q,K,V,O weights for a single head
-  size_t one_weight_file_size =
-      num_heads * single_proj_size; // size of each of Q/K/V/O for all heads
-
-  size_t q_size = one_weight_file_size, o_size = one_weight_file_size;
-  size_t k_size = single_proj_size * num_kv_heads,
-         v_size = single_proj_size * num_kv_heads;
-
-  size_t k_replicate_size = one_weight_file_size;
-  size_t v_replicate_size = one_weight_file_size;
 
+  size_t qo_size = single_proj_size * num_heads,
+         kv_size = single_proj_size * num_kv_heads;
+  size_t kv_replicate_size = qo_size;
   int replicate_num = num_heads / num_kv_heads;
-
   // stride for q, k, v, o
-  size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) /
-                       tensor_parallelism_degree;
+  size_t stride_size =
+      (qo_size + kv_replicate_size * 2 + qo_size) / tensor_parallelism_degree;
+
   for (auto filename : weight_filenames) {
     std::cout << "Loading weight file " << filename << std::endl;
     std::string weight_filepath = join_path({weights_folder, filename});
-
-    int data_index = 0;
-    size_t partial_size = (file_index == 0 || file_index == 3)
-                              ? one_weight_file_size
-                              : single_proj_size * num_kv_heads;
-    size_t one_partition_size =
-        one_weight_file_size / tensor_parallelism_degree;
+    size_t file_size = (file_index == 0 || file_index == 3) ? qo_size : kv_size;
 
     std::ifstream in(weight_filepath, std::ios::in | std::ios::binary);
     if (!in.good()) {
       std::cout << "Could not open file: " << weight_filepath << std::endl;
     }
     assert(in.good() && "incorrect weight file path");
-    std::vector<DT> host_array(partial_size);
-    size_t loaded_data_size = sizeof(DT) * partial_size;
+    std::vector<DT> host_array(file_size);
+    size_t loaded_data_size = sizeof(DT) * file_size;
     in.seekg(0, in.end);
     in.seekg(0, in.beg);
     in.read((char *)host_array.data(), loaded_data_size);
     size_t in_get_size = in.gcount();
-
     if (in_get_size != loaded_data_size) {
       std::cout << "load attention data error " << in_get_size << ", "
                 << loaded_data_size << ", " << file_index << ", "
                 << weight_filepath << "\n";
       assert(false && "data size mismatch");
     }
-    // wq, wk, wo
+
     if (file_index == 0) {
+      // q, o
+      int one_partition_size = qo_size / tensor_parallelism_degree;
+      int data_index = 0;
       for (int i = 0; i < tensor_parallelism_degree; i++) {
         for (int j = 0; j < one_partition_size; j++) {
           ptr[base_index + i * stride_size + j] = host_array.at(data_index++);
         }
       }
+      base_index += one_partition_size;
+    } else if (file_index == 3) {
+      // o
+      int one_partition_size =
+          head_dim * (num_heads / tensor_parallelism_degree);
+      int data_index = 0;
+      for (int i = 0; i < qo_size; i++) {
+        int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
+        int block_num = (i / one_partition_size);
+        int offset =
+            block_num / tensor_parallelism_degree * one_partition_size +
+            (i % one_partition_size);
+        ptr[base_index + part_idx * stride_size + offset] =
+            host_array.at(data_index++);
+      }
+      base_index += one_partition_size;
     } else {
+      // k, v
       for (int i = 0; i < num_heads; i++) {
         int kv_idx = i / (num_heads / num_kv_heads);
         int head_idx = i % (num_heads / tensor_parallelism_degree);
@@ -291,53 +298,11 @@ void load_attention_weights_v2(DT *ptr,
               j] = host_array.at(kv_idx * single_proj_size + j);
         }
       }
+      base_index += kv_replicate_size / tensor_parallelism_degree;
     }
 
-    // assert(data_index == partial_size);
-    base_index += one_partition_size;
     file_index++;
   }
-  assert(base_index == (q_size + k_replicate_size + v_replicate_size) /
-                           tensor_parallelism_degree);
-
-  {
-    std::cout << "Loading weight file " << o_file << std::endl;
-    std::string weight_filepath = join_path({weights_folder, o_file});
-
-    std::ifstream in(weight_filepath, std::ios::in | std::ios::binary);
-    if (!in.good()) {
-      std::cout << "Could not open file: " << weight_filepath << std::endl;
-    }
-    assert(in.good() && "incorrect weight file path");
-    std::vector<DT> host_array(one_weight_file_size);
-    size_t loaded_data_size = sizeof(DT) * one_weight_file_size;
-    in.seekg(0, in.end);
-    in.seekg(0, in.beg);
-    in.read((char *)host_array.data(), loaded_data_size);
-    size_t in_get_size = in.gcount();
-
-    if (in_get_size != loaded_data_size) {
-      std::cout << "load data error" << std::endl;
-      assert(false);
-    }
-    assert(one_weight_file_size == host_array.size());
-    int data_index = 0;
-
-    int one_partition_size =
-        head_dim * (num_heads / tensor_parallelism_degree);
-    for (int i = 0; i < one_weight_file_size; i++) {
-      int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
-      int block_num = (i / one_partition_size);
-      int offset = block_num / tensor_parallelism_degree * one_partition_size +
-                   (i % one_partition_size);
-      ptr[base_index + part_idx * stride_size + offset] =
-          host_array.at(data_index++);
-    }
-
-    in.close();
-
-    assert(data_index == one_weight_file_size);
-  }
 }
 
 template <typename DT>

From a6751d9f97da339fa25a8a792e171977dfe3f038 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 13:10:07 -0700
Subject: [PATCH 389/667] chore: minor rename

---
 src/ops/{multihead_self_attention_impl.cu => attention_impl.cu} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/ops/{multihead_self_attention_impl.cu => attention_impl.cu} (100%)

diff --git a/src/ops/multihead_self_attention_impl.cu b/src/ops/attention_impl.cu
similarity index 100%
rename from src/ops/multihead_self_attention_impl.cu
rename to src/ops/attention_impl.cu

From 24f4f39383955dc54f42c4f04418dbdba98580b5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 15:41:13 -0700
Subject: [PATCH 390/667] feat: incr_decode switch to flashinfer-based
 implementation

---
 include/flexflow/config.h                     |  25 +
 .../ops/inc_multihead_self_attention.h        |   2 +-
 .../inc_multihead_self_attention_kernels.h    |   6 -
 src/ops/attention_impl.cu                     |  86 ++
 src/ops/fused.cu                              |   2 +-
 src/ops/inc_multihead_self_attention.cu       | 900 ++++++------------
 src/ops/spec_inc_multihead_self_attention.cu  |  24 -
 src/ops/tree_inc_multihead_self_attention.cu  |  24 -
 src/runtime/model.cu                          |  15 +-
 src/runtime/request_manager.cu                | 135 ++-
 10 files changed, 551 insertions(+), 668 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 6b3a5c08c..ba239237e 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -67,6 +67,30 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
 class FFConfig;
 
 constexpr uint32_t kPagesize = 64;
+#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
+  switch (head_dim) {                                                          \
+    case 64: {                                                                 \
+      constexpr size_t HEAD_DIM = 64;                                          \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 128: {                                                                \
+      constexpr size_t HEAD_DIM = 128;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 256: {                                                                \
+      constexpr size_t HEAD_DIM = 256;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    default: {                                                                 \
+      std::ostringstream err_msg;                                              \
+      err_msg << "Unsupported head_dim: " << head_dim;                         \
+      throw std::invalid_argument(err_msg.str());                              \
+    }                                                                          \
+  }
+
 class AttentionMetaData {
 public:
   AttentionMetaData() {
@@ -223,6 +247,7 @@ struct FFHandler {
   void *workSpace;
   size_t workSpaceSize;
   void *batch_config_metadata;
+  AttentionMetaData *incr_attention_metadata;
   AttentionMetaData *tree_search_attention_metadata;
   AttentionMetaData *tree_verify_attention_metadata;
 
diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 6dd43e333..2db4e2e96 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -113,7 +113,7 @@ class IncMultiHeadSelfAttention : public Op {
                              MachineView const &mv,
                              CostMetrics &cost_metrics) const override;
 
-  static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m,
+  static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m,
                                        BatchConfig const *bc,
                                        int shard_id,
                                        GenericTensorAccessorR const &input,
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 9bf2f581e..27f448a4d 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -14,12 +14,6 @@ namespace FlexFlow {
 namespace Kernels {
 namespace IncMultiHeadAttention {
 
-template <typename DT>
-void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
-                                         BatchConfig const *bc,
-                                         DT *output_ptr,
-                                         ffStream_t stream);
-
 template <typename DT>
 void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
diff --git a/src/ops/attention_impl.cu b/src/ops/attention_impl.cu
index 33be22e50..4ee9528d4 100644
--- a/src/ops/attention_impl.cu
+++ b/src/ops/attention_impl.cu
@@ -497,4 +497,90 @@ template cudaError_t
         float rope_scale,
         float rope_theta,
         cudaStream_t stream);
+
+/********** batch decode instantiations for half precision **********/
+template cudaError_t
+    BatchDecodeWithPagedKVCacheDispatched<64,
+                                          PageStorage::kIndices,
+                                          LogitsPostHook::kNone,
+                                          QKVLayout::kNHD,
+                                          PosEncodingMode::kNone,
+                                          half,
+                                          half,
+                                          half,
+                                          int32_t>(
+        half *q,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        kv_partition_info_t<int32_t> kv_partition_info,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        bool *block_valid_mask,
+        uint32_t padded_batch_size,
+        uint32_t num_qo_heads,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchDecodeWithPagedKVCacheDispatched<128,
+                                          PageStorage::kIndices,
+                                          LogitsPostHook::kNone,
+                                          QKVLayout::kNHD,
+                                          PosEncodingMode::kNone,
+                                          half,
+                                          half,
+                                          half,
+                                          int32_t>(
+        half *q,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        kv_partition_info_t<int32_t> kv_partition_info,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        bool *block_valid_mask,
+        uint32_t padded_batch_size,
+        uint32_t num_qo_heads,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchDecodeWithPagedKVCacheDispatched<256,
+                                          PageStorage::kIndices,
+                                          LogitsPostHook::kNone,
+                                          QKVLayout::kNHD,
+                                          PosEncodingMode::kNone,
+                                          half,
+                                          half,
+                                          half,
+                                          int32_t>(
+        half *q,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
+            paged_kv,
+        kv_partition_info_t<int32_t> kv_partition_info,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        bool *block_valid_mask,
+        uint32_t padded_batch_size,
+        uint32_t num_qo_heads,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
 } // namespace flashinfer
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index bb0fdff8c..434b91012 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -908,7 +908,7 @@ __host__ void
           case OP_INC_MULTIHEAD_SELF_ATTENTION: {
             assert(fused->op_num_inputs[op] == 1);
             assert(fused->op_num_outputs[op] == 1);
-            IncMultiHeadSelfAttentionMeta const *m =
+            IncMultiHeadSelfAttentionMeta *m =
                 (IncMultiHeadSelfAttentionMeta *)metas->meta[op];
             assert(fused->op_num_weights[op] ==
                    (1 + (int)(*m->qkv_bias || *m->final_bias)));
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 35fc961e8..5d3c2ebd6 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -15,6 +15,8 @@
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
+#include "flashinfer/decode_attention_decl.cuh"
+#include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/kernels/decompress_kernels.h"
@@ -33,271 +35,100 @@ using Legion::Memory;
 namespace Kernels {
 namespace IncMultiHeadAttention {
 
-// gridDim = num_heads
-// blockDim = num_tokens/num_request * head_size
-// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads|
-// one thread process one head_size
-template <typename DT,
-          int THREADS_PER_BLOCK,
-          int Dh,
-          int Dh_MAX,
-          int THREADS_PER_KEY,
-          int THREADS_PER_VALUE>
-__global__ void compute_attention_kernel_generation_kernel(
-    DT const *query,
-    DT const *key_cache,
-    DT const *value_cache,
-    DT *output_ptr,
-    float const scale,
-    int max_seq_length,
-    int per_head_size,
-    int hidden_size,
-    /* Reserved: BatchConfig Updated */
-    BatchConfig::PerRequestInfo *request_infos,
-    bool *request_available) {
-
-  // q, k
-  using Q_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using K_vec = typename VEC_K<DT, THREADS_PER_KEY>::Type;
-  using V_vec = typename VEC_V<DT>::Type;
-  using Out_sum = typename Vec_fp32_<V_vec>::Type;
-
-  constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
-
-  // eg.  if head_size = 128, thread_per_key = 4, with float32 precision
-  // then K_VEC_SIZE = 1,  QK_VEC_SIZE = 4
-  //  K_ELTS_PER_THREAD = 128 / 4 = 32
-  //  K_VECS_PER_THREAD = 32 / 1 = 32
-  constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT);
-  // constexpr int QK_VEC_SIZE = 16 / sizeof(DT);
-  // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT);
-  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;
-  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE;
-  // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT);
-
-  // thread id
-  int const tidx = threadIdx.x;
-  // head id
-  int const head_idx = blockIdx.x;
-  // request idx
-  int const request_idx = blockIdx.y;
-
-  int requext_idx_in_batch = -1;
-  int cnt_1 = 0;
-  while (cnt_1 < request_idx + 1) {
-    requext_idx_in_batch++;
-    if (request_available[requext_idx_in_batch]) {
-      cnt_1++;
-    }
-  }
-
-  // threads converge
-  //   __syncthreads();
-
-  int const first_step = 0;
-
-  int const tlength =
-      request_infos[requext_idx_in_batch].first_token_index_in_request +
-      request_infos[requext_idx_in_batch].num_tokens_in_batch;
-
-  // shared memory objects
-  extern __shared__ char smem_[];
-
-  float *qk_smem = reinterpret_cast<float *>(smem_);
-  float *out_smem = reinterpret_cast<float *>(smem_);
-
-  float qk_max = -FLT_MAX;
-
-  // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum
-  __shared__ float red_smem[WARPS_PER_BLOCK * 2];
-
-  const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM +
-                    head_idx * per_head_size;
-  __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD];
-  // DT const *q_ptr =
-  //     query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size;
-
-  // q tensor in this thread
-  // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total
-  // K_VECS_PER_THREAD elements
-  // QK_vec_k: 32->1, 64->2, 128->4... head_size
-  // K_vec_k: 4->1, 2->2, 1->4 threads_per_key
-
-  // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE
-  int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE;
-  int ki_o = tidx % THREADS_PER_KEY;
-  // the first key's offset for this thread
-  // ko = 0, 0, 0, 0, 1, 1, 1, 1, ....
-  int ko = tidx / THREADS_PER_KEY;
-  // load q tensor
-  Q_vec q_vec[K_VECS_PER_THREAD];
-#pragma unroll
-  for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-    q_vecs[ki_o][ii] = *reinterpret_cast<Q_vec const *>(
-        q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE);
-  }
-  __syncthreads();
-  // first iter = 128 / 4 = 32
-  // K_VECS_PER_THREAD = 32
-  //  K_PER_ITER how many keys in this loop
-  //  The number of timesteps loaded per iteration.
-  constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY;
-  //   // The number of keys per warp.
-  constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY;
-
-  DT const *k_cache_batch =
-      key_cache + requext_idx_in_batch * max_seq_length * hidden_size + ki;
-
-  int ti_end =
-      div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
-  // get k, perform qk proj
-
-  for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
-    K_vec k[K_VECS_PER_THREAD];
-    int const ti_circ = ti % max_seq_length;
-#pragma unroll
-    for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
-      int jj = ii * THREADS_PER_KEY * K_VEC_SIZE;
-      if (ti < tlength) {
-        k[ii] = *reinterpret_cast<K_vec const *>(k_cache_batch +
-                                                 ti_circ * hidden_size +
-                                                 head_idx * per_head_size + jj);
-      }
-      // Compute dot product.
-      // This includes a reduction across the threads in the same thread group.
-    }
-    float qk = scale * Qk_dot<DT, THREADS_PER_KEY>::dot(q_vecs[ki_o], k);
-    // // todo add positional embedding to the qk production
-    // // Store the product to shared memory. There's one qk value per
-    // timestep.
-    // // Update the max.
-    if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
-      // todo add alobi here
-      bool const mask = ti_circ >= tlength;
-      if (mask) {
-        assert(false);
-      }
-      qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-      qk_smem[ti - first_step] = mask ? 0.f : qk;
-    }
-  }
-
-  __syncthreads();
-
-#pragma unroll
-  for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) {
-    qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-  }
-
-  // Decompose the thread index into warp and lane.
-  int const warp = tidx / WARP_SIZE;
-  int const lane = tidx % WARP_SIZE;
-
-  // The warp leader writes the max to shared memory.
-  if (lane == 0) {
-    red_smem[warp] = qk_max;
-  }
-
-  // Make sure the products are in shared memory.
-  __syncthreads();
-
-  // The warps finalize the reduction.
-  qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX;
-#pragma unroll
-  for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) {
-    qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
-  }
-
-  // Broadcast to all the threads in the warp.
-  qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
-
-  float exp_sum = 0.f;
-  for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-    float logit = __expf(qk_smem[ti - first_step] - qk_max);
-    exp_sum += logit;
-    qk_smem[ti - first_step] = logit;
-  }
-
-  // Compute the sum.
-  exp_sum = block_sum<WARPS_PER_BLOCK>(&red_smem[WARPS_PER_BLOCK], exp_sum);
+using flashinfer::BatchDecodeHandler;
+using flashinfer::BatchDecodeWithPagedKVCacheWrapperDispatched;
+using flashinfer::BatchPrefillHandler;
+using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched;
+using flashinfer::LogitsPostHook;
+using flashinfer::MaskMode;
+using flashinfer::paged_kv_t;
+using flashinfer::PageStorage;
+using flashinfer::PosEncodingMode;
+using flashinfer::QKVLayout;
+
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) *
+         hidden_size;
+}
 
-  // softmax
-  float inv_sum = __fdividef(1.f, exp_sum + 1.e-6);
-  for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) {
-    qk_smem[ti - first_step] *= inv_sum;
-  }
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) *
+         hidden_size;
+}
 
-  __syncthreads();
-  // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) {
-  //   printf("softmax %.10f\n", qk_smem[0]);
-  // }
-
-  // value projection
-  constexpr int V_VEC_SIZE = 16 / sizeof(DT);
-  // A vector of V elements for the current timestep.
-  // using V_vec_k = typename V_vec_k_<DT, V_VEC_SIZE>::Type;
-  // using V_vec_acum = typename V_vec_acum_fp32_<V_vec_k>::Type;
-
-  // The value computed by this thread.
-  int vo = tidx / THREADS_PER_VALUE;
-  // The hidden dimensions computed by this particular thread.
-  int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE;
-  constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE;
-
-  Out_sum out;
-  zero(out);
-
-  // The base pointer for the value in the cache buffer.
-  DT const *v_cache_batch =
-      value_cache + requext_idx_in_batch * max_seq_length * hidden_size + vi;
-
-  if (Dh == Dh_MAX || vi < Dh) {
-    for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) {
-      // Load the values from the cache.
-      int const ti_circ = ti % max_seq_length;
-
-      V_vec v = *reinterpret_cast<V_vec const *>(
-          v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size);
-      float logit = qk_smem[ti - first_step];
-      out = FlexFlow::fma(logit, cast_to_float(v), out);
-    }
+template <typename DT>
+__global__ void
+    update_qkv_cache_kernel(DT *devQKVProjArray,
+                            half *qTmp_ptr,
+                            half *kCache_ptr,
+                            BatchConfig::PerTokenInfo const *tokenInfos,
+                            BatchConfig::PerRequestInfo *request_infos,
+                            int const max_num_pages,
+                            int hidden_size,
+                            int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
   }
 
-  //   // Make sure we can start writing to shared memory.
-  __syncthreads();
-
-  // Run the final reduction amongst the different groups computing different
-  // partial outputs.
-  if (Dh == Dh_MAX || vi < Dh) {
-#pragma unroll
-    for (int active_groups = V_PER_ITER; active_groups >= 2;
-         active_groups /= 2) {
-
-      // The midpoint in the number of active groups.
-      int midpoint = active_groups / 2;
-
-      // The upper part of active threads store to shared memory.
-      if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
-        *reinterpret_cast<Out_sum *>(out_smem + (vo - midpoint) * Dh + vi) =
-            out;
-      }
-      __syncthreads();
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_k_idx = get_k_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size);
+
+  // key and value cache should be stored interleaved
+  kCache_ptr[to_k_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+  kCache_ptr[to_v_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+  qTmp_ptr[token_idx * hidden_size + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
+}
 
-      // The bottom warps update their values.
-      if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
-        out = add(*reinterpret_cast<Out_sum const *>(out_smem + vo * Dh + vi),
-                  out);
-      }
-      __syncthreads();
-    }
-  }
+template <typename DT>
+void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
+                      BatchConfig const *bc,
+                      cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  int const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
+  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                      static_cast<half *>(m->queryTmp),
+                                      static_cast<half *>(m->keyCache),
+                                      m->token_infos,
+                                      m->request_infos,
+                                      max_num_pages,
+                                      m->hidden_size,
+                                      num_new_tokens);
+}
 
-  // Output the final values.
-  if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
-    convert_from_float(
-        *reinterpret_cast<V_vec *>(output_ptr + request_idx * hidden_size +
-                                   head_idx * per_head_size + vi),
-        out);
+template <typename DT>
+__global__ void produce_output_kernel(half const *input_ptr,
+                                      DT *output_ptr,
+                                      int parallelism) {
+  CUDA_KERNEL_LOOP(idx, parallelism) {
+    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
   }
 }
 
@@ -647,26 +478,6 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   //   }
 }
 
-template <typename DT>
-void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m,
-                            BatchConfig const *bc,
-                            cudaStream_t stream) {
-  int num_tokens = bc->num_active_tokens();
-  if (num_tokens > 0) {
-    int parallelism = m->hidden_size * num_tokens;
-    store_kv_cache<<<GET_BLOCKS(parallelism),
-                     min(CUDA_NUM_THREADS, parallelism),
-                     0,
-                     stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                               static_cast<DT *>(m->keyCache),
-                               static_cast<DT *>(m->valueCache),
-                               m->token_infos,
-                               num_tokens,
-                               BatchConfig::max_sequence_length(),
-                               m->hidden_size);
-  }
-}
-
 template <typename DT>
 void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
@@ -740,54 +551,6 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   }
 }
 
-#define LAUNCH_ATTENTION_SCORE_KERNEL(                                         \
-    DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream)   \
-  do {                                                                         \
-    smem_sz = smem_size_in_bytes<DT>(m->qProjSize,                             \
-                                     BatchConfig::max_sequence_length(),       \
-                                     THREADS_PER_VALUE,                        \
-                                     THDS_PER_BLOCK);                          \
-    compute_attention_kernel_generation_kernel<DT,                             \
-                                               THDS_PER_BLOCK,                 \
-                                               Dh,                             \
-                                               Dh_MAX,                         \
-                                               THDS_PER_KEY,                   \
-                                               THREADS_PER_VALUE>              \
-        <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(                           \
-            static_cast<DT *>(m->devQKVProjArray),                             \
-            static_cast<DT *>(m->keyCache),                                    \
-            static_cast<DT *>(m->valueCache),                                  \
-            output_ptr,                                                        \
-            scale,                                                             \
-            BatchConfig::max_sequence_length(),                                \
-            m->qProjSize,                                                      \
-            m->hidden_size,                                                    \
-            m->request_infos,                                                  \
-            m->request_available);                                             \
-  } while (0)
-
-template <typename DT>
-void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m,
-                                         BatchConfig const *bc,
-                                         DT *output_ptr,
-                                         cudaStream_t stream) {
-  dim3 grid(m->num_q_heads, bc->num_tokens);
-  int const per_head_size = m->qProjSize;
-  float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
-  size_t smem_sz;
-  if (per_head_size == 64) {
-    constexpr int THREADS_PER_VALUE_64 = threads_per_value_t<DT, 64>::value;
-    LAUNCH_ATTENTION_SCORE_KERNEL(
-        DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream);
-  } else if (per_head_size == 128) {
-    constexpr int THREADS_PER_VALUE_128 = threads_per_value_t<DT, 128>::value;
-    LAUNCH_ATTENTION_SCORE_KERNEL(
-        DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream);
-  } else {
-    assert(false && "a unsupported head size");
-  }
-}
-
 template <typename DT>
 void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
                              GenericTensorAccessorR const weight,
@@ -849,7 +612,191 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void incr_attention(IncMultiHeadSelfAttentionMeta *m,
+                    BatchConfig const *bc,
+                    DT *output_ptr,
+                    cudaStream_t stream) {
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
+  //   cudaEvent_t t_start, t_end;
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  // global constant parameters
+  uint32_t const num_q_heads = m->num_q_heads;
+  uint32_t const num_kv_heads = m->num_kv_heads;
+  uint32_t const head_dim = m->qProjSize;
+  uint32_t const batch_size = bc->num_active_requests();
+  float const sm_scale =
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (device == 0) {
+  //     std::cout << "Update custom mask time: " << elapsed << " ms\n";
+  //   }
+
+  half *q = static_cast<half *>(m->queryTmp),
+       *kv = static_cast<half *>(m->keyCache),
+       *o = static_cast<half *>(m->outputTmp);
+  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+      num_q_heads,
+      kPagesize,
+      head_dim,
+      batch_size,
+      kv,
+      m->handle.incr_attention_metadata->kv_indices,
+      m->handle.incr_attention_metadata->kv_indptr,
+      m->handle.incr_attention_metadata->kv_last_page_len);
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    attn prep time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  void *handler = nullptr;
+
+  if (!bc->prompt_phase) {
+    assert(m->handle.incr_attention_metadata->decode_handler_collections.count(
+               batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = m->handle.incr_attention_metadata
+                  ->decode_handler_collections[batch_size];
+  } else {
+    assert(m->handle.incr_attention_metadata->prompt_handler_collections.count(
+               batch_size) != 0 &&
+           "Handler is not initialized");
+    handler = m->handle.incr_attention_metadata
+                  ->prompt_handler_collections[batch_size];
+  }
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    BeginForward time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  DISPATCH_HEADDIM(head_dim, HEAD_DIM, {
+    cudaError_t result;
+    if (bc->prompt_phase) {
+      result =
+          BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                        HEAD_DIM,
+                                                        LogitsPostHook::kNone,
+                                                        QKVLayout::kNHD,
+                                                        PosEncodingMode::kNone,
+                                                        false,
+                                                        MaskMode::kCausal,
+                                                        half,
+                                                        half,
+                                                        int32_t>(
+              static_cast<BatchPrefillHandler *>(handler),
+              q,
+              m->handle.incr_attention_metadata->q_indptr,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              /*custom_mask=*/nullptr,
+              /*qk_indptr=*/nullptr,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    } else {
+      result =
+          BatchDecodeWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
+                                                       HEAD_DIM,
+                                                       LogitsPostHook::kNone,
+                                                       QKVLayout::kNHD,
+                                                       PosEncodingMode::kNone,
+                                                       half,
+                                                       half,
+                                                       half,
+                                                       int32_t>(
+              static_cast<BatchDecodeHandler *>(handler),
+              q,
+              /*q_offset=*/nullptr,
+              paged_kv,
+              o,
+              /*lse=*/nullptr,
+              num_q_heads,
+              /*logits_soft_cap=*/0.f,
+              sm_scale,
+              /*rope_scale=*/1.f,
+              /*rope_theta=*/static_cast<float>(1e4),
+              stream);
+    }
+    if (result != cudaSuccess) {
+      throw std::runtime_error("Failed to run "
+                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               std::string(cudaGetErrorString(result)));
+    }
+  });
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    actual attn time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+
+  //   cudaEventCreate(&t_start);
+  //   cudaEventCreate(&t_end);
+  //   cudaEventRecord(t_start, stream);
+
+  {
+    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+    produce_output_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(m->outputTmp, output_ptr, parallelism);
+  }
+
+  //   cudaEventRecord(t_end, stream);
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   if (device == 0) {
+  //     printf("    produce_output_kernel time: %.4f ms\n", elapsed);
+  //   }
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+}
+
+template <typename DT>
+void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
                       BatchConfig const *bc,
                       int shard_id,
                       DT const *input_ptr,
@@ -878,7 +825,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                      static_cast<DT *>(m->devQKVProjArray),
                      bias_ptr,
                      stream);
-  update_kv_cache_kernel<DT>(m, bc, stream);
+  // phase 2: Update key/val cache
+  update_qkv_cache<DT>(m, bc, stream);
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
@@ -892,15 +840,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
   // cudaEventCreate(&t_end);
   // cudaEventRecord(t_start, stream);
 
-  if (bc->prompt_phase) {
-    // phase 3: Compute attention score for prompt tokens;
-    compute_attention_kernel_prompt(
-        m, bc, shard_id, bias_ptr, weight_ptr, stream);
-  } else {
-    // phase 3: Compute attention score for generation tokens
-    compute_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>(m->attn_heads), stream);
-  }
+  // phase 3: Compute attention score
+  incr_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
@@ -985,253 +926,9 @@ __global__ void fill_entries_above_diagonal(DT *matrix,
   }
 }
 
-template <typename DT>
-void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m,
-                                     BatchConfig const *bc,
-                                     int shard_id,
-                                     DT const *bias_ptr,
-                                     DT const *weight_ptr,
-                                     cudaStream_t stream) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-  // int num_requests = bc->num_active_requests();
-  int num_tokens = bc->num_active_tokens();
-  int tokens_previous_requests = 0;
-  int q_block_size = m->qProjSize;
-  int kt_block_size = m->kProjSize;
-  int kt_req_block_size =
-      kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length();
-  int vt_block_size = m->vProjSize;
-  int vt_req_block_size =
-      vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length();
-  assert(m->qProjSize == m->kProjSize);
-
-  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
-    if (!bc->request_available[i]) {
-      continue;
-    }
-    int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    if (num_new_tokens == 0) {
-      continue;
-    }
-    int total_tokens = bc->requestsInfo[i].first_token_index_in_request +
-                       bc->requestsInfo[i].num_tokens_in_batch;
-    // Step 1: compute query-key product QK.T/sqrt(d_k)
-    {
-      // Scale by sqrt(d_k) as per the original attention paper
-      DT alpha = 1.0f, beta = 0.0f;
-      if (*m->qk_prod_scaling) {
-        alpha = static_cast<DT>(1.0f / sqrt(m->kProjSize));
-      }
-      // after transpositions
-      int m_ = num_new_tokens;
-      int n = total_tokens;
-      int k = m->qProjSize;
-      // before transpositions
-      int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads,
-          ldc = m_;
-      // N.B. strides are applied before transpose operations
-      int strideA = q_block_size;
-      int strideB = kt_block_size;
-      int strideC = num_new_tokens * total_tokens;
-
-      // matrix A: devQKVProjArray
-      // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens]
-      // To get query projection, skip over Q entries from previous requests
-      DT const *A = static_cast<DT *>(m->devQKVProjArray) +
-                    bc->requestsInfo[i].first_token_offset_in_batch *
-                        m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
-      // matrix B: key cache
-      // matrix B's layout: [kProjSize * num_heads, total_tokens]
-      // To get B, skip over K entries from previous requests (all heads +
-      // padding)
-      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-      // matrix C: qk_prods
-      // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
-      // To get C, skip over QK.T products from previous requests
-      DT *C = static_cast<DT *>(m->qk_prods);
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    // Step 2: Add alibi position bias to qk production
-    // matrix C: qk_prods
-    // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
-    // To get C, skip over QK.T products from previous requests
-    DT *C = static_cast<DT *>(m->qk_prods);
-    if (*m->position_bias) {
-      size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens;
-      apply_position_bias_qkprd<<<GET_BLOCKS(parallelism),
-                                  min((size_t)CUDA_NUM_THREADS, parallelism),
-                                  0,
-                                  stream>>>(C,
-                                            num_new_tokens,
-                                            total_tokens,
-                                            m->num_q_heads,
-                                            m->global_num_q_heads,
-                                            shard_id);
-    }
-
-    // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods
-    // with -inf to force causal attention.
-    assert(num_new_tokens <= total_tokens);
-    size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2;
-    if (entries_above_diagonal > 0) {
-      size_t parallelism = m->num_q_heads * entries_above_diagonal;
-      fill_entries_above_diagonal<<<GET_BLOCKS(parallelism),
-                                    min((size_t)CUDA_NUM_THREADS, parallelism),
-                                    0,
-                                    stream>>>(C,
-                                              num_new_tokens,
-                                              total_tokens,
-                                              m->num_q_heads,
-                                              entries_above_diagonal,
-                                              static_cast<DT>(-INFINITY));
-    }
-
-    // Step 4: Compute Softmax(QK.T/sqrt(d_k))
-    {
-      // Before modifying the parameters below, make sure to read the following
-      // description of the CUDNN_TENSOR_NCHW tensor layout, from
-      // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t:
-      // This tensor format specifies that the data is laid out in the following
-      // order: batch size, feature maps, rows, columns. The strides are
-      // implicitly defined in such a way that the data are contiguous in memory
-      // with no padding between images, feature maps, rows, and columns; the
-      // columns are the inner dimension and the images are the outermost
-      // dimension.
-      int n_param = m->num_q_heads;
-      int c_param = total_tokens;
-      int h_param = 1;
-      int w_param = num_new_tokens;
-      checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor,
-                                            CUDNN_TENSOR_NCHW,
-                                            cudnn_data_type,
-                                            n_param,
-                                            c_param,
-                                            h_param,
-                                            w_param));
-      float softmax_alpha = 1.0f, softmax_beta = 0.0f;
-      DT *C_softmax = static_cast<DT *>(m->qk_prods_softmax);
-      // The softmax operation below is executed according to the
-      // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The
-      // softmax operation is computed per spatial location (H,W) per image (N)
-      // across dimension C.
-      checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
-                                     CUDNN_SOFTMAX_ACCURATE,
-                                     CUDNN_SOFTMAX_MODE_CHANNEL,
-                                     &softmax_alpha,
-                                     m->qk_tensor,
-                                     C,
-                                     &softmax_beta,
-                                     m->qk_tensor,
-                                     C_softmax));
-    }
-    // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @
-    // softmax(QK.T/sqrt(d_k)).T
-    {
-      DT alpha = 1.0f, beta = 0.0f;
-      // after transpositions
-      int m_ = m->vProjSize;
-      int n = num_new_tokens;
-      int k = total_tokens;
-      // before transpositions
-      int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads;
-      // N.B. strides are applied before transpose operations
-      int strideA = vt_block_size;
-      int strideB = num_new_tokens * total_tokens;
-      int strideC = m->vProjSize;
-      // matrix A: value cache
-      // matrix A's layout: [vProjSize, num_heads, total_tokens]
-      // To get A, skip over V.T entries from previous requests (all heads +
-      // padding)
-      DT *A = static_cast<DT *>(m->valueCache) + i * vt_req_block_size;
-      // matrix B: qk_prods_softmax
-      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
-      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      DT *B = static_cast<DT *>(m->qk_prods_softmax);
-      ;
-      // matrix C: attn heads
-      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
-      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
-      // requests
-      // store the result attn heads, also skip the genration tokens
-      DT *C = static_cast<DT *>(m->attn_heads) +
-              (bc->requestsInfo[i].first_token_offset_in_batch) *
-                  m->num_q_heads * m->vProjSize;
-      checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_T,
-                                           m_,
-                                           n,
-                                           k,
-                                           &alpha,
-                                           A,
-                                           cublas_data_type,
-                                           lda,
-                                           strideA,
-                                           B,
-                                           cublas_data_type,
-                                           ldb,
-                                           strideB,
-                                           &beta,
-                                           C,
-                                           cublas_data_type,
-                                           ldc,
-                                           strideC,
-                                           m->num_q_heads,
-                                           compute_type,
-                                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    tokens_previous_requests += num_new_tokens;
-  }
-  //   if (tokens_previous_requests != (num_tokens - bc->num_tokens)) {
-  //     bc->print();
-  //     printf("tokens_previous_requests: %i\n", tokens_previous_requests);
-  //     printf("num_tokens: %i\n", num_tokens);
-  //     printf("bc->num_tokens: %i\n", bc->num_tokens);
-  //   }
-  //   assert(tokens_previous_requests == (num_tokens - bc->num_tokens));
-}
-
 /*static*/
 void IncMultiHeadSelfAttention::inference_kernel_wrapper(
-    IncMultiHeadSelfAttentionMeta const *m,
+    IncMultiHeadSelfAttentionMeta *m,
     BatchConfig const *bc,
     int shard_id,
     GenericTensorAccessorR const &input,
@@ -1259,7 +956,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
     }
     half const *bias_ptr =
         use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
-    Kernels::IncMultiHeadAttention::inference_kernel(
+    Kernels::IncMultiHeadAttention::inference_kernel<half>(
         m,
         bc,
         shard_id,
@@ -1274,7 +971,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
-    Kernels::IncMultiHeadAttention::inference_kernel(
+    Kernels::IncMultiHeadAttention::inference_kernel<float>(
         m,
         bc,
         shard_id,
@@ -1441,17 +1138,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
          BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
         kPagesize;
     switch (infer_mode) {
-      case INC_DECODING_MODE: {
-        key_cache_size = num_q_heads * kProjSize *
-                         BatchConfig::max_requests_per_batch() *
-                         BatchConfig::max_sequence_length();
-        value_cache_size = num_q_heads * vProjSize *
-                           BatchConfig::max_requests_per_batch() *
-                           BatchConfig::max_sequence_length();
-        qk_prod_size = BatchConfig::max_sequence_length() *
-                       BatchConfig::max_sequence_length() * num_q_heads;
-        break;
-      }
+      case INC_DECODING_MODE:
       case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
         query_tmp_size =
@@ -1588,6 +1275,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
              gpu_mem_allocator.reserved_allocated_size);
     }
   }
+
+  // set attention constants
+  handler.incr_attention_metadata->set_enabled(true);
+  handler.incr_attention_metadata->set_num_q_heads(num_q_heads);
+  handler.incr_attention_metadata->set_num_kv_heads(num_kv_heads);
+  handler.incr_attention_metadata->set_head_dim(qProjSize);
+
   cudaStreamSynchronize(stream);
 }
 
@@ -1628,18 +1322,4 @@ template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<half>(
     half const *bias_ptr,
     int num_tokens,
     cudaStream_t stream);
-
-template void
-    Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<float>(
-        IncMultiHeadSelfAttentionMeta const *m,
-        BatchConfig const *bc,
-        float *output_ptr,
-        cudaStream_t stream);
-
-template void
-    Kernels::IncMultiHeadAttention::compute_attention_kernel_generation<half>(
-        IncMultiHeadSelfAttentionMeta const *m,
-        BatchConfig const *bc,
-        half *output_ptr,
-        cudaStream_t stream);
 }; // namespace FlexFlow
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 21d791f83..d73bb868d 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -25,30 +25,6 @@
 #include <sstream>
 #include <stdexcept>
 
-#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
-  switch (head_dim) {                                                          \
-    case 64: {                                                                 \
-      constexpr size_t HEAD_DIM = 64;                                          \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 128: {                                                                \
-      constexpr size_t HEAD_DIM = 128;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 256: {                                                                \
-      constexpr size_t HEAD_DIM = 256;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    default: {                                                                 \
-      std::ostringstream err_msg;                                              \
-      err_msg << "Unsupported head_dim: " << head_dim;                         \
-      throw std::invalid_argument(err_msg.str());                              \
-    }                                                                          \
-  }
-
 namespace FlexFlow {
 
 // declare Legion names
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 31451201b..7b2db3b2a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -25,30 +25,6 @@
 #include <sstream>
 #include <stdexcept>
 
-#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
-  switch (head_dim) {                                                          \
-    case 64: {                                                                 \
-      constexpr size_t HEAD_DIM = 64;                                          \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 128: {                                                                \
-      constexpr size_t HEAD_DIM = 128;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 256: {                                                                \
-      constexpr size_t HEAD_DIM = 256;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    default: {                                                                 \
-      std::ostringstream err_msg;                                              \
-      err_msg << "Unsupported head_dim: " << head_dim;                         \
-      throw std::invalid_argument(err_msg.str());                              \
-    }                                                                          \
-  }
-
 namespace FlexFlow {
 
 // declare Legion names
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 67c13b201..67d02c857 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -14,6 +14,7 @@
  */
 #include "flexflow/model.h"
 #include "flexflow/utils/cuda_helper.h"
+#include <cassert>
 
 namespace FlexFlow {
 // declare Legion names
@@ -89,8 +90,11 @@ FFHandler
   handle.offload_reserve_space_size = info->offload_reserve_space_size;
   handle.quantization_type = info->quantization_type;
   handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion;
+  handle.incr_attention_metadata = new AttentionMetaData();
   handle.tree_search_attention_metadata = new AttentionMetaData();
   handle.tree_verify_attention_metadata = new AttentionMetaData();
+  assert(handle.incr_attention_metadata != nullptr &&
+         "Attention metadata must be allocated");
   assert(handle.tree_search_attention_metadata != nullptr &&
          "Attention metadata must be allocated");
   assert(handle.tree_verify_attention_metadata != nullptr &&
@@ -158,6 +162,7 @@ FFHandler
     handle.offload_reserve_space = nullptr;
   }
   if (handle.batch_config_metadata_size +
+          handle.incr_attention_metadata->mem_size() +
           handle.tree_search_attention_metadata->mem_size() +
           handle.tree_verify_attention_metadata->mem_size() >
       0) {
@@ -170,6 +175,7 @@ FFHandler
         Realm::Point<1, coord_t>(0),
         Realm::Point<1, coord_t>(
             handle.batch_config_metadata_size +
+            handle.incr_attention_metadata->mem_size() +
             handle.tree_search_attention_metadata->mem_size() +
             handle.tree_verify_attention_metadata->mem_size() - 1));
     std::vector<size_t> field_sizes;
@@ -184,17 +190,24 @@ FFHandler
         .wait();
     handle.batch_config_metadata =
         workspaceInst.pointer_untyped(0, sizeof(char));
-    handle.tree_search_attention_metadata->assign_address(
+    handle.incr_attention_metadata->assign_address(
         static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
                             handle.batch_config_metadata_size),
+        handle.incr_attention_metadata->mem_size());
+    handle.tree_search_attention_metadata->assign_address(
+        static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
+                            handle.batch_config_metadata_size +
+                            handle.incr_attention_metadata->mem_size()),
         handle.tree_search_attention_metadata->mem_size());
     handle.tree_verify_attention_metadata->assign_address(
         static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
                             handle.batch_config_metadata_size +
+                            handle.incr_attention_metadata->mem_size() +
                             handle.tree_search_attention_metadata->mem_size()),
         handle.tree_verify_attention_metadata->mem_size());
   } else {
     handle.batch_config_metadata = nullptr;
+    handle.incr_attention_metadata->assign_address(nullptr, 0);
     handle.tree_search_attention_metadata->assign_address(nullptr, 0);
     handle.tree_verify_attention_metadata->assign_address(nullptr, 0);
   }
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 9a6394eca..b2b9ca803 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -13,7 +13,9 @@
  * limitations under the License.
  */
 
+#include "flashinfer/decode_attention_decl.cuh"
 #include "flashinfer/prefill_attention_decl.cuh"
+#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
 #include "flexflow/request_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 
@@ -21,7 +23,13 @@ namespace FlexFlow {
 
 using namespace Legion;
 
+using flashinfer::BatchDecodeHandler;
 using flashinfer::BatchPrefillHandler;
+using flashinfer::LogitsPostHook;
+using flashinfer::paged_kv_t;
+using flashinfer::PageStorage;
+using flashinfer::PosEncodingMode;
+using flashinfer::QKVLayout;
 
 void RequestManager::load_tokens_task(
     Task const *task,
@@ -224,7 +232,132 @@ void RequestManager::load_batch_config_task(
   total_copy_size += sizeof(BatchConfig::request_available);
 
   // load attention metadata
-  if (batch_config->get_mode() == TREE_SEARCH_MODE) {
+  if (batch_config->get_mode() == INC_DECODING_MODE) {
+    if (handle.incr_attention_metadata->enabled()) {
+      // calculate the attention meta data
+      {
+        BatchConfig::PerRequestInfo *request_infos =
+            reinterpret_cast<BatchConfig::PerRequestInfo *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo));
+        bool *request_available = reinterpret_cast<bool *>(
+            static_cast<char *>(handle.batch_config_metadata) +
+            sizeof(BatchConfig::tokensInfo) +
+            sizeof(BatchConfig::requestsInfo));
+        int batch_size = batch_config->num_active_requests();
+        uint32_t const max_num_pages =
+            (BatchConfig::max_sequence_length() +
+             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+            kPagesize;
+
+        int parallelism = batch_size;
+        prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
+                                          min(CUDA_NUM_THREADS, parallelism),
+                                          0,
+                                          stream>>>(
+            batch_size,
+            request_infos,
+            request_available,
+            max_num_pages,
+            handle.incr_attention_metadata->q_indptr,
+            handle.incr_attention_metadata->kv_indptr,
+            handle.incr_attention_metadata->kv_indices,
+            handle.incr_attention_metadata->kv_last_page_len,
+            handle.incr_attention_metadata->qk_indptr);
+      }
+
+      // prepare attention forward handler
+      {
+        int batch_size = batch_config->num_active_requests();
+        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+            kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+            kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
+        q_indptr_h[0] = 0;
+        kv_indptr_h[0] = 0;
+        for (int req_idx = 0, indptr_idx = 0;
+             req_idx < batch_config->max_requests_per_batch();
+             req_idx++) {
+          if (batch_config->request_available[req_idx]) {
+            int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+            int kv_len =
+                batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                batch_config->requestsInfo[req_idx]
+                    .first_token_index_in_request;
+            q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+            kv_indptr_h[indptr_idx + 1] =
+                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+            kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
+            indptr_idx++;
+          }
+        }
+
+        if (!batch_config->prompt_phase) {
+          BatchDecodeHandler *handler = nullptr;
+          if (handle.incr_attention_metadata->decode_handler_collections.count(
+                  batch_size) == 0) {
+            handle.incr_attention_metadata
+                ->decode_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchDecodeHandler(true, batch_size));
+          }
+          handler = static_cast<BatchDecodeHandler *>(
+              handle.incr_attention_metadata
+                  ->decode_handler_collections[batch_size]);
+
+          handler->SetCUDAStream(stream);
+          DISPATCH_HEADDIM(handle.incr_attention_metadata->head_dim(), HEAD_DIM, {
+            handler->BeginForwardDispatched<HEAD_DIM,
+                                            PageStorage::kIndices,
+                                            LogitsPostHook::kNone,
+                                            QKVLayout::kNHD,
+                                            PosEncodingMode::kNone,
+                                            half,
+                                            half,
+                                            half,
+                                            int32_t>(
+                static_cast<void *>(
+                    static_cast<char *>(
+                        handle.incr_attention_metadata->workspace) +
+                    handle.incr_attention_metadata->workspace_block *
+                        batch_size),
+                handle.incr_attention_metadata->workspace_block,
+                static_cast<int32_t *>(kv_indptr_h),
+                static_cast<int32_t *>(kv_last_page_len_h),
+                batch_size,
+                handle.incr_attention_metadata->num_q_heads(),
+                handle.incr_attention_metadata->num_q_heads(),
+                kPagesize);
+          });
+        } else {
+          BatchPrefillHandler *handler = nullptr;
+          if (handle.incr_attention_metadata->prompt_handler_collections.count(
+                  batch_size) == 0) {
+            handle.incr_attention_metadata
+                ->prompt_handler_collections[batch_size] =
+                static_cast<void *>(new flashinfer::BatchPrefillHandler(true));
+          }
+          handler = static_cast<BatchPrefillHandler *>(
+              handle.incr_attention_metadata
+                  ->prompt_handler_collections[batch_size]);
+
+          handler->SetCUDAStream(stream);
+          handler->BeginForward<half, int32_t>(
+              static_cast<void *>(
+                  static_cast<char *>(
+                      handle.incr_attention_metadata->workspace) +
+                  handle.incr_attention_metadata->workspace_block *
+                      batch_size),
+              handle.incr_attention_metadata->workspace_block,
+              static_cast<int32_t *>(q_indptr_h),
+              static_cast<int32_t *>(kv_indptr_h),
+              batch_size,
+              handle.incr_attention_metadata->num_q_heads(),
+              handle.incr_attention_metadata->num_q_heads(),
+              handle.incr_attention_metadata->head_dim(),
+              kPagesize);
+        }
+      }
+    }
+  } else if (batch_config->get_mode() == TREE_SEARCH_MODE) {
     if (handle.tree_search_attention_metadata->enabled()) {
       for (int request_idx = 0;
            request_idx < BatchConfig::max_requests_per_batch();

From a14b9b20a43d122818d886a68dfae3eadfe76578 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 16:57:16 -0700
Subject: [PATCH 391/667] feat: clean up incr_attention, move global code into
 separate file

---
 .../inc_multihead_self_attention_kernels.h    |  60 --
 src/ops/inc_multihead_self_attention.cu       | 555 ------------------
 .../inc_multihead_self_attention_kernels.cu   | 551 +++++++++++++++++
 3 files changed, 551 insertions(+), 615 deletions(-)
 create mode 100644 src/ops/kernels/inc_multihead_self_attention_kernels.cu

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 27f448a4d..6bc52194b 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -24,66 +24,6 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
                          int num_tokens,
                          ffStream_t stream);
 
-template <typename DT>
-__global__ void apply_position_bias_qkprd(DT *input_ptr,
-                                          int num_tokens,
-                                          int num_total_tokens,
-                                          int num_heads,
-                                          int global_num_q_heads,
-                                          int shard_id);
-
-template <typename DT>
-__global__ void apply_proj_bias_w(DT *input_ptr,
-                                  DT const *bias_ptr,
-                                  int num_tokens,
-                                  int qkv_weight_size,
-                                  int oProjSize);
-
-template <typename DT>
-__global__ void apply_proj_bias_qkv(DT *input_ptr,
-                                    DT const *bias_ptr,
-                                    int shard_id,
-                                    int num_tokens,
-                                    int qProjSize,
-                                    int kProjSize,
-                                    int vProjSize,
-                                    int num_heads,
-                                    int num_kv_heads,
-                                    bool scaling_query,
-                                    float scaling_factor);
-
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-template <typename DT>
-__global__ void
-    apply_rotary_embedding(DT *input_ptr,
-                           cuFloatComplex *complex_input,
-                           BatchConfig::PerTokenInfo const *tokenInfos,
-                           int qProjSize,
-                           int kProjSize,
-                           int num_heads,
-                           int num_tokens,
-                           int num_kv_heads,
-                           int q_block_size,
-                           int k_block_size,
-                           int q_array_size,
-                           bool q_tensor);
-#elif defined(FF_USE_HIP_ROCM)
-template <typename DT>
-__global__ void
-    apply_rotary_embedding(DT *input_ptr,
-                           hipFloatComplex *complex_input,
-                           BatchConfig::PerTokenInfo const *tokenInfos,
-                           int qProjSize,
-                           int kProjSize,
-                           int num_heads,
-                           int num_tokens,
-                           int num_kv_heads,
-                           int q_block_size,
-                           int k_block_size,
-                           int q_array_size,
-                           bool q_tensor);
-#endif
-
 template <typename DT>
 void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 5d3c2ebd6..c6477b301 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -132,485 +132,6 @@ __global__ void produce_output_kernel(half const *input_ptr,
   }
 }
 
-// only used by MPT model. https://arxiv.org/abs/2108.12409
-template <typename DT>
-__global__ void apply_position_bias_qkprd(DT *input_ptr,
-                                          int num_tokens,
-                                          int num_total_tokens,
-                                          int num_heads,
-                                          int global_num_q_heads,
-                                          int shard_id) {
-  CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) {
-    // get head_idx,
-    int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id);
-    int position_idx = (i / num_tokens) % num_total_tokens;
-    position_idx = position_idx + 1 - num_total_tokens;
-    // 8 is alibi_bias_max in
-    // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json
-    float base = (float)(head_idx + 1) * 8 / global_num_q_heads;
-    float slopes = 1.0 / pow(2, base);
-    // if(i == 0){
-    //   printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes,
-    //   position_idx * slopes);
-    // }
-    input_ptr[i] += static_cast<DT>(position_idx * slopes);
-  }
-}
-
-template <typename DT>
-__global__ void apply_proj_bias_w(DT *input_ptr,
-                                  DT const *bias_ptr,
-                                  int num_tokens,
-                                  int qkv_weight_size,
-                                  int oProjSize) {
-  CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) {
-    int bias_idx = qkv_weight_size + i % oProjSize;
-    input_ptr[i] += bias_ptr[bias_idx];
-  }
-}
-
-template <typename DT>
-__global__ void apply_proj_bias_qkv(DT *input_ptr,
-                                    DT const *bias_ptr,
-                                    int shard_id,
-                                    int num_tokens,
-                                    int qProjSize,
-                                    int kProjSize,
-                                    int vProjSize,
-                                    int global_num_q_heads,
-                                    int num_q_heads,
-                                    bool scaling_query,
-                                    float scaling_factor,
-                                    int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) {
-    // for simplicity, assume q, k, v is in same shape
-    // 0->q, 1->k, 2->v
-    // int qkv_index = i / (num_tokens * qProjSize) % 3;
-
-    int token_idx = i / (hidden_size * QKV_WEIGHT_NUM);
-    size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM;
-
-    int qkv_index = in_token_idx / hidden_size;
-
-    int proj_size = qkv_index == 0 ? qProjSize : kProjSize;
-
-    int head_idx =
-        (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size;
-    int global_head_idx = head_idx + shard_id * num_q_heads;
-
-    size_t pre_length =
-        qkv_index == 0
-            ? 0
-            : (qkv_index == 1 ? qProjSize * global_num_q_heads
-                              : qProjSize * global_num_q_heads * KV_WEIGHT_NUM);
-
-    size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size;
-
-    input_ptr[i] += bias_ptr[bias_idx];
-
-    if (scaling_query && qkv_index == 0) {
-      input_ptr[i] *= scaling_factor;
-    }
-  }
-}
-
-template <typename DT>
-__global__ void scaling_query_kernel(DT *input_ptr,
-                                     int qProjSize,
-                                     int num_tokens,
-                                     int num_q_heads,
-                                     float scaling_factor,
-                                     int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
-    int token_idx = i / hidden_size;
-    input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *=
-        scaling_factor;
-  }
-}
-
-template <typename DT>
-__global__ void
-    apply_rotary_embedding_native(DT *input_ptr,
-                                  cuFloatComplex *complex_input,
-                                  BatchConfig::PerTokenInfo const *tokenInfos,
-                                  int qProjSize,
-                                  int kProjSize,
-                                  int num_q_heads,
-                                  int num_tokens,
-                                  int num_kv_heads,
-                                  int q_block_size,
-                                  int k_block_size,
-                                  int q_array_size) {
-  CUDA_KERNEL_LOOP(
-      i,
-      num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
-    // create complex number
-    bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qProjSize : kProjSize;
-    int real_i = q_tensor ? i : i - q_array_size / 2;
-
-    int head_idx = real_i / (num_tokens * proj_size / 2);
-    int idx = real_i % (num_tokens * proj_size / 2);
-    int real_part_index = idx * 2 +
-                          head_idx * (q_tensor ? q_block_size : k_block_size) +
-                          (q_tensor ? 0 : q_array_size);
-
-    int complex_part_index = real_part_index + 1;
-
-    complex_input[i] = {input_ptr[real_part_index],
-                        input_ptr[complex_part_index]};
-
-    int token_idx =
-        (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
-
-    // float before_real = complex_input[i].x, before_complex =
-    // complex_input[i].y;
-
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
-    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
-
-    complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    input_ptr[real_part_index] = complex_input[i].x;
-    input_ptr[complex_part_index] = complex_input[i].y;
-  }
-}
-
-template <typename DT>
-__global__ void
-    apply_rotary_embedding_hf(DT *input_ptr,
-                              cuFloatComplex *complex_input,
-                              BatchConfig::PerTokenInfo const *tokenInfos,
-                              int qProjSize,
-                              int kProjSize,
-                              int num_tokens,
-                              size_t q_array_size,
-                              int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
-    // create complex number
-    bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qProjSize : kProjSize;
-    int real_i = q_tensor ? i : i - q_array_size / 2;
-
-    int token_idx = real_i / (hidden_size / 2);
-    int idx = real_i % (proj_size / 2);
-    int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2);
-
-    int real_part_index = idx + head_idx * proj_size +
-                          token_idx * hidden_size * QKV_WEIGHT_NUM +
-                          hidden_size * (q_tensor ? 0 : 1);
-    int complex_part_index = real_part_index + (proj_size / 2);
-
-    // complex_input[i] = {input_ptr[real_part_index],
-    //                     input_ptr[complex_part_index]};
-    cuFloatComplex cii = {input_ptr[real_part_index],
-                          input_ptr[complex_part_index]};
-
-    // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64
-    // apply a Cartesian coordinate transformation
-    // multiple with input & /copy back to q/k
-
-    // get position of token
-
-    // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
-
-    // float before_real = complex_input[i].x, before_complex =
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
-    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
-
-    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    // input_ptr[real_part_index] = complex_input[i].x;
-    // input_ptr[complex_part_index] = complex_input[i].y;
-
-    cii = cuCmulf(cii, complex_pos);
-    input_ptr[real_part_index] = cii.x;
-    input_ptr[complex_part_index] = cii.y;
-  }
-}
-
-template <typename DT>
-void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        int shard_id,
-                        DT const *input_ptr,
-                        DT const *weight_ptr,
-                        DT *output_ptr,
-                        DT const *bias_ptr,
-                        cudaStream_t stream) {
-
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  assert(m->qSize == m->vSize && m->qSize == m->kSize);
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-
-  //   int device;
-  //   checkCUDA(cudaGetDevice(&device));
-  //   cudaEvent_t t_start, t_end;
-  //   checkCUDA(cudaEventCreate(&t_start));
-  //   checkCUDA(cudaEventCreate(&t_end));
-  //   checkCUDA(cudaEventRecord(t_start, stream));
-
-  // Step 1: Compute QKV projections
-  {
-    DT alpha = 1.0f, beta = 0.0f;
-    // after transpositions
-    int m_q = m->qProjSize * m->num_q_heads;
-    int m_k = m->kProjSize * m->num_q_heads;
-    int m_v = m->vProjSize * m->num_q_heads;
-    assert(m_q == m_k && m_k == m_v); // keep things simple for now
-    int n = bc->num_active_tokens();
-    int k = m->qSize;
-    int m_ = m_q * QKV_WEIGHT_NUM;
-    // before transpositions
-    int lda = k, ldb = k, ldc = m_;
-    // matrix A: QKV weights
-    // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3]
-    // matrix B: input
-    // matrix B's layout: [qSize (hidden_dim), num_new_tokens]
-    // matrix C: devQKVProjArray
-    // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens]
-    checkCUDA(cublasGemmEx(m->handle.blas,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           m_,
-                           n,
-                           k,
-                           &alpha,
-                           weight_ptr,
-                           cublas_data_type,
-                           lda,
-                           input_ptr,
-                           cublas_data_type,
-                           ldb,
-                           &beta,
-                           output_ptr,
-                           cublas_data_type,
-                           ldc,
-                           compute_type,
-                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-  }
-
-  //   checkCUDA(cudaEventRecord(t_end, stream));
-  //   checkCUDA(cudaEventSynchronize(t_end));
-  //   float elapsed = 0;
-  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   cudaEventDestroy(t_start);
-  //   cudaEventDestroy(t_end);
-  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
-  //     std::cout << "GEMM time: " << elapsed << " ms\n";
-  //   }
-
-  int num_tokens = bc->num_active_tokens();
-  int parallelism = m->kProjSize * num_tokens * m->num_q_heads;
-  size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads;
-
-  // Step 2: apply bias for QKV, or scale the query
-  if (*m->qkv_bias) {
-    apply_proj_bias_qkv<<<GET_BLOCKS(parallelism),
-                          min(CUDA_NUM_THREADS, parallelism),
-                          0,
-                          stream>>>(output_ptr,
-                                    bias_ptr,
-                                    shard_id,
-                                    num_tokens,
-                                    m->qProjSize,
-                                    m->kProjSize,
-                                    m->vProjSize,
-                                    m->global_num_q_heads,
-                                    m->num_q_heads,
-                                    *m->scaling_query,
-                                    m->scaling_factor,
-                                    m->hidden_size);
-  } else if (m->scaling_query) {
-    scaling_query_kernel<<<GET_BLOCKS(parallelism),
-                           min(CUDA_NUM_THREADS, parallelism),
-                           0,
-                           stream>>>(output_ptr,
-                                     num_tokens,
-                                     m->num_q_heads,
-                                     m->qProjSize,
-                                     m->scaling_factor,
-                                     m->hidden_size);
-  }
-
-  //   checkCUDA(cudaEventCreate(&t_start));
-  //   checkCUDA(cudaEventCreate(&t_end));
-  //   checkCUDA(cudaEventRecord(t_start, stream));
-
-  // Step 3: apply rotary embedding if needed
-  if (*m->apply_rotary_embedding) {
-    /*q&k*/
-    parallelism = num_tokens * m->hidden_size;
-    apply_rotary_embedding_hf<<<GET_BLOCKS(parallelism),
-                                min(CUDA_NUM_THREADS, parallelism),
-                                0,
-                                stream>>>(output_ptr,
-                                          m->complex_input,
-                                          m->token_infos,
-                                          m->qProjSize,
-                                          m->kProjSize,
-                                          num_tokens,
-                                          q_array_size,
-                                          m->hidden_size);
-  }
-  //   checkCUDA(cudaEventRecord(t_end, stream));
-  //   checkCUDA(cudaEventSynchronize(t_end));
-  //   elapsed = 0;
-  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   cudaEventDestroy(t_start);
-  //   cudaEventDestroy(t_end);
-  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
-  //     std::cout << "Rotary time: " << elapsed << " ms\n";
-  //   }
-}
-
-template <typename DT>
-void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         int shard_id,
-                         DT *output_ptr,
-                         DT const *weight_ptr,
-                         DT const *bias_ptr,
-                         int num_tokens,
-                         cudaStream_t stream) {
-  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
-  assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
-  cudaDataType_t compute_type = cublas_data_type;
-#endif
-  // Project to output, save result directly on output tensor
-  {
-    DT alpha = 1.0f, beta = 0.0f;
-    // after transpositions
-    int m_ = m->oProjSize;
-    int k = m->vProjSize * m->num_q_heads;
-    int n = num_tokens;
-    // before transpositions
-    int lda = k, ldb = k, ldc = m_;
-    // matrix A: output projection weight
-    // matrix A's layout: [vProjSize * num_heads, oProjSize]
-    DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
-                                           m->kProjSize * m->num_q_heads +
-                                           m->vProjSize * m->num_q_heads);
-    // matrix B: attn heads
-    // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
-    DT const *B = static_cast<DT *>(m->attn_heads);
-    // matrix B: output
-    // matrix B's layout: [oProjSize, num_new_tokens]
-    DT *C = static_cast<DT *>(output_ptr);
-
-    checkCUDA(cublasGemmEx(m->handle.blas,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           m_,
-                           n,
-                           k,
-                           &alpha,
-                           A,
-                           cublas_data_type,
-                           lda,
-                           B,
-                           cublas_data_type,
-                           ldb,
-                           &beta,
-                           C,
-                           cublas_data_type,
-                           ldc,
-                           compute_type,
-                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-  }
-  // Add final output bias
-  if (*m->final_bias && shard_id == 0) {
-    int parallelism = m->oProjSize * num_tokens;
-    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
-                          m->kProjSize * m->global_num_q_heads +
-                          m->vProjSize * m->global_num_q_heads;
-    apply_proj_bias_w<<<GET_BLOCKS(parallelism),
-                        min(CUDA_NUM_THREADS, parallelism),
-                        0,
-                        stream>>>(
-        output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize);
-  }
-}
-
-template <typename DT>
-void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
-                             GenericTensorAccessorR const weight,
-                             DataType data_type,
-                             cudaStream_t stream) {
-  // additional processing for weight uploading
-  // Note that we update weight_ptr and bias_ptr when uploading weight and
-  // bias
-  if (m->quantization_type != DT_NONE) {
-    // copy weight_ptr to quantized_weight_ptr, do compression and store in
-    // m->weight_ptr
-    cudaMemcpyAsync(m->quantized_weight_ptr,
-                    weight.get_byte_ptr(),
-                    m->quantized_weightSize,
-                    cudaMemcpyHostToDevice,
-                    stream);
-
-    if (m->quantization_type == DT_INT4) {
-      int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2;
-      decompress_int4_attention_weights<<<GET_BLOCKS(parallelism),
-                                          min(CUDA_NUM_THREADS, parallelism),
-                                          0,
-                                          stream>>>(
-          m->quantized_weight_ptr,
-          static_cast<DT *>(m->weight_ptr),
-          m->qProjSize,
-          m->qSize,
-          m->num_q_heads);
-    } else {
-      assert(m->quantization_type == DT_INT8);
-      int parallelism = m->qProjSize * m->qSize * m->num_q_heads;
-      decompress_int8_attention_weights<<<GET_BLOCKS(parallelism),
-                                          min(CUDA_NUM_THREADS, parallelism),
-                                          0,
-                                          stream>>>(
-          m->quantized_weight_ptr,
-          static_cast<DT *>(m->weight_ptr),
-          m->qProjSize,
-          m->qSize,
-          m->num_q_heads);
-    }
-  } else {
-    if (data_type == DT_FLOAT) {
-      cudaMemcpyAsync(m->weight_ptr,
-                      weight.get_float_ptr(),
-                      m->weightSize,
-                      cudaMemcpyHostToDevice,
-                      stream);
-    } else if (data_type == DT_HALF) {
-      cudaMemcpyAsync(m->weight_ptr,
-                      weight.get_half_ptr(),
-                      m->weightSize,
-                      cudaMemcpyHostToDevice,
-                      stream);
-    } else {
-      assert(false);
-    }
-  }
-}
-
 template <typename DT>
 void incr_attention(IncMultiHeadSelfAttentionMeta *m,
                     BatchConfig const *bc,
@@ -881,51 +402,6 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
 
 using namespace Kernels::IncMultiHeadAttention;
 
-template <typename DT>
-__global__ void store_kv_cache(DT const *devQKVProjArray,
-                               DT *kCache_ptr,
-                               DT *vCache_ptr,
-                               BatchConfig::PerTokenInfo const *tokenInfos,
-                               int num_tokens,
-                               int max_seq_len,
-                               int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
-    int token_idx = i / hidden_size;
-    int offset = i % hidden_size;
-
-    size_t val_idx =
-        token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset;
-
-    DT kVal = devQKVProjArray[val_idx];
-    DT vVal = devQKVProjArray[val_idx + hidden_size];
-    int const req_id = tokenInfos[token_idx].request_index;
-    int const tok_id = tokenInfos[token_idx].abs_index_in_request;
-
-    // key cache
-    kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = kVal;
-    vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size +
-               offset] = vVal;
-  }
-}
-
-template <typename DT>
-__global__ void fill_entries_above_diagonal(DT *matrix,
-                                            size_t num_rows,
-                                            size_t num_cols,
-                                            size_t num_q_heads,
-                                            size_t entries_above_diagonal,
-                                            DT value) {
-  CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) {
-    size_t head_idx = i / entries_above_diagonal;
-    size_t entry_idx = i % entries_above_diagonal;
-    size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2;
-    size_t x = entry_idx - y * (y + 1) / 2;
-    y += (num_cols - num_rows) + 1;
-    matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value;
-  }
-}
-
 /*static*/
 void IncMultiHeadSelfAttention::inference_kernel_wrapper(
     IncMultiHeadSelfAttentionMeta *m,
@@ -1291,35 +767,4 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {
   }
 }
 
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<float>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    GenericTensorAccessorR const weight,
-    DataType data_type,
-    cudaStream_t stream);
-
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<half>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    GenericTensorAccessorR const weight,
-    DataType data_type,
-    cudaStream_t stream);
-
-template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<float>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    BatchConfig const *bc,
-    int shard_id,
-    float *output_ptr,
-    float const *weight_ptr,
-    float const *bias_ptr,
-    int num_tokens,
-    cudaStream_t stream);
-
-template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<half>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    BatchConfig const *bc,
-    int shard_id,
-    half *output_ptr,
-    half const *weight_ptr,
-    half const *bias_ptr,
-    int num_tokens,
-    cudaStream_t stream);
 }; // namespace FlexFlow
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
new file mode 100644
index 000000000..0d9d291d0
--- /dev/null
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -0,0 +1,551 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "cuComplex.h"
+#endif
+#include "flexflow/ffconst_utils.h"
+#include "flexflow/ops/inc_multihead_self_attention.h"
+#include "flexflow/ops/kernels/decompress_kernels.h"
+#include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
+#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
+#include "flexflow/utils/cuda_helper.h"
+
+namespace FlexFlow {
+
+// declare Legion names
+using Legion::coord_t;
+using Legion::Memory;
+
+#define WARP_SIZE 32
+
+namespace Kernels {
+namespace IncMultiHeadAttention {
+
+// only used by MPT model. https://arxiv.org/abs/2108.12409
+template <typename DT>
+__global__ void apply_position_bias_qkprd(DT *input_ptr,
+                                          int num_tokens,
+                                          int num_total_tokens,
+                                          int num_heads,
+                                          int global_num_q_heads,
+                                          int shard_id) {
+  CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) {
+    // get head_idx,
+    int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id);
+    int position_idx = (i / num_tokens) % num_total_tokens;
+    position_idx = position_idx + 1 - num_total_tokens;
+    // 8 is alibi_bias_max in
+    // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json
+    float base = (float)(head_idx + 1) * 8 / global_num_q_heads;
+    float slopes = 1.0 / pow(2, base);
+    // if(i == 0){
+    //   printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes,
+    //   position_idx * slopes);
+    // }
+    input_ptr[i] += static_cast<DT>(position_idx * slopes);
+  }
+}
+
+template <typename DT>
+__global__ void apply_proj_bias_w(DT *input_ptr,
+                                  DT const *bias_ptr,
+                                  int num_tokens,
+                                  int qkv_weight_size,
+                                  int oProjSize) {
+  CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) {
+    int bias_idx = qkv_weight_size + i % oProjSize;
+    input_ptr[i] += bias_ptr[bias_idx];
+  }
+}
+
+template <typename DT>
+__global__ void apply_proj_bias_qkv(DT *input_ptr,
+                                    DT const *bias_ptr,
+                                    int shard_id,
+                                    int num_tokens,
+                                    int qProjSize,
+                                    int kProjSize,
+                                    int vProjSize,
+                                    int global_num_q_heads,
+                                    int num_q_heads,
+                                    bool scaling_query,
+                                    float scaling_factor,
+                                    int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) {
+    // for simplicity, assume q, k, v is in same shape
+    // 0->q, 1->k, 2->v
+    // int qkv_index = i / (num_tokens * qProjSize) % 3;
+
+    int token_idx = i / (hidden_size * QKV_WEIGHT_NUM);
+    size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM;
+
+    int qkv_index = in_token_idx / hidden_size;
+
+    int proj_size = qkv_index == 0 ? qProjSize : kProjSize;
+
+    int head_idx =
+        (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size;
+    int global_head_idx = head_idx + shard_id * num_q_heads;
+
+    size_t pre_length =
+        qkv_index == 0
+            ? 0
+            : (qkv_index == 1 ? qProjSize * global_num_q_heads
+                              : qProjSize * global_num_q_heads * KV_WEIGHT_NUM);
+
+    size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size;
+
+    input_ptr[i] += bias_ptr[bias_idx];
+
+    if (scaling_query && qkv_index == 0) {
+      input_ptr[i] *= scaling_factor;
+    }
+  }
+}
+
+template <typename DT>
+__global__ void scaling_query_kernel(DT *input_ptr,
+                                     int qProjSize,
+                                     int num_tokens,
+                                     int num_q_heads,
+                                     float scaling_factor,
+                                     int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
+    int token_idx = i / hidden_size;
+    input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *=
+        scaling_factor;
+  }
+}
+
+template <typename DT>
+__global__ void
+    apply_rotary_embedding_native(DT *input_ptr,
+                                  cuFloatComplex *complex_input,
+                                  BatchConfig::PerTokenInfo const *tokenInfos,
+                                  int qProjSize,
+                                  int kProjSize,
+                                  int num_q_heads,
+                                  int num_tokens,
+                                  int num_kv_heads,
+                                  int q_block_size,
+                                  int k_block_size,
+                                  int q_array_size) {
+  CUDA_KERNEL_LOOP(
+      i,
+      num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
+    // create complex number
+    bool q_tensor = i < (q_array_size / 2);
+    int proj_size = q_tensor ? qProjSize : kProjSize;
+    int real_i = q_tensor ? i : i - q_array_size / 2;
+
+    int head_idx = real_i / (num_tokens * proj_size / 2);
+    int idx = real_i % (num_tokens * proj_size / 2);
+    int real_part_index = idx * 2 +
+                          head_idx * (q_tensor ? q_block_size : k_block_size) +
+                          (q_tensor ? 0 : q_array_size);
+
+    int complex_part_index = real_part_index + 1;
+
+    complex_input[i] = {input_ptr[real_part_index],
+                        input_ptr[complex_part_index]};
+
+    int token_idx =
+        (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+
+    // float before_real = complex_input[i].x, before_complex =
+    // complex_input[i].y;
+
+    int pos_i = real_i % (proj_size / 2);
+    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
+    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
+
+    complex_input[i] = cuCmulf(complex_input[i], complex_pos);
+    input_ptr[real_part_index] = complex_input[i].x;
+    input_ptr[complex_part_index] = complex_input[i].y;
+  }
+}
+
+template <typename DT>
+__global__ void
+    apply_rotary_embedding_hf(DT *input_ptr,
+                              cuFloatComplex *complex_input,
+                              BatchConfig::PerTokenInfo const *tokenInfos,
+                              int qProjSize,
+                              int kProjSize,
+                              int num_tokens,
+                              size_t q_array_size,
+                              int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
+    // create complex number
+    bool q_tensor = i < (q_array_size / 2);
+    int proj_size = q_tensor ? qProjSize : kProjSize;
+    int real_i = q_tensor ? i : i - q_array_size / 2;
+
+    int token_idx = real_i / (hidden_size / 2);
+    int idx = real_i % (proj_size / 2);
+    int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2);
+
+    int real_part_index = idx + head_idx * proj_size +
+                          token_idx * hidden_size * QKV_WEIGHT_NUM +
+                          hidden_size * (q_tensor ? 0 : 1);
+    int complex_part_index = real_part_index + (proj_size / 2);
+
+    // complex_input[i] = {input_ptr[real_part_index],
+    //                     input_ptr[complex_part_index]};
+    cuFloatComplex cii = {input_ptr[real_part_index],
+                          input_ptr[complex_part_index]};
+
+    // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64
+    // apply a Cartesian coordinate transformation
+    // multiple with input & /copy back to q/k
+
+    // get position of token
+
+    // size_t pos = id_map[token_idx].token_position;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+
+    // float before_real = complex_input[i].x, before_complex =
+    int pos_i = real_i % (proj_size / 2);
+    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
+    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
+
+    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
+    // input_ptr[real_part_index] = complex_input[i].x;
+    // input_ptr[complex_part_index] = complex_input[i].y;
+
+    cii = cuCmulf(cii, complex_pos);
+    input_ptr[real_part_index] = cii.x;
+    input_ptr[complex_part_index] = cii.y;
+  }
+}
+
+template <typename DT>
+void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
+                        BatchConfig const *bc,
+                        int shard_id,
+                        DT const *input_ptr,
+                        DT const *weight_ptr,
+                        DT *output_ptr,
+                        DT const *bias_ptr,
+                        cudaStream_t stream) {
+
+  checkCUDA(cublasSetStream(m->handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  assert(m->qSize == m->vSize && m->qSize == m->kSize);
+  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
+  cudaDataType_t compute_type = cublas_data_type;
+#else
+  // For best performance, set the default cublas compute type to
+  // CUBLAS_COMPUTE_16F for half precision and to
+  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+  if (m->output_type[0] == DT_FLOAT) {
+    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
+  }
+#endif
+
+  //   int device;
+  //   checkCUDA(cudaGetDevice(&device));
+  //   cudaEvent_t t_start, t_end;
+  //   checkCUDA(cudaEventCreate(&t_start));
+  //   checkCUDA(cudaEventCreate(&t_end));
+  //   checkCUDA(cudaEventRecord(t_start, stream));
+
+  // Step 1: Compute QKV projections
+  {
+    DT alpha = 1.0f, beta = 0.0f;
+    // after transpositions
+    int m_q = m->qProjSize * m->num_q_heads;
+    int m_k = m->kProjSize * m->num_q_heads;
+    int m_v = m->vProjSize * m->num_q_heads;
+    assert(m_q == m_k && m_k == m_v); // keep things simple for now
+    int n = bc->num_active_tokens();
+    int k = m->qSize;
+    int m_ = m_q * QKV_WEIGHT_NUM;
+    // before transpositions
+    int lda = k, ldb = k, ldc = m_;
+    // matrix A: QKV weights
+    // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3]
+    // matrix B: input
+    // matrix B's layout: [qSize (hidden_dim), num_new_tokens]
+    // matrix C: devQKVProjArray
+    // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens]
+    checkCUDA(cublasGemmEx(m->handle.blas,
+                           CUBLAS_OP_T,
+                           CUBLAS_OP_N,
+                           m_,
+                           n,
+                           k,
+                           &alpha,
+                           weight_ptr,
+                           cublas_data_type,
+                           lda,
+                           input_ptr,
+                           cublas_data_type,
+                           ldb,
+                           &beta,
+                           output_ptr,
+                           cublas_data_type,
+                           ldc,
+                           compute_type,
+                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+
+  //   checkCUDA(cudaEventRecord(t_end, stream));
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   float elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
+  //     std::cout << "GEMM time: " << elapsed << " ms\n";
+  //   }
+
+  int num_tokens = bc->num_active_tokens();
+  int parallelism = m->kProjSize * num_tokens * m->num_q_heads;
+  size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads;
+
+  // Step 2: apply bias for QKV, or scale the query
+  if (*m->qkv_bias) {
+    apply_proj_bias_qkv<<<GET_BLOCKS(parallelism),
+                          min(CUDA_NUM_THREADS, parallelism),
+                          0,
+                          stream>>>(output_ptr,
+                                    bias_ptr,
+                                    shard_id,
+                                    num_tokens,
+                                    m->qProjSize,
+                                    m->kProjSize,
+                                    m->vProjSize,
+                                    m->global_num_q_heads,
+                                    m->num_q_heads,
+                                    *m->scaling_query,
+                                    m->scaling_factor,
+                                    m->hidden_size);
+  } else if (m->scaling_query) {
+    scaling_query_kernel<<<GET_BLOCKS(parallelism),
+                           min(CUDA_NUM_THREADS, parallelism),
+                           0,
+                           stream>>>(output_ptr,
+                                     num_tokens,
+                                     m->num_q_heads,
+                                     m->qProjSize,
+                                     m->scaling_factor,
+                                     m->hidden_size);
+  }
+
+  //   checkCUDA(cudaEventCreate(&t_start));
+  //   checkCUDA(cudaEventCreate(&t_end));
+  //   checkCUDA(cudaEventRecord(t_start, stream));
+
+  // Step 3: apply rotary embedding if needed
+  if (*m->apply_rotary_embedding) {
+    /*q&k*/
+    parallelism = num_tokens * m->hidden_size;
+    apply_rotary_embedding_hf<<<GET_BLOCKS(parallelism),
+                                min(CUDA_NUM_THREADS, parallelism),
+                                0,
+                                stream>>>(output_ptr,
+                                          m->complex_input,
+                                          m->token_infos,
+                                          m->qProjSize,
+                                          m->kProjSize,
+                                          num_tokens,
+                                          q_array_size,
+                                          m->hidden_size);
+  }
+  //   checkCUDA(cudaEventRecord(t_end, stream));
+  //   checkCUDA(cudaEventSynchronize(t_end));
+  //   elapsed = 0;
+  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+  //   cudaEventDestroy(t_start);
+  //   cudaEventDestroy(t_end);
+  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
+  //     std::cout << "Rotary time: " << elapsed << " ms\n";
+  //   }
+}
+
+template <typename DT>
+void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
+                         BatchConfig const *bc,
+                         int shard_id,
+                         DT *output_ptr,
+                         DT const *weight_ptr,
+                         DT const *bias_ptr,
+                         int num_tokens,
+                         cudaStream_t stream) {
+  cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
+  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+  assert(data_type_size(m->output_type[0]) == sizeof(DT));
+#if CUDA_VERSION >= 11000
+  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
+  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
+#else
+  cudaDataType_t compute_type = cublas_data_type;
+#endif
+  // Project to output, save result directly on output tensor
+  {
+    DT alpha = 1.0f, beta = 0.0f;
+    // after transpositions
+    int m_ = m->oProjSize;
+    int k = m->vProjSize * m->num_q_heads;
+    int n = num_tokens;
+    // before transpositions
+    int lda = k, ldb = k, ldc = m_;
+    // matrix A: output projection weight
+    // matrix A's layout: [vProjSize * num_heads, oProjSize]
+    DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
+                                           m->kProjSize * m->num_q_heads +
+                                           m->vProjSize * m->num_q_heads);
+    // matrix B: attn heads
+    // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
+    DT const *B = static_cast<DT *>(m->attn_heads);
+    // matrix B: output
+    // matrix B's layout: [oProjSize, num_new_tokens]
+    DT *C = static_cast<DT *>(output_ptr);
+
+    checkCUDA(cublasGemmEx(m->handle.blas,
+                           CUBLAS_OP_T,
+                           CUBLAS_OP_N,
+                           m_,
+                           n,
+                           k,
+                           &alpha,
+                           A,
+                           cublas_data_type,
+                           lda,
+                           B,
+                           cublas_data_type,
+                           ldb,
+                           &beta,
+                           C,
+                           cublas_data_type,
+                           ldc,
+                           compute_type,
+                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+  // Add final output bias
+  if (*m->final_bias && shard_id == 0) {
+    int parallelism = m->oProjSize * num_tokens;
+    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
+                          m->kProjSize * m->global_num_q_heads +
+                          m->vProjSize * m->global_num_q_heads;
+    apply_proj_bias_w<<<GET_BLOCKS(parallelism),
+                        min(CUDA_NUM_THREADS, parallelism),
+                        0,
+                        stream>>>(
+        output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize);
+  }
+}
+
+template <typename DT>
+void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
+                             GenericTensorAccessorR const weight,
+                             DataType data_type,
+                             cudaStream_t stream) {
+  // additional processing for weight uploading
+  // Note that we update weight_ptr and bias_ptr when uploading weight and
+  // bias
+  if (m->quantization_type != DT_NONE) {
+    // copy weight_ptr to quantized_weight_ptr, do compression and store in
+    // m->weight_ptr
+    cudaMemcpyAsync(m->quantized_weight_ptr,
+                    weight.get_byte_ptr(),
+                    m->quantized_weightSize,
+                    cudaMemcpyHostToDevice,
+                    stream);
+
+    if (m->quantization_type == DT_INT4) {
+      int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2;
+      decompress_int4_attention_weights<<<GET_BLOCKS(parallelism),
+                                          min(CUDA_NUM_THREADS, parallelism),
+                                          0,
+                                          stream>>>(
+          m->quantized_weight_ptr,
+          static_cast<DT *>(m->weight_ptr),
+          m->qProjSize,
+          m->qSize,
+          m->num_q_heads);
+    } else {
+      assert(m->quantization_type == DT_INT8);
+      int parallelism = m->qProjSize * m->qSize * m->num_q_heads;
+      decompress_int8_attention_weights<<<GET_BLOCKS(parallelism),
+                                          min(CUDA_NUM_THREADS, parallelism),
+                                          0,
+                                          stream>>>(
+          m->quantized_weight_ptr,
+          static_cast<DT *>(m->weight_ptr),
+          m->qProjSize,
+          m->qSize,
+          m->num_q_heads);
+    }
+  } else {
+    if (data_type == DT_FLOAT) {
+      cudaMemcpyAsync(m->weight_ptr,
+                      weight.get_float_ptr(),
+                      m->weightSize,
+                      cudaMemcpyHostToDevice,
+                      stream);
+    } else if (data_type == DT_HALF) {
+      cudaMemcpyAsync(m->weight_ptr,
+                      weight.get_half_ptr(),
+                      m->weightSize,
+                      cudaMemcpyHostToDevice,
+                      stream);
+    } else {
+      assert(false);
+    }
+  }
+}
+
+} // namespace IncMultiHeadAttention
+} // namespace Kernels
+
+using namespace Kernels::IncMultiHeadAttention;
+
+template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    GenericTensorAccessorR const weight,
+    DataType data_type,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    GenericTensorAccessorR const weight,
+    DataType data_type,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    int shard_id,
+    float *output_ptr,
+    float const *weight_ptr,
+    float const *bias_ptr,
+    int num_tokens,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    int shard_id,
+    half *output_ptr,
+    half const *weight_ptr,
+    half const *bias_ptr,
+    int num_tokens,
+    cudaStream_t stream);
+}; // namespace FlexFlow

From ae961350f9c61aa819cf22a6e68f06ac604430a5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 16:57:41 -0700
Subject: [PATCH 392/667] fix: template instantiate

---
 .../inc_multihead_self_attention_kernels.cu   | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 0d9d291d0..747ce5195 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -517,6 +517,26 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
 
 using namespace Kernels::IncMultiHeadAttention;
 
+template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    int shard_id,
+    float const *input_ptr,
+    float const *weight_ptr,
+    float *output_ptr,
+    float const *bias_ptr,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    int shard_id,
+    half const *input_ptr,
+    half const *weight_ptr,
+    half *output_ptr,
+    half const *bias_ptr,
+    cudaStream_t stream);
+
 template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     GenericTensorAccessorR const weight,

From 98641fb574e8f879b331a0197ef48fe48b1daee7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 17:34:05 -0700
Subject: [PATCH 393/667] chore: more clean up

---
 .../inc_multihead_self_attention_kernels.h    |  37 +++--
 src/ops/inc_multihead_self_attention.cpp      |  14 +-
 src/ops/inc_multihead_self_attention.cu       | 100 +------------
 .../inc_multihead_self_attention_kernels.cu   | 140 ++++++++++++++++--
 src/ops/spec_inc_multihead_self_attention.cpp |   2 +-
 src/ops/spec_inc_multihead_self_attention.cu  |  96 +-----------
 src/ops/tree_inc_multihead_self_attention.cpp |   6 +-
 src/ops/tree_inc_multihead_self_attention.cu  |  82 +---------
 8 files changed, 175 insertions(+), 302 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 6bc52194b..d58cde507 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,17 +15,13 @@ namespace Kernels {
 namespace IncMultiHeadAttention {
 
 template <typename DT>
-void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         int shard_id,
-                         DT *output_ptr,
-                         DT const *weight_ptr,
-                         DT const *bias_ptr,
-                         int num_tokens,
-                         ffStream_t stream);
+void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
+                             GenericTensorAccessorR const weight,
+                             DataType data_type,
+                             ffStream_t stream);
 
 template <typename DT>
-void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         int shard_id,
                         DT const *input_ptr,
@@ -35,10 +31,25 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
                         ffStream_t stream);
 
 template <typename DT>
-void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
-                             GenericTensorAccessorR const weight,
-                             DataType data_type,
-                             ffStream_t stream);
+void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
+                      BatchConfig const *bc,
+                      cudaStream_t stream);
+
+template <typename DT>
+void produce_output(IncMultiHeadSelfAttentionMeta const *m,
+                    BatchConfig const *bc,
+                    DT *output_ptr,
+                    cudaStream_t stream);
+
+template <typename DT>
+void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
+                         BatchConfig const *bc,
+                         int shard_id,
+                         DT *output_ptr,
+                         DT const *weight_ptr,
+                         DT const *bias_ptr,
+                         int num_tokens,
+                         ffStream_t stream);
 } // namespace IncMultiHeadAttention
 } // namespace Kernels
 } // namespace FlexFlow
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index d020cc104..3fba4b562 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -244,7 +244,7 @@ __global__ void store_kv_cache(DT const *devQKVProjArray,
 }
 
 template <typename DT>
-void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         int shard_id,
                         DT const *input_ptr,
@@ -376,7 +376,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                              GenericTensorAccessorR const weight,
                              DataType data_type,
                              hipStream_t stream) {
@@ -458,7 +458,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                            hipMemcpyHostToDevice,
                            stream));
   // phase 1: Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
@@ -774,7 +774,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
 
   if (input.data_type == DT_HALF) {
     if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
+      pre_build_weight<half>(m, weight, input.data_type, stream);
     }
     half const *bias_ptr =
         use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
@@ -789,7 +789,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
         stream);
   } else if (input.data_type == DT_FLOAT) {
     if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+      pre_build_weight<float>(m, weight, input.data_type, stream);
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
@@ -1087,13 +1087,13 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) {
   }
 }
 
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<float>(
+template void Kernels::IncMultiHeadAttention::pre_build_weight<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     GenericTensorAccessorR const weight,
     DataType data_type,
     hipStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<half>(
+template void Kernels::IncMultiHeadAttention::pre_build_weight<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     GenericTensorAccessorR const weight,
     DataType data_type,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index c6477b301..86030a31f 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -46,92 +46,6 @@ using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
-__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) *
-         hidden_size;
-}
-
-__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) *
-         hidden_size;
-}
-
-template <typename DT>
-__global__ void
-    update_qkv_cache_kernel(DT *devQKVProjArray,
-                            half *qTmp_ptr,
-                            half *kCache_ptr,
-                            BatchConfig::PerTokenInfo const *tokenInfos,
-                            BatchConfig::PerRequestInfo *request_infos,
-                            int const max_num_pages,
-                            int hidden_size,
-                            int num_new_tokens) {
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = get_k_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size);
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_k_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_v_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
-}
-
-template <typename DT>
-void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
-                      BatchConfig const *bc,
-                      cudaStream_t stream) {
-  // update the kv cache, compact the q array
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  int const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
-  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                      static_cast<half *>(m->queryTmp),
-                                      static_cast<half *>(m->keyCache),
-                                      m->token_infos,
-                                      m->request_infos,
-                                      max_num_pages,
-                                      m->hidden_size,
-                                      num_new_tokens);
-}
-
-template <typename DT>
-__global__ void produce_output_kernel(half const *input_ptr,
-                                      DT *output_ptr,
-                                      int parallelism) {
-  CUDA_KERNEL_LOOP(idx, parallelism) {
-    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
-  }
-}
-
 template <typename DT>
 void incr_attention(IncMultiHeadSelfAttentionMeta *m,
                     BatchConfig const *bc,
@@ -297,13 +211,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  {
-    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
-    produce_output_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(m->outputTmp, output_ptr, parallelism);
-  }
+  produce_output(m, bc, output_ptr, stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -338,7 +246,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
   }
 
   // phase 1: Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
@@ -428,7 +336,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
 
   if (input.data_type == DT_HALF) {
     if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
+      pre_build_weight<half>(m, weight, input.data_type, stream);
     }
     half const *bias_ptr =
         use_bias ? bias.get_half_ptr() : static_cast<half const *>(nullptr);
@@ -443,7 +351,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper(
         stream);
   } else if (input.data_type == DT_FLOAT) {
     if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+      pre_build_weight<float>(m, weight, input.data_type, stream);
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 747ce5195..1be0359d3 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -233,7 +233,7 @@ __global__ void
 }
 
 template <typename DT>
-void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         int shard_id,
                         DT const *input_ptr,
@@ -379,6 +379,104 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m,
   //   }
 }
 
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) *
+         hidden_size;
+}
+
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) *
+         hidden_size;
+}
+
+template <typename DT>
+__global__ void
+    update_qkv_cache_kernel(DT *devQKVProjArray,
+                            half *qTmp_ptr,
+                            half *kCache_ptr,
+                            BatchConfig::PerTokenInfo const *tokenInfos,
+                            BatchConfig::PerRequestInfo *request_infos,
+                            int const max_num_pages,
+                            int hidden_size,
+                            int num_new_tokens) {
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / hidden_size;
+  int const offset = thread_idx % hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
+  size_t to_k_idx = get_k_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size),
+         to_v_idx = get_v_entry_offset(
+             req_idx, token_abs_idx, max_num_pages, hidden_size);
+
+  // key and value cache should be stored interleaved
+  kCache_ptr[to_k_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
+  kCache_ptr[to_v_idx + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
+  qTmp_ptr[token_idx * hidden_size + offset] =
+      static_cast<half>(devQKVProjArray[from_idx + offset]);
+}
+
+template <typename DT>
+void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
+                      BatchConfig const *bc,
+                      cudaStream_t stream) {
+  // update the kv cache, compact the q array
+  int num_new_tokens = bc->num_active_tokens();
+  int parallelism = m->hidden_size * num_new_tokens;
+  int const max_num_pages =
+      (BatchConfig::max_sequence_length() +
+       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
+      kPagesize;
+  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+                            min(CUDA_NUM_THREADS, parallelism),
+                            0,
+                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                      static_cast<half *>(m->queryTmp),
+                                      static_cast<half *>(m->keyCache),
+                                      m->token_infos,
+                                      m->request_infos,
+                                      max_num_pages,
+                                      m->hidden_size,
+                                      num_new_tokens);
+}
+
+template <typename DT>
+__global__ void produce_output_kernel(half const *input_ptr,
+                                      DT *output_ptr,
+                                      int parallelism) {
+  CUDA_KERNEL_LOOP(idx, parallelism) {
+    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
+  }
+}
+
+template <typename DT>
+void produce_output(IncMultiHeadSelfAttentionMeta const *m,
+                    BatchConfig const *bc,
+                    DT *output_ptr,
+                    cudaStream_t stream) {
+  int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+  produce_output_kernel<<<GET_BLOCKS(parallelism),
+                          min(CUDA_NUM_THREADS, parallelism),
+                          0,
+                          stream>>>(m->outputTmp, output_ptr, parallelism);
+}
+
 template <typename DT>
 void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
@@ -453,7 +551,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
+void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                              GenericTensorAccessorR const weight,
                              DataType data_type,
                              cudaStream_t stream) {
@@ -517,7 +615,19 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m,
 
 using namespace Kernels::IncMultiHeadAttention;
 
-template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<float>(
+template void Kernels::IncMultiHeadAttention::pre_build_weight<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    GenericTensorAccessorR const weight,
+    DataType data_type,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::pre_build_weight<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    GenericTensorAccessorR const weight,
+    DataType data_type,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::compute_qkv<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     int shard_id,
@@ -527,7 +637,7 @@ template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<float>(
     float const *bias_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<half>(
+template void Kernels::IncMultiHeadAttention::compute_qkv<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     int shard_id,
@@ -537,16 +647,26 @@ template void Kernels::IncMultiHeadAttention::compute_qkv_kernel<half>(
     half const *bias_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<float>(
+template void Kernels::IncMultiHeadAttention::update_qkv_cache<float>(
     IncMultiHeadSelfAttentionMeta const *m,
-    GenericTensorAccessorR const weight,
-    DataType data_type,
+    BatchConfig const *bc,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel<half>(
+template void Kernels::IncMultiHeadAttention::update_qkv_cache<half>(
     IncMultiHeadSelfAttentionMeta const *m,
-    GenericTensorAccessorR const weight,
-    DataType data_type,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::produce_output<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    float *output_ptr,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::produce_output<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    half *output_ptr,
     cudaStream_t stream);
 
 template void Kernels::IncMultiHeadAttention::compute_o_prod_bias<float>(
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 76991bd52..7cb4837e2 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -495,7 +495,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
       hipMemcpyHostToDevice,
       stream));
   // phase 1: Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index d73bb868d..e9df735da 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -44,92 +44,6 @@ using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
-__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) *
-         hidden_size;
-}
-
-__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) *
-         hidden_size;
-}
-
-template <typename DT>
-__global__ void
-    update_qkv_cache_kernel(DT *devQKVProjArray,
-                            half *qTmp_ptr,
-                            half *kCache_ptr,
-                            BatchConfig::PerTokenInfo const *tokenInfos,
-                            BatchConfig::PerRequestInfo *request_infos,
-                            int const max_num_pages,
-                            int hidden_size,
-                            int num_new_tokens) {
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = get_k_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size);
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_k_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_v_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
-}
-
-template <typename DT>
-void update_qkv_cache(SpecIncMultiHeadSelfAttentionMeta const *m,
-                      BatchConfig const *bc,
-                      cudaStream_t stream) {
-  // update the kv cache, compact the q array
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  int const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
-  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                      static_cast<half *>(m->queryTmp),
-                                      static_cast<half *>(m->keyCache),
-                                      m->token_infos,
-                                      m->request_infos,
-                                      max_num_pages,
-                                      m->hidden_size,
-                                      num_new_tokens);
-}
-
-template <typename DT>
-__global__ void produce_output_kernel(half const *input_ptr,
-                                      DT *output_ptr,
-                                      int parallelism) {
-  CUDA_KERNEL_LOOP(idx, parallelism) {
-    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
-  }
-}
-
 template <typename DT>
 void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
                            BatchConfig const *bc,
@@ -301,13 +215,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  {
-    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
-    produce_output_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(m->outputTmp, output_ptr, parallelism);
-  }
+  produce_output(m, bc, output_ptr, stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -332,7 +240,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
   // phase 1: Implement kernel to compute KQV for input tokens
 
   // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index b2002453b..3fb731346 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -490,7 +490,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                            hipMemcpyHostToDevice,
                            stream));
   // phase 1: Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
@@ -540,7 +540,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
 
   if (input.data_type == DT_HALF) {
     if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
+      pre_build_weight<half>(m, weight, input.data_type, stream);
     }
 
     half const *bias_ptr =
@@ -556,7 +556,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
         stream);
   } else if (input.data_type == DT_FLOAT) {
     if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+      pre_build_weight<float>(m, weight, input.data_type, stream);
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 7b2db3b2a..b152934ce 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -145,74 +145,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventDestroy(t_end);
 }
 
-template <typename DT>
-__global__ void
-    update_qkv_cache_kernel(DT *devQKVProjArray,
-                            half *qTmp_ptr,
-                            half *kCache_ptr,
-                            BatchConfig::PerTokenInfo const *tokenInfos,
-                            BatchConfig::PerRequestInfo *request_infos,
-                            int const max_num_pages,
-                            int hidden_size,
-                            int num_new_tokens) {
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = get_k_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size);
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_k_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_v_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
-}
-
-template <typename DT>
-void update_qkv_cache(TreeIncMultiHeadSelfAttentionMeta const *m,
-                      BatchConfig const *bc,
-                      cudaStream_t stream) {
-  // update the kv cache, compact the q array
-  int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
-  int const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
-  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                      static_cast<half *>(m->queryTmp),
-                                      static_cast<half *>(m->keyCache),
-                                      m->token_infos,
-                                      m->request_infos,
-                                      max_num_pages,
-                                      m->hidden_size,
-                                      num_new_tokens);
-}
-
-template <typename DT>
-__global__ void produce_output_kernel(half const *input_ptr,
-                                      DT *output_ptr,
-                                      int parallelism) {
-  CUDA_KERNEL_LOOP(idx, parallelism) {
-    output_ptr[idx] = static_cast<DT>(input_ptr[idx]);
-  }
-}
-
 template <typename DT>
 void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
                            BatchConfig const *bc,
@@ -384,13 +316,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventCreate(&t_end);
   //   cudaEventRecord(t_start, stream);
 
-  {
-    int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
-    produce_output_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(m->outputTmp, output_ptr, parallelism);
-  }
+  produce_output(m, bc, output_ptr, stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -473,7 +399,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
   // Implement kernel to compute KQV for input tokens
-  compute_qkv_kernel(m,
+  compute_qkv(m,
                      bc,
                      shard_id,
                      input_ptr,
@@ -618,7 +544,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
 
   if (input.data_type == DT_HALF) {
     if (m->offload) {
-      pre_build_weight_kernel<half>(m, weight, input.data_type, stream);
+      pre_build_weight<half>(m, weight, input.data_type, stream);
     }
 
     half const *bias_ptr =
@@ -634,7 +560,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
         stream);
   } else if (input.data_type == DT_FLOAT) {
     if (m->offload) {
-      pre_build_weight_kernel<float>(m, weight, input.data_type, stream);
+      pre_build_weight<float>(m, weight, input.data_type, stream);
     }
     float const *bias_ptr =
         use_bias ? bias.get_float_ptr() : static_cast<float const *>(nullptr);

From 8e9e35bef6ed21d9ee4bb6176650173ee2af7fbf Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 19:03:52 -0700
Subject: [PATCH 394/667] chore: minor

---
 .../flexflow/ops/inc_multihead_self_attention.h    |  2 +-
 src/ops/inc_multihead_self_attention.cpp           | 14 +++++++-------
 src/ops/inc_multihead_self_attention.cu            |  8 ++++----
 .../inc_multihead_self_attention_kernels.cu        | 12 ++++++------
 src/ops/spec_inc_multihead_self_attention.cpp      |  4 ++--
 src/ops/spec_inc_multihead_self_attention.cu       |  6 +++---
 src/ops/tree_inc_multihead_self_attention.cpp      |  8 ++++----
 src/ops/tree_inc_multihead_self_attention.cu       | 10 +++++-----
 8 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 2db4e2e96..fa1912fee 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -177,7 +177,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
       quantized_weightSize;
   int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
   int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
-      hidden_size;
+      local_hidden_size;
   bool *has_load_weights;
   bool *apply_rotary_embedding;
   bool *qkv_bias;
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 3fba4b562..1d04953e1 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -320,7 +320,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                        m->num_q_heads,
                        *m->scaling_query,
                        m->scaling_factor,
-                       m->hidden_size);
+                       m->local_hidden_size);
   } else if (m->scaling_query) {
     hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>),
                        GET_BLOCKS(parallelism),
@@ -332,11 +332,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                        m->num_q_heads,
                        m->qProjSize,
                        m->scaling_factor,
-                       m->hidden_size);
+                       m->local_hidden_size);
   }
   if (*m->apply_rotary_embedding) {
     /*q&k*/
-    parallelism = num_tokens * m->hidden_size;
+    parallelism = num_tokens * m->local_hidden_size;
     hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf<DT>),
                        GET_BLOCKS(parallelism),
                        min(CUDA_NUM_THREADS, parallelism),
@@ -349,7 +349,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                        m->kProjSize,
                        num_tokens,
                        q_array_size,
-                       m->hidden_size);
+                       m->local_hidden_size);
   }
 }
 
@@ -359,7 +359,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m,
                             hipStream_t stream) {
   int num_tokens = bc->num_active_tokens();
   if (num_tokens > 0) {
-    int parallelism = m->hidden_size * num_tokens;
+    int parallelism = m->local_hidden_size * num_tokens;
     hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache<DT>),
                        GET_BLOCKS(parallelism),
                        min(CUDA_NUM_THREADS, parallelism),
@@ -371,7 +371,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m,
                        m->token_infos,
                        num_tokens,
                        BatchConfig::max_sequence_length(),
-                       m->hidden_size);
+                       m->local_hidden_size);
   }
 }
 
@@ -907,7 +907,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   global_num_kv_heads = _global_num_kv_heads;
   num_q_heads = _num_q_heads;
   num_kv_heads = _num_kv_heads;
-  hidden_size = num_q_heads * qProjSize;
+  local_hidden_size = num_q_heads * qProjSize;
 
   weightSize =
       ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) *
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 86030a31f..621c9f741 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -281,7 +281,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
   // std::cout << "Attn time: " << elapsed << " ms\n";
 
   // Debug output:
-  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   int size = m->local_hidden_size * BatchConfig::max_tokens_per_batch();
   //   float *temp_output = new float[size];
   //   cudaDeviceSynchronize();
   //   cudaMemcpy(
@@ -290,8 +290,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
   //   printf("Output: ");
   //   float temp = 0;
   //   for (int i = 0; i < 1; ++i) {
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
+  //     for (int j = 0; j < m->local_hidden_size; ++j) {
+  //       temp += temp_output[i * m->local_hidden_size + j];
   //     }
   //     printf("%.6f ", temp);
   //   }
@@ -464,7 +464,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   global_num_kv_heads = _global_num_kv_heads;
   num_q_heads = _num_q_heads;
   num_kv_heads = _num_kv_heads;
-  hidden_size = num_q_heads * qProjSize;
+  local_hidden_size = num_q_heads * qProjSize;
 
   weightSize =
       ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) *
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 1be0359d3..951e40255 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -335,7 +335,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                     m->num_q_heads,
                                     *m->scaling_query,
                                     m->scaling_factor,
-                                    m->hidden_size);
+                                    m->local_hidden_size);
   } else if (m->scaling_query) {
     scaling_query_kernel<<<GET_BLOCKS(parallelism),
                            min(CUDA_NUM_THREADS, parallelism),
@@ -345,7 +345,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                      m->num_q_heads,
                                      m->qProjSize,
                                      m->scaling_factor,
-                                     m->hidden_size);
+                                     m->local_hidden_size);
   }
 
   //   checkCUDA(cudaEventCreate(&t_start));
@@ -355,7 +355,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   // Step 3: apply rotary embedding if needed
   if (*m->apply_rotary_embedding) {
     /*q&k*/
-    parallelism = num_tokens * m->hidden_size;
+    parallelism = num_tokens * m->local_hidden_size;
     apply_rotary_embedding_hf<<<GET_BLOCKS(parallelism),
                                 min(CUDA_NUM_THREADS, parallelism),
                                 0,
@@ -366,7 +366,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                           m->kProjSize,
                                           num_tokens,
                                           q_array_size,
-                                          m->hidden_size);
+                                          m->local_hidden_size);
   }
   //   checkCUDA(cudaEventRecord(t_end, stream));
   //   checkCUDA(cudaEventSynchronize(t_end));
@@ -438,7 +438,7 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                       cudaStream_t stream) {
   // update the kv cache, compact the q array
   int num_new_tokens = bc->num_active_tokens();
-  int parallelism = m->hidden_size * num_new_tokens;
+  int parallelism = m->local_hidden_size * num_new_tokens;
   int const max_num_pages =
       (BatchConfig::max_sequence_length() +
        BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
@@ -452,7 +452,7 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                                       m->token_infos,
                                       m->request_infos,
                                       max_num_pages,
-                                      m->hidden_size,
+                                      m->local_hidden_size,
                                       num_new_tokens);
 }
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 7cb4837e2..02d917165 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -147,7 +147,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
   // printf("curr depth: %d\n", curr_depth);
   // assert(curr_depth < 3);
   if (num_tokens > 0) {
-    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens;
+    int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_tokens;
     hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache<DT>),
                        GET_BLOCKS(parallelism),
                        min(CUDA_NUM_THREADS, parallelism),
@@ -167,7 +167,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                        BatchConfig::max_sequence_length(),
                        TreeSearchBatchConfig::MAX_BEAM_WIDTH,
                        /*root*/ curr_depth == 0,
-                       m->hidden_size);
+                       m->local_hidden_size);
   }
 }
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index e9df735da..caa0c6711 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -256,7 +256,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
   tree_search_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
-  //   int size = m->hidden_size * BatchConfig::max_tokens_per_batch();
+  //   int size = m->local_hidden_size * BatchConfig::max_tokens_per_batch();
   //   float *temp_output = new float[size];
   //   cudaDeviceSynchronize();
   //   cudaMemcpy(
@@ -266,8 +266,8 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
   //   printf("Output: ");
   //   for (int i = 0; i < bc->num_tokens; ++i) {
   //     float temp = 0;
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
+  //     for (int j = 0; j < m->local_hidden_size; ++j) {
+  //       temp += temp_output[i * m->local_hidden_size + j];
   //     }
   //     printf("%.6f ", temp);
   //   }
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index 3fb731346..c148eaf33 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -75,7 +75,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    hipStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   if (num_tokens_to_commit > 0) {
-    int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
+    int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
     hipLaunchKernelGGL(
         HIP_KERNEL_NAME(commit_tokens_kernel<DT>),
         GET_BLOCKS(parallelism),
@@ -92,7 +92,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
         num_tokens_to_commit,
         m->num_active_tokens, // number of active tokens in previous batch
         BatchConfig::max_sequence_length(),
-        m->hidden_size);
+        m->local_hidden_size);
   }
 }
 
@@ -198,7 +198,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
       assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens);
       {
         // update K-V cache
-        int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens;
+        int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_new_tokens;
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(update_tree_branch_kv_cache<DT>),
             GET_BLOCKS(parallelism),
@@ -216,7 +216,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m,
             processed_tokens_in_batch, // num_processed_tokens_in_batch
             m->num_active_tokens,      // total_tokens_in_batch
             BatchConfig::max_sequence_length(),
-            m->hidden_size);
+            m->local_hidden_size);
       }
 
       // bc->token_last_available_idx[i] + 1;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index b152934ce..7417518b1 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -125,7 +125,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
        BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
       kPagesize;
   int const num_requests = bc->num_active_requests();
-  int parallelism = m->hidden_size * num_requests;
+  int parallelism = m->local_hidden_size * num_requests;
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
@@ -133,7 +133,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                                    m->committed_token_infos,
                                    m->request_available,
                                    num_requests,
-                                   m->hidden_size,
+                                   m->local_hidden_size,
                                    m->num_tokens_to_commit,
                                    max_num_pages);
   //   cudaEventRecord(t_end, stream);
@@ -454,7 +454,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   // Debug output:
   // {
-  //   int size = m->hidden_size * bc->num_active_tokens();
+  //   int size = m->local_hidden_size * bc->num_active_tokens();
   //   float *temp_output = new float[size];
   //   cudaDeviceSynchronize();
   //   cudaMemcpy(
@@ -463,8 +463,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   printf("Output (flashinfer attention) :");
   //   for (int i = 0; i < 1; ++i) {
   //     float temp = 0;
-  //     for (int j = 0; j < m->hidden_size; ++j) {
-  //       temp += temp_output[i * m->hidden_size + j];
+  //     for (int j = 0; j < m->local_hidden_size; ++j) {
+  //       temp += temp_output[i * m->local_hidden_size + j];
   //     }
   //     printf("%.6f ", temp);
   //   }

From f03479ae211d622ad4d111e303da354a92ef2bb8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 22:22:15 -0700
Subject: [PATCH 395/667] chore: q/k/vSize reduce to hidden_size

---
 .../ops/inc_multihead_self_attention.h        |  8 ++---
 .../ops/spec_inc_multihead_self_attention.h   |  2 +-
 .../ops/tree_inc_multihead_self_attention.h   |  2 +-
 src/ops/inc_multihead_self_attention.cc       | 32 +++++++++----------
 src/ops/inc_multihead_self_attention.cu       | 19 +++--------
 .../inc_multihead_self_attention_kernels.cu   | 17 +++++-----
 src/ops/spec_inc_multihead_self_attention.cc  | 32 +++++++++----------
 src/ops/spec_inc_multihead_self_attention.cu  |  4 +--
 src/ops/tree_inc_multihead_self_attention.cc  | 32 +++++++++----------
 src/ops/tree_inc_multihead_self_attention.cu  |  4 +--
 10 files changed, 65 insertions(+), 87 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index fa1912fee..5b335fb21 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -128,7 +128,7 @@ class IncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
   bool offload;
@@ -146,9 +146,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   IncMultiHeadSelfAttentionMeta(FFHandler handler,
                                 InferenceMode infer_mode,
                                 Op const *attn,
-                                int _qSize,
-                                int _kSize,
-                                int _vSize,
+                                int _hidden_size,
                                 int _qProjSize,
                                 int _kProjSize,
                                 int _vProjSize,
@@ -175,7 +173,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   Realm::RegionInstance reserveInst;
   size_t weights_params, weightSize, biasSize, reserveSpaceSize,
       quantized_weightSize;
-  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
   int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
       local_hidden_size;
   bool *has_load_weights;
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index 0e97239eb..bba27f40f 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -122,7 +122,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
   int qoSeqLength, kvSeqLength;
 };
 
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 4cfec8b7a..f7100e09f 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -126,7 +126,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
   bool offload;
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 425d9618f..9f8a6f0c0 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -149,11 +149,11 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
   // Compute weight size
   int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
       oProjSize = embed_dim;
-  int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0];
-  int qParas = qProjSize * qSize;
-  int kParas = kProjSize * kSize;
-  int vParas = vProjSize * vSize;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);
+  int hidden_size = input->dims[0];
+  int qParas = qProjSize * hidden_size;
+  int kParas = kProjSize * hidden_size;
+  int vParas = vProjSize * hidden_size;
+  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
 
   // allocate num_q_heads for key, value for replication
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
@@ -315,8 +315,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -340,11 +339,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -430,8 +429,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -453,11 +451,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 621c9f741..ed3bc8ca4 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -389,9 +389,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     : IncMultiHeadSelfAttentionMeta(handler,
                                     INC_DECODING_MODE,
                                     attn,
-                                    attn->qSize,
-                                    attn->kSize,
-                                    attn->vSize,
+                                    attn->hidden_size,
                                     attn->qProjSize,
                                     attn->kProjSize,
                                     attn->vProjSize,
@@ -417,9 +415,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     FFHandler handler,
     InferenceMode infer_mode,
     Op const *attn,
-    int _qSize,
-    int _kSize,
-    int _vSize,
+    int _hidden_size,
     int _qProjSize,
     int _kProjSize,
     int _vProjSize,
@@ -445,12 +441,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   checkCUDA(get_legion_stream(&stream));
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
   checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor));
-  qSize = _qSize;
-  kSize = _kSize;
-  vSize = _vSize;
-  // assume dimensions match for now
-  assert(qSize == kSize);
-  assert(kSize == vSize);
+  hidden_size = _hidden_size;
   qProjSize = _qProjSize;
   kProjSize = _kProjSize;
   assert(qProjSize == kProjSize); // required for attention QK.T matmul
@@ -467,9 +458,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   local_hidden_size = num_q_heads * qProjSize;
 
   weightSize =
-      ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : vSize)) *
+      ((hidden_size * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : hidden_size)) *
            num_q_heads +
-       (kSize * kProjSize + vSize * vProjSize) * num_q_heads) *
+       (hidden_size * kProjSize + hidden_size * vProjSize) * num_q_heads) *
       size_of_dt;
   if (quantization_type != DT_NONE) {
     quantized_weightSize = get_quantization_to_byte_size(
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 951e40255..e69f727dd 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -244,7 +244,6 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 
   checkCUDA(cublasSetStream(m->handle.blas, stream));
   checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-  assert(m->qSize == m->vSize && m->qSize == m->kSize);
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
@@ -274,14 +273,14 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
     int m_v = m->vProjSize * m->num_q_heads;
     assert(m_q == m_k && m_k == m_v); // keep things simple for now
     int n = bc->num_active_tokens();
-    int k = m->qSize;
+    int k = m->hidden_size;
     int m_ = m_q * QKV_WEIGHT_NUM;
     // before transpositions
     int lda = k, ldb = k, ldc = m_;
     // matrix A: QKV weights
-    // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3]
+    // matrix A's layout: [hidden_size (hidden_dim), qProjSize, num_heads, 3]
     // matrix B: input
-    // matrix B's layout: [qSize (hidden_dim), num_new_tokens]
+    // matrix B's layout: [hidden_size (hidden_dim), num_new_tokens]
     // matrix C: devQKVProjArray
     // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens]
     checkCUDA(cublasGemmEx(m->handle.blas,
@@ -506,7 +505,7 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
     int lda = k, ldb = k, ldc = m_;
     // matrix A: output projection weight
     // matrix A's layout: [vProjSize * num_heads, oProjSize]
-    DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads +
+    DT const *A = weight_ptr + m->hidden_size * (m->qProjSize * m->num_q_heads +
                                            m->kProjSize * m->num_q_heads +
                                            m->vProjSize * m->num_q_heads);
     // matrix B: attn heads
@@ -568,7 +567,7 @@ void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                     stream);
 
     if (m->quantization_type == DT_INT4) {
-      int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2;
+      int parallelism = m->qProjSize * m->hidden_size * m->num_q_heads / 2;
       decompress_int4_attention_weights<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
@@ -576,11 +575,11 @@ void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
           m->quantized_weight_ptr,
           static_cast<DT *>(m->weight_ptr),
           m->qProjSize,
-          m->qSize,
+          m->hidden_size,
           m->num_q_heads);
     } else {
       assert(m->quantization_type == DT_INT8);
-      int parallelism = m->qProjSize * m->qSize * m->num_q_heads;
+      int parallelism = m->qProjSize * m->hidden_size * m->num_q_heads;
       decompress_int8_attention_weights<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
@@ -588,7 +587,7 @@ void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
           m->quantized_weight_ptr,
           static_cast<DT *>(m->weight_ptr),
           m->qProjSize,
-          m->qSize,
+          m->hidden_size,
           m->num_q_heads);
     }
   } else {
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 5c3d097de..18fca3c8a 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -147,11 +147,11 @@ Tensor
   // Compute weight size
   int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
       oProjSize = embed_dim;
-  int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0];
-  int qParas = qProjSize * qSize;
-  int kParas = kProjSize * kSize;
-  int vParas = vProjSize * vSize;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);
+  int hidden_size = input->dims[0];
+  int qParas = qProjSize * hidden_size;
+  int kParas = kProjSize * hidden_size;
+  int vParas = vProjSize * hidden_size;
+  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
   {
@@ -287,8 +287,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -309,11 +308,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -390,8 +389,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -411,11 +409,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index caa0c6711..e0a831aba 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -367,9 +367,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
     : IncMultiHeadSelfAttentionMeta(handler,
                                     TREE_SEARCH_MODE,
                                     attn,
-                                    attn->qSize,
-                                    attn->kSize,
-                                    attn->vSize,
+                                    attn->hidden_size,
                                     attn->qProjSize,
                                     attn->kProjSize,
                                     attn->vProjSize,
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index efe4657a2..6280c02eb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -151,11 +151,11 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   // Compute weight size
   int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
       oProjSize = embed_dim;
-  int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0];
-  int qParas = qProjSize * qSize;
-  int kParas = kProjSize * kSize;
-  int vParas = vProjSize * vSize;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize);
+  int hidden_size = input->dims[0];
+  int qParas = qProjSize * hidden_size;
+  int kParas = kProjSize * hidden_size;
+  int vParas = vProjSize * hidden_size;
+  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
   int one_head_size = qParas + kParas + vParas + oParas;
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
@@ -312,8 +312,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -336,11 +335,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -428,8 +427,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      qSize(_input->dims[0].size), kSize(_input->dims[0].size),
-      vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
+      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
       vProjSize(_vdim), oProjSize(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
@@ -451,11 +449,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->qSize;
-    int kParas = this->kProjSize * this->kSize;
-    int vParas = this->vProjSize * this->vSize;
+    int qParas = this->qProjSize * this->hidden_size;
+    int kParas = this->kProjSize * this->hidden_size;
+    int vParas = this->vProjSize * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize);
+        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 7417518b1..4fc127ac3 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -601,9 +601,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
     : IncMultiHeadSelfAttentionMeta(handler,
                                     TREE_VERIFY_MODE,
                                     attn,
-                                    attn->qSize,
-                                    attn->kSize,
-                                    attn->vSize,
+                                    attn->hidden_size,
                                     attn->qProjSize,
                                     attn->kProjSize,
                                     attn->vProjSize,

From a363b79283d11a2d8c200ff2976ba7ff5c1c4cae Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 20 Aug 2024 23:03:41 -0700
Subject: [PATCH 396/667] chore: reduce projSize into head_dim

---
 .../ops/inc_multihead_self_attention.h        | 11 +--
 .../ops/spec_inc_multihead_self_attention.h   |  2 +-
 .../ops/tree_inc_multihead_self_attention.h   |  2 +-
 src/ops/inc_multihead_self_attention.cc       | 62 ++++++------
 src/ops/inc_multihead_self_attention.cu       | 60 ++++++------
 .../inc_multihead_self_attention_kernels.cu   | 95 +++++++++----------
 src/ops/spec_inc_multihead_self_attention.cc  | 62 ++++++------
 src/ops/spec_inc_multihead_self_attention.cu  | 13 ++-
 src/ops/tree_inc_multihead_self_attention.cc  | 62 ++++++------
 src/ops/tree_inc_multihead_self_attention.cu  | 17 ++--
 src/runtime/graph.cc                          | 18 ++--
 11 files changed, 196 insertions(+), 208 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 5b335fb21..2db071a56 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -128,7 +128,7 @@ class IncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
   bool offload;
@@ -147,10 +147,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
                                 InferenceMode infer_mode,
                                 Op const *attn,
                                 int _hidden_size,
-                                int _qProjSize,
-                                int _kProjSize,
-                                int _vProjSize,
-                                int _oProjSize,
+                                int _qk_dim,
+                                int _v_dim,
+                                int _o_dim,
                                 bool _apply_rotary_embedding,
                                 bool _qkv_bias,
                                 bool _scaling_query,
@@ -173,7 +172,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   Realm::RegionInstance reserveInst;
   size_t weights_params, weightSize, biasSize, reserveSpaceSize,
       quantized_weightSize;
-  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qk_dim, v_dim, o_dim;
   int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
       local_hidden_size;
   bool *has_load_weights;
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index bba27f40f..617263a05 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -122,7 +122,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
 };
 
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index f7100e09f..6126183d1 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -126,7 +126,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
   bool qkv_bias;
   bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
       qk_prod_scaling, position_bias;
-  int hidden_size, qProjSize, kProjSize, vProjSize, oProjSize;
+  int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
   bool offload;
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 9f8a6f0c0..bff398104 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -147,13 +147,13 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
-      oProjSize = embed_dim;
+  int qk_dim = kdim, v_dim = kdim,
+      o_dim = embed_dim;
   int hidden_size = input->dims[0];
-  int qParas = qProjSize * hidden_size;
-  int kParas = kProjSize * hidden_size;
-  int vParas = vProjSize * hidden_size;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
+  int qParas = qk_dim * hidden_size;
+  int kParas = qk_dim * hidden_size;
+  int vParas = v_dim * hidden_size;
+  int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size);
 
   // allocate num_q_heads for key, value for replication
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
@@ -179,9 +179,9 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
   if (qkv_bias || final_bias) {
     // q, k, v, o
     int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
     int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
+                   (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -315,8 +315,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
@@ -339,11 +339,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -368,9 +368,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -429,8 +429,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
@@ -451,11 +451,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -481,9 +481,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -516,11 +516,11 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     : IncMultiHeadSelfAttention(model,
                                 other.layer_guid,
                                 input,
-                                other.oProjSize,
+                                other.o_dim,
                                 other.num_q_heads,
                                 other.num_kv_heads,
-                                other.qProjSize,
-                                other.vProjSize,
+                                other.qk_dim,
+                                other.v_dim,
                                 other.dropout,
                                 other.qkv_bias,
                                 other.final_bias,
@@ -694,7 +694,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task(
       attn->num_kv_heads / attn->tensor_parallelism_degree +
       (attn->num_kv_heads % attn->tensor_parallelism_degree != 0);
 
-  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+  assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                        .only_kind(Memory::GPU_FB_MEM)
@@ -910,10 +910,10 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs,
 IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const {
   IncMultiHeadSelfAttentionParams params;
   params.layer_guid = this->layer_guid;
-  params.embed_dim = this->oProjSize;
+  params.embed_dim = this->o_dim;
   params.num_q_heads = this->num_q_heads;
-  params.kdim = this->kProjSize;
-  params.vdim = this->vProjSize;
+  params.kdim = this->qk_dim;
+  params.vdim = this->v_dim;
   params.dropout = this->dropout;
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index ed3bc8ca4..7e44cf628 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -61,10 +61,10 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
   // global constant parameters
   uint32_t const num_q_heads = m->num_q_heads;
   uint32_t const num_kv_heads = m->num_kv_heads;
-  uint32_t const head_dim = m->qProjSize;
+  uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -390,10 +390,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                     INC_DECODING_MODE,
                                     attn,
                                     attn->hidden_size,
-                                    attn->qProjSize,
-                                    attn->kProjSize,
-                                    attn->vProjSize,
-                                    attn->oProjSize,
+                                    attn->qk_dim,
+                                    attn->v_dim,
+                                    attn->o_dim,
                                     attn->apply_rotary_embedding,
                                     attn->qkv_bias,
                                     attn->scaling_query,
@@ -416,10 +415,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     InferenceMode infer_mode,
     Op const *attn,
     int _hidden_size,
-    int _qProjSize,
-    int _kProjSize,
-    int _vProjSize,
-    int _oProjSize,
+    int _qk_dim,
+    int _v_dim,
+    int _o_dim,
     bool _apply_rotary_embedding,
     bool _qkv_bias,
     bool _scaling_query,
@@ -442,11 +440,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
   checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor));
   hidden_size = _hidden_size;
-  qProjSize = _qProjSize;
-  kProjSize = _kProjSize;
-  assert(qProjSize == kProjSize); // required for attention QK.T matmul
-  vProjSize = _vProjSize;
-  oProjSize = _oProjSize;
+  qk_dim = _qk_dim;
+  v_dim = _v_dim;
+  o_dim = _o_dim;
   size_t size_of_dt = data_type_size(attn->data_type);
   quantization_type = _quantization_type;
   offload = _offload;
@@ -455,22 +451,22 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   global_num_kv_heads = _global_num_kv_heads;
   num_q_heads = _num_q_heads;
   num_kv_heads = _num_kv_heads;
-  local_hidden_size = num_q_heads * qProjSize;
+  local_hidden_size = num_q_heads * qk_dim;
 
   weightSize =
-      ((hidden_size * qProjSize + oProjSize * (vProjSize > 0 ? vProjSize : hidden_size)) *
+      ((hidden_size * qk_dim + o_dim * (v_dim > 0 ? v_dim : hidden_size)) *
            num_q_heads +
-       (hidden_size * kProjSize + hidden_size * vProjSize) * num_q_heads) *
+       (hidden_size * qk_dim + hidden_size * v_dim) * num_q_heads) *
       size_of_dt;
   if (quantization_type != DT_NONE) {
     quantized_weightSize = get_quantization_to_byte_size(
         attn->data_type, quantization_type, weightSize);
   }
-  // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0;
+  // biasSize = _bias ? o_dim * size_of_dt * 4 : 0;
 
   int qkv_bias_size =
-      qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
-  int final_bias_size = oProjSize;
+      qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+  int final_bias_size = o_dim;
   biasSize =
       (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0);
 
@@ -499,9 +495,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   // allocate memory for the seqArray and reserve space
   {
     int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
-    size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
-                                                       kProjSize * num_q_heads +
-                                                       vProjSize * num_q_heads);
+    size_t qkv_max_proj_size = max_tokens_per_batch * (qk_dim * num_q_heads +
+                                                       qk_dim * num_q_heads +
+                                                       v_dim * num_q_heads);
     size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0,
            qk_prod_size = 0;
     // assert((BatchConfig::max_sequence_length() +
@@ -517,12 +513,12 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
         query_tmp_size =
-            num_q_heads * qProjSize * BatchConfig::max_tokens_per_batch();
+            num_q_heads * qk_dim * BatchConfig::max_tokens_per_batch();
         // a K-ary tree max node is (k^n - 1) / 2
-        key_cache_size = num_q_heads * kProjSize *
+        key_cache_size = num_q_heads * qk_dim *
                          BatchConfig::max_requests_per_batch() * max_num_pages *
                          kPagesize;
-        value_cache_size = num_q_heads * vProjSize *
+        value_cache_size = num_q_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
         qk_prod_size = BatchConfig::max_sequence_length() * max_num_pages *
@@ -532,10 +528,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       default:
         assert(false && "Unkown inference mode");
     }
-    size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize;
-    size_t output_tmp_size = max_tokens_per_batch * num_q_heads * vProjSize;
-    size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads +
-                                                   kProjSize * num_q_heads)) /
+    size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
+    size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim;
+    size_t complex_size = (max_tokens_per_batch * (qk_dim * num_q_heads +
+                                                   qk_dim * num_q_heads)) /
                           2;
     size_t totalSize =
         (qkv_max_proj_size + query_tmp_size + key_cache_size +
@@ -655,7 +651,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   handler.incr_attention_metadata->set_enabled(true);
   handler.incr_attention_metadata->set_num_q_heads(num_q_heads);
   handler.incr_attention_metadata->set_num_kv_heads(num_kv_heads);
-  handler.incr_attention_metadata->set_head_dim(qProjSize);
+  handler.incr_attention_metadata->set_head_dim(qk_dim);
 
   cudaStreamSynchronize(stream);
 }
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index e69f727dd..d535116fb 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -63,9 +63,9 @@ __global__ void apply_proj_bias_w(DT *input_ptr,
                                   DT const *bias_ptr,
                                   int num_tokens,
                                   int qkv_weight_size,
-                                  int oProjSize) {
-  CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) {
-    int bias_idx = qkv_weight_size + i % oProjSize;
+                                  int o_dim) {
+  CUDA_KERNEL_LOOP(i, num_tokens * o_dim) {
+    int bias_idx = qkv_weight_size + i % o_dim;
     input_ptr[i] += bias_ptr[bias_idx];
   }
 }
@@ -75,9 +75,8 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr,
                                     DT const *bias_ptr,
                                     int shard_id,
                                     int num_tokens,
-                                    int qProjSize,
-                                    int kProjSize,
-                                    int vProjSize,
+                                    int qk_dim,
+                                    int v_dim,
                                     int global_num_q_heads,
                                     int num_q_heads,
                                     bool scaling_query,
@@ -86,14 +85,14 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr,
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) {
     // for simplicity, assume q, k, v is in same shape
     // 0->q, 1->k, 2->v
-    // int qkv_index = i / (num_tokens * qProjSize) % 3;
+    // int qkv_index = i / (num_tokens * qk_dim) % 3;
 
     int token_idx = i / (hidden_size * QKV_WEIGHT_NUM);
     size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM;
 
     int qkv_index = in_token_idx / hidden_size;
 
-    int proj_size = qkv_index == 0 ? qProjSize : kProjSize;
+    int proj_size = qkv_index == 0 ? qk_dim : qk_dim;
 
     int head_idx =
         (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size;
@@ -102,8 +101,8 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr,
     size_t pre_length =
         qkv_index == 0
             ? 0
-            : (qkv_index == 1 ? qProjSize * global_num_q_heads
-                              : qProjSize * global_num_q_heads * KV_WEIGHT_NUM);
+            : (qkv_index == 1 ? qk_dim * global_num_q_heads
+                              : qk_dim * global_num_q_heads * KV_WEIGHT_NUM);
 
     size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size;
 
@@ -117,7 +116,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr,
 
 template <typename DT>
 __global__ void scaling_query_kernel(DT *input_ptr,
-                                     int qProjSize,
+                                     int qk_dim,
                                      int num_tokens,
                                      int num_q_heads,
                                      float scaling_factor,
@@ -134,8 +133,7 @@ __global__ void
     apply_rotary_embedding_native(DT *input_ptr,
                                   cuFloatComplex *complex_input,
                                   BatchConfig::PerTokenInfo const *tokenInfos,
-                                  int qProjSize,
-                                  int kProjSize,
+                                  int qk_dim,
                                   int num_q_heads,
                                   int num_tokens,
                                   int num_kv_heads,
@@ -144,10 +142,10 @@ __global__ void
                                   int q_array_size) {
   CUDA_KERNEL_LOOP(
       i,
-      num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
+      num_tokens * (qk_dim * num_q_heads + qk_dim * num_kv_heads) / 2) {
     // create complex number
     bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qProjSize : kProjSize;
+    int proj_size = q_tensor ? qk_dim : qk_dim;
     int real_i = q_tensor ? i : i - q_array_size / 2;
 
     int head_idx = real_i / (num_tokens * proj_size / 2);
@@ -183,15 +181,14 @@ __global__ void
     apply_rotary_embedding_hf(DT *input_ptr,
                               cuFloatComplex *complex_input,
                               BatchConfig::PerTokenInfo const *tokenInfos,
-                              int qProjSize,
-                              int kProjSize,
+                              int qk_dim,
                               int num_tokens,
                               size_t q_array_size,
                               int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
     // create complex number
     bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qProjSize : kProjSize;
+    int proj_size = q_tensor ? qk_dim : qk_dim;
     int real_i = q_tensor ? i : i - q_array_size / 2;
 
     int token_idx = real_i / (hidden_size / 2);
@@ -208,7 +205,7 @@ __global__ void
     cuFloatComplex cii = {input_ptr[real_part_index],
                           input_ptr[complex_part_index]};
 
-    // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64
+    // get the freq_cis: shape 1 * (qk_dim/2) = 1 * 64
     // apply a Cartesian coordinate transformation
     // multiple with input & /copy back to q/k
 
@@ -268,9 +265,9 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   {
     DT alpha = 1.0f, beta = 0.0f;
     // after transpositions
-    int m_q = m->qProjSize * m->num_q_heads;
-    int m_k = m->kProjSize * m->num_q_heads;
-    int m_v = m->vProjSize * m->num_q_heads;
+    int m_q = m->qk_dim * m->num_q_heads;
+    int m_k = m->qk_dim * m->num_q_heads;
+    int m_v = m->v_dim * m->num_q_heads;
     assert(m_q == m_k && m_k == m_v); // keep things simple for now
     int n = bc->num_active_tokens();
     int k = m->hidden_size;
@@ -278,11 +275,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
     // before transpositions
     int lda = k, ldb = k, ldc = m_;
     // matrix A: QKV weights
-    // matrix A's layout: [hidden_size (hidden_dim), qProjSize, num_heads, 3]
+    // matrix A's layout: [hidden_size (hidden_dim), qk_dim, num_heads, 3]
     // matrix B: input
     // matrix B's layout: [hidden_size (hidden_dim), num_new_tokens]
     // matrix C: devQKVProjArray
-    // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens]
+    // matrix B's layout: [qk_dim, num_heads, 3, num_new_tokens]
     checkCUDA(cublasGemmEx(m->handle.blas,
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
@@ -315,8 +312,8 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   //   }
 
   int num_tokens = bc->num_active_tokens();
-  int parallelism = m->kProjSize * num_tokens * m->num_q_heads;
-  size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads;
+  int parallelism = m->qk_dim * num_tokens * m->num_q_heads;
+  size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
 
   // Step 2: apply bias for QKV, or scale the query
   if (*m->qkv_bias) {
@@ -327,9 +324,8 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                     bias_ptr,
                                     shard_id,
                                     num_tokens,
-                                    m->qProjSize,
-                                    m->kProjSize,
-                                    m->vProjSize,
+                                    m->qk_dim,
+                                    m->v_dim,
                                     m->global_num_q_heads,
                                     m->num_q_heads,
                                     *m->scaling_query,
@@ -342,7 +338,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                            stream>>>(output_ptr,
                                      num_tokens,
                                      m->num_q_heads,
-                                     m->qProjSize,
+                                     m->qk_dim,
                                      m->scaling_factor,
                                      m->local_hidden_size);
   }
@@ -361,8 +357,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                 stream>>>(output_ptr,
                                           m->complex_input,
                                           m->token_infos,
-                                          m->qProjSize,
-                                          m->kProjSize,
+                                          m->qk_dim,
                                           num_tokens,
                                           q_array_size,
                                           m->local_hidden_size);
@@ -469,7 +464,7 @@ void produce_output(IncMultiHeadSelfAttentionMeta const *m,
                     BatchConfig const *bc,
                     DT *output_ptr,
                     cudaStream_t stream) {
-  int parallelism = m->vProjSize * m->num_q_heads * bc->num_active_tokens();
+  int parallelism = m->v_dim * m->num_q_heads * bc->num_active_tokens();
   produce_output_kernel<<<GET_BLOCKS(parallelism),
                           min(CUDA_NUM_THREADS, parallelism),
                           0,
@@ -498,21 +493,21 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   {
     DT alpha = 1.0f, beta = 0.0f;
     // after transpositions
-    int m_ = m->oProjSize;
-    int k = m->vProjSize * m->num_q_heads;
+    int m_ = m->o_dim;
+    int k = m->v_dim * m->num_q_heads;
     int n = num_tokens;
     // before transpositions
     int lda = k, ldb = k, ldc = m_;
     // matrix A: output projection weight
-    // matrix A's layout: [vProjSize * num_heads, oProjSize]
-    DT const *A = weight_ptr + m->hidden_size * (m->qProjSize * m->num_q_heads +
-                                           m->kProjSize * m->num_q_heads +
-                                           m->vProjSize * m->num_q_heads);
+    // matrix A's layout: [v_dim * num_heads, o_dim]
+    DT const *A = weight_ptr + m->hidden_size * (m->qk_dim * m->num_q_heads +
+                                           m->qk_dim * m->num_q_heads +
+                                           m->v_dim * m->num_q_heads);
     // matrix B: attn heads
-    // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
+    // matrix B's layout: [v_dim * num_heads, num_new_tokens]
     DT const *B = static_cast<DT *>(m->attn_heads);
     // matrix B: output
-    // matrix B's layout: [oProjSize, num_new_tokens]
+    // matrix B's layout: [o_dim, num_new_tokens]
     DT *C = static_cast<DT *>(output_ptr);
 
     checkCUDA(cublasGemmEx(m->handle.blas,
@@ -537,15 +532,15 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   }
   // Add final output bias
   if (*m->final_bias && shard_id == 0) {
-    int parallelism = m->oProjSize * num_tokens;
-    int qkv_weight_size = m->qProjSize * m->global_num_q_heads +
-                          m->kProjSize * m->global_num_q_heads +
-                          m->vProjSize * m->global_num_q_heads;
+    int parallelism = m->o_dim * num_tokens;
+    int qkv_weight_size = m->qk_dim * m->global_num_q_heads +
+                          m->qk_dim * m->global_num_q_heads +
+                          m->v_dim * m->global_num_q_heads;
     apply_proj_bias_w<<<GET_BLOCKS(parallelism),
                         min(CUDA_NUM_THREADS, parallelism),
                         0,
                         stream>>>(
-        output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize);
+        output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->o_dim);
   }
 }
 
@@ -567,26 +562,26 @@ void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                     stream);
 
     if (m->quantization_type == DT_INT4) {
-      int parallelism = m->qProjSize * m->hidden_size * m->num_q_heads / 2;
+      int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads / 2;
       decompress_int4_attention_weights<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
                                           stream>>>(
           m->quantized_weight_ptr,
           static_cast<DT *>(m->weight_ptr),
-          m->qProjSize,
+          m->qk_dim,
           m->hidden_size,
           m->num_q_heads);
     } else {
       assert(m->quantization_type == DT_INT8);
-      int parallelism = m->qProjSize * m->hidden_size * m->num_q_heads;
+      int parallelism = m->qk_dim * m->hidden_size * m->num_q_heads;
       decompress_int8_attention_weights<<<GET_BLOCKS(parallelism),
                                           min(CUDA_NUM_THREADS, parallelism),
                                           0,
                                           stream>>>(
           m->quantized_weight_ptr,
           static_cast<DT *>(m->weight_ptr),
-          m->qProjSize,
+          m->qk_dim,
           m->hidden_size,
           m->num_q_heads);
     }
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 18fca3c8a..9e6da5238 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -145,13 +145,13 @@ Tensor
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
-      oProjSize = embed_dim;
+  int qk_dim = kdim, v_dim = kdim,
+      o_dim = embed_dim;
   int hidden_size = input->dims[0];
-  int qParas = qProjSize * hidden_size;
-  int kParas = kProjSize * hidden_size;
-  int vParas = vProjSize * hidden_size;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
+  int qParas = qk_dim * hidden_size;
+  int kParas = qk_dim * hidden_size;
+  int vParas = v_dim * hidden_size;
+  int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size);
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
   {
@@ -167,9 +167,9 @@ Tensor
   if (qkv_bias || final_bias) {
     // q, k, v, o
     int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
     int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
+                   (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -287,8 +287,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) {
@@ -308,11 +308,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -331,9 +331,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -389,8 +389,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias)
@@ -409,11 +409,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -433,9 +433,9 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -468,11 +468,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     : SpecIncMultiHeadSelfAttention(model,
                                     other.layer_guid,
                                     input,
-                                    other.oProjSize,
+                                    other.o_dim,
                                     other.num_q_heads,
                                     other.num_kv_heads,
-                                    other.qProjSize,
-                                    other.vProjSize,
+                                    other.qk_dim,
+                                    other.v_dim,
                                     other.dropout,
                                     other.qkv_bias,
                                     other.final_bias,
@@ -636,7 +636,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1);
   int num_q_heads = attn->num_q_heads;
   int num_kv_heads = attn->num_kv_heads;
-  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+  assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                        .only_kind(Memory::GPU_FB_MEM)
@@ -837,11 +837,11 @@ SpecIncMultiHeadSelfAttentionParams
     SpecIncMultiHeadSelfAttention::get_params() const {
   SpecIncMultiHeadSelfAttentionParams params;
   params.layer_guid = this->layer_guid;
-  params.embed_dim = this->oProjSize;
+  params.embed_dim = this->o_dim;
   params.num_q_heads = this->num_q_heads;
   params.num_kv_heads = this->num_kv_heads;
-  params.kdim = this->kProjSize;
-  params.vdim = this->vProjSize;
+  params.kdim = this->qk_dim;
+  params.vdim = this->v_dim;
   params.dropout = this->dropout;
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index e0a831aba..1d645703c 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -59,10 +59,10 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   // global constant parameters
   uint32_t const num_q_heads = m->num_q_heads;
   uint32_t const num_kv_heads = m->num_kv_heads;
-  uint32_t const head_dim = m->qProjSize;
+  uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -368,10 +368,9 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                     TREE_SEARCH_MODE,
                                     attn,
                                     attn->hidden_size,
-                                    attn->qProjSize,
-                                    attn->kProjSize,
-                                    attn->vProjSize,
-                                    attn->oProjSize,
+                                    attn->qk_dim,
+                                    attn->v_dim,
+                                    attn->o_dim,
                                     attn->apply_rotary_embedding,
                                     attn->qkv_bias,
                                     attn->scaling_query,
@@ -396,7 +395,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
   handler.tree_search_attention_metadata->set_enabled(true);
   handler.tree_search_attention_metadata->set_num_q_heads(num_q_heads);
   handler.tree_search_attention_metadata->set_num_kv_heads(num_kv_heads);
-  handler.tree_search_attention_metadata->set_head_dim(qProjSize);
+  handler.tree_search_attention_metadata->set_head_dim(qk_dim);
 
   cudaStreamSynchronize(stream);
 }
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 6280c02eb..647c1a42d 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -149,13 +149,13 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim,
-      oProjSize = embed_dim;
+  int qk_dim = kdim, v_dim = kdim,
+      o_dim = embed_dim;
   int hidden_size = input->dims[0];
-  int qParas = qProjSize * hidden_size;
-  int kParas = kProjSize * hidden_size;
-  int vParas = vProjSize * hidden_size;
-  int oParas = oProjSize * (vProjSize > 0 ? vProjSize : hidden_size);
+  int qParas = qk_dim * hidden_size;
+  int kParas = qk_dim * hidden_size;
+  int vParas = v_dim * hidden_size;
+  int oParas = o_dim * (v_dim > 0 ? v_dim : hidden_size);
   int one_head_size = qParas + kParas + vParas + oParas;
   int weight_size = qParas * num_q_heads + kParas * num_q_heads +
                     vParas * num_q_heads + oParas * num_q_heads;
@@ -179,9 +179,9 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   if (qkv_bias || final_bias) {
     // q, k, v, o
     int qkv_bias_size =
-        qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
     int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? oProjSize : 0)};
+                   (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -312,8 +312,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
@@ -335,11 +335,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -366,9 +366,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -427,8 +427,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim),
-      vProjSize(_vdim), oProjSize(_embed_dim),
+      hidden_size(_input->dims[0].size), qk_dim(_kdim),
+      v_dim(_vdim), o_dim(_embed_dim),
       qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
       scaling_query(_scaling_query), scaling_factor(_scaling_factor),
       qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
@@ -449,11 +449,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     // Create weight tensor
     int num_dims = inputs[0]->num_dims;
     // Compute weight size
-    int qParas = this->qProjSize * this->hidden_size;
-    int kParas = this->kProjSize * this->hidden_size;
-    int vParas = this->vProjSize * this->hidden_size;
+    int qParas = this->qk_dim * this->hidden_size;
+    int kParas = this->qk_dim * this->hidden_size;
+    int vParas = this->v_dim * this->hidden_size;
     int oParas =
-        this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->hidden_size);
+        this->o_dim * (this->v_dim > 0 ? this->v_dim : this->hidden_size);
     ParallelDim dims[2];
     dims[0] = inputs[0]->dims[num_dims - 2];
     dims[0].size = dims[0].degree;
@@ -478,9 +478,9 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
       int qkv_bias_size =
-          qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads;
+          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
-          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0);
+          (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
       weights[1] =
           model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
@@ -513,11 +513,11 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     : TreeIncMultiHeadSelfAttention(model,
                                     other.layer_guid,
                                     input,
-                                    other.oProjSize,
+                                    other.o_dim,
                                     other.num_q_heads,
                                     other.num_kv_heads,
-                                    other.qProjSize,
-                                    other.vProjSize,
+                                    other.qk_dim,
+                                    other.v_dim,
                                     other.dropout,
                                     other.qkv_bias,
                                     other.final_bias,
@@ -693,7 +693,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task(
       attn->num_kv_heads / attn->tensor_parallelism_degree +
       (attn->num_kv_heads % attn->tensor_parallelism_degree != 0);
 
-  assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1);
+  assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
                        .only_kind(Memory::GPU_FB_MEM)
@@ -909,11 +909,11 @@ TreeIncMultiHeadSelfAttentionParams
     TreeIncMultiHeadSelfAttention::get_params() const {
   TreeIncMultiHeadSelfAttentionParams params;
   params.layer_guid = this->layer_guid;
-  params.embed_dim = this->oProjSize;
+  params.embed_dim = this->o_dim;
   params.num_q_heads = this->num_q_heads;
   params.num_kv_heads = this->num_kv_heads;
-  params.kdim = this->kProjSize;
-  params.vdim = this->vProjSize;
+  params.kdim = this->qk_dim;
+  params.vdim = this->v_dim;
   params.dropout = this->dropout;
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 4fc127ac3..9e658fb7a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -160,10 +160,10 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   // global constant parameters
   uint32_t const num_q_heads = m->num_q_heads;
   uint32_t const num_kv_heads = m->num_kv_heads;
-  uint32_t const head_dim = m->qProjSize;
+  uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
   float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f;
+      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -497,11 +497,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //     std::cout << "Compute output proj time: " << elapsed << " ms\n";
   //   }
   // {
-  //   int size = m->oProjSize;
+  //   int size = m->o_dim;
   //   DT *temp_output = new DT[size];
   //   cudaDeviceSynchronize();
   //   cudaMemcpy(
-  //       temp_output, output_ptr + m->oProjSize * (bc->num_active_tokens() -
+  //       temp_output, output_ptr + m->o_dim * (bc->num_active_tokens() -
   //       1), size * sizeof(DT), cudaMemcpyDeviceToHost);
   //   printf("Output :");
   //   for (int i = 0; i < size; ++i) {
@@ -602,10 +602,9 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                                     TREE_VERIFY_MODE,
                                     attn,
                                     attn->hidden_size,
-                                    attn->qProjSize,
-                                    attn->kProjSize,
-                                    attn->vProjSize,
-                                    attn->oProjSize,
+                                    attn->qk_dim,
+                                    attn->v_dim,
+                                    attn->o_dim,
                                     attn->apply_rotary_embedding,
                                     attn->qkv_bias,
                                     attn->scaling_query,
@@ -631,7 +630,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
   handler.tree_verify_attention_metadata->set_enabled(true);
   handler.tree_verify_attention_metadata->set_num_q_heads(num_q_heads);
   handler.tree_verify_attention_metadata->set_num_kv_heads(num_kv_heads);
-  handler.tree_verify_attention_metadata->set_head_dim(qProjSize);
+  handler.tree_verify_attention_metadata->set_head_dim(qk_dim);
 
   // allocate memory for the seqArray and reserve space
   {
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 3543f88d2..8cae8e059 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2327,10 +2327,10 @@ GraphOptimalViewSerialized
         sez.serialize(attn->layer_guid.id);
         sez.serialize(attn->layer_guid.transformer_layer_id);
         sez.serialize(attn->layer_guid.model_id);
-        sez.serialize(attn->oProjSize);
+        sez.serialize(attn->o_dim);
         sez.serialize(attn->num_q_heads);
-        sez.serialize(attn->qProjSize);
-        sez.serialize(attn->vProjSize);
+        sez.serialize(attn->qk_dim);
+        sez.serialize(attn->v_dim);
         sez.serialize(attn->dropout);
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);
@@ -2354,10 +2354,10 @@ GraphOptimalViewSerialized
         sez.serialize(attn->layer_guid.id);
         sez.serialize(attn->layer_guid.transformer_layer_id);
         sez.serialize(attn->layer_guid.model_id);
-        sez.serialize(attn->oProjSize);
+        sez.serialize(attn->o_dim);
         sez.serialize(attn->num_q_heads);
-        sez.serialize(attn->qProjSize);
-        sez.serialize(attn->vProjSize);
+        sez.serialize(attn->qk_dim);
+        sez.serialize(attn->v_dim);
         sez.serialize(attn->dropout);
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);
@@ -2378,10 +2378,10 @@ GraphOptimalViewSerialized
         sez.serialize(attn->layer_guid.id);
         sez.serialize(attn->layer_guid.transformer_layer_id);
         sez.serialize(attn->layer_guid.model_id);
-        sez.serialize(attn->oProjSize);
+        sez.serialize(attn->o_dim);
         sez.serialize(attn->num_q_heads);
-        sez.serialize(attn->qProjSize);
-        sez.serialize(attn->vProjSize);
+        sez.serialize(attn->qk_dim);
+        sez.serialize(attn->v_dim);
         sez.serialize(attn->dropout);
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);

From 6eba778ac13a5ab58d3215ca4a87e4c8b9803f27 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 07:04:12 -0700
Subject: [PATCH 397/667] chore: minor

---
 .../inc_multihead_self_attention_kernels.h    | 40 ++++++++++++++-----
 .../inc_multihead_self_attention_kernels.cu   | 18 ---------
 src/ops/tree_inc_multihead_self_attention.cu  | 18 ---------
 3 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index d58cde507..d0939150a 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -14,21 +14,41 @@ namespace FlexFlow {
 namespace Kernels {
 namespace IncMultiHeadAttention {
 
+// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
+
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) *
+         hidden_size;
+}
+
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const hidden_size) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) *
+         hidden_size;
+}
+
 template <typename DT>
 void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
-                             GenericTensorAccessorR const weight,
-                             DataType data_type,
-                             ffStream_t stream);
+                      GenericTensorAccessorR const weight,
+                      DataType data_type,
+                      ffStream_t stream);
 
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        int shard_id,
-                        DT const *input_ptr,
-                        DT const *weight_ptr,
-                        DT *output_ptr,
-                        DT const *bias_ptr,
-                        ffStream_t stream);
+                 BatchConfig const *bc,
+                 int shard_id,
+                 DT const *input_ptr,
+                 DT const *weight_ptr,
+                 DT *output_ptr,
+                 DT const *bias_ptr,
+                 ffStream_t stream);
 
 template <typename DT>
 void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index d535116fb..dae783565 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -373,24 +373,6 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   //   }
 }
 
-__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) *
-         hidden_size;
-}
-
-__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) *
-         hidden_size;
-}
-
 template <typename DT>
 __global__ void
     update_qkv_cache_kernel(DT *devQKVProjArray,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 9e658fb7a..6c3b96e88 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -47,24 +47,6 @@ using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
-__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) *
-         hidden_size;
-}
-
-__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
-                                                     int const hidden_size) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) *
-         hidden_size;
-}
-
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,

From 77ac4fdf1ec1383068796d4d77cb529bb95bd67a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 14:39:33 -0700
Subject: [PATCH 398/667] feat: support GQA after compute_qkv, but got runtime
 error

---
 .../inc_multihead_self_attention_kernels.h    | 18 +++++---
 src/ops/inc_multihead_self_attention.cu       |  2 +-
 .../inc_multihead_self_attention_kernels.cu   | 44 ++++++++++++-------
 src/ops/spec_inc_multihead_self_attention.cu  |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 23 +++++-----
 src/runtime/request_manager.cu                |  8 ++--
 6 files changed, 58 insertions(+), 39 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index d0939150a..29d2cd1dd 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,23 +15,27 @@ namespace Kernels {
 namespace IncMultiHeadAttention {
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-
 __device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
                                                      int const token_idx,
                                                      int const max_num_pages,
-                                                     int const hidden_size) {
+                                                     int const num_heads,
+                                                     int const head_dim) {
   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) *
-         hidden_size;
+          token_idx % kPagesize) * /* page slot index */
+         num_heads *
+         head_dim;
 }
 
+// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
 __device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
                                                      int const token_idx,
                                                      int const max_num_pages,
-                                                     int const hidden_size) {
+                                                     int const num_heads,
+                                                     int const head_dim) {
   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) *
-         hidden_size;
+          kPagesize + token_idx % kPagesize) * /* page slot index */
+         num_heads *
+         head_dim;
 }
 
 template <typename DT>
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 7e44cf628..8cd624dae 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -84,7 +84,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-      num_q_heads,
+      num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index dae783565..dc81f9e6a 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -381,11 +381,16 @@ __global__ void
                             BatchConfig::PerTokenInfo const *tokenInfos,
                             BatchConfig::PerRequestInfo *request_infos,
                             int const max_num_pages,
-                            int hidden_size,
+                            int num_q_heads,
+                            int num_kv_heads,
+                            int head_dim,
                             int num_new_tokens) {
+  int const q_hidden_size = num_q_heads * head_dim;
+  int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
+  int const kv_hidden_size = num_kv_heads * head_dim;
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / hidden_size;
-  int const offset = thread_idx % hidden_size;
+  int const token_idx = thread_idx / q_hidden_size;
+  int const offset = thread_idx % q_hidden_size;
   if (token_idx >= num_new_tokens) {
     return;
   }
@@ -393,19 +398,24 @@ __global__ void
   int const req_idx = tokenInfos[token_idx].request_index;
   int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-  size_t from_idx = token_idx * QKV_WEIGHT_NUM * hidden_size;
-  size_t to_k_idx = get_k_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size),
-         to_v_idx = get_v_entry_offset(
-             req_idx, token_abs_idx, max_num_pages, hidden_size);
-
-  // key and value cache should be stored interleaved
-  kCache_ptr[to_k_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size + offset]);
-  kCache_ptr[to_v_idx + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + hidden_size * 2 + offset]);
-  qTmp_ptr[token_idx * hidden_size + offset] =
+  size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
+  qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(devQKVProjArray[from_idx + offset]);
+
+  if (offset < kv_hidden_size) {
+    size_t to_k_idx = get_k_entry_offset(
+              req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
+          to_v_idx = get_v_entry_offset(
+              req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
+    // key and value cache should be stored interleaved
+    int const stride = num_q_heads / num_kv_heads; // temporary hard code
+    int const kv_offset = offset / head_dim * stride * head_dim +
+                      offset % head_dim; // temporary hard code
+    kCache_ptr[to_k_idx + offset] =
+        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
+    kCache_ptr[to_v_idx + offset] =
+        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+  }
 }
 
 template <typename DT>
@@ -428,7 +438,9 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                                       m->token_infos,
                                       m->request_infos,
                                       max_num_pages,
-                                      m->local_hidden_size,
+                                      m->num_q_heads,
+                                      m->num_kv_heads,
+                                      m->qk_dim,
                                       num_new_tokens);
 }
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 1d645703c..2ea10b61c 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -82,7 +82,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-      num_q_heads,
+      num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6c3b96e88..d3e55f94f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -52,12 +52,14 @@ __global__ void commit_tokens_kernel(
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     bool const *request_available,
     int num_requests,
-    int hidden_size,
+    int num_kv_heads,
+    int head_dim,
     int const *num_committed_tokens,
     int const max_num_pages) {
+  int const kv_hidden_size = num_kv_heads * head_dim;
   int const idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const request_compact_idx = idx / hidden_size;
-  int const offset = idx % hidden_size;
+  int const request_compact_idx = idx / kv_hidden_size;
+  int const offset = idx % kv_hidden_size;
   // request id in batch config
   int requext_idx_in_batch = -1;
   int cnt_1 = 0;
@@ -79,13 +81,13 @@ __global__ void commit_tokens_kernel(
       int const tok_id = committedTokenInfos[i].token_depth;
 
       size_t from_k_idx = get_k_entry_offset(
-                 req_id, index_in_kv_cache, max_num_pages, hidden_size),
+                 req_id, index_in_kv_cache, max_num_pages, num_kv_heads, head_dim),
              from_v_idx = get_v_entry_offset(
-                 req_id, index_in_kv_cache, max_num_pages, hidden_size);
+                 req_id, index_in_kv_cache, max_num_pages, num_kv_heads, head_dim);
       size_t to_k_idx =
-                 get_k_entry_offset(req_id, tok_id, max_num_pages, hidden_size),
+                 get_k_entry_offset(req_id, tok_id, max_num_pages, num_kv_heads, head_dim),
              to_v_idx =
-                 get_v_entry_offset(req_id, tok_id, max_num_pages, hidden_size);
+                 get_v_entry_offset(req_id, tok_id, max_num_pages, num_kv_heads, head_dim);
       assert(to_k_idx <= from_k_idx);
 
       kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
@@ -107,7 +109,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
        BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
       kPagesize;
   int const num_requests = bc->num_active_requests();
-  int parallelism = m->local_hidden_size * num_requests;
+  int parallelism = m->num_kv_heads * m->qk_dim * num_requests;
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
@@ -115,7 +117,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                                    m->committed_token_infos,
                                    m->request_available,
                                    num_requests,
-                                   m->local_hidden_size,
+                                   m->num_kv_heads,
+                                   m->qk_dim,
                                    m->num_tokens_to_commit,
                                    max_num_pages);
   //   cudaEventRecord(t_end, stream);
@@ -165,7 +168,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
-      num_q_heads,
+      num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index b2b9ca803..27abbe935 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -324,7 +324,7 @@ void RequestManager::load_batch_config_task(
                 static_cast<int32_t *>(kv_last_page_len_h),
                 batch_size,
                 handle.incr_attention_metadata->num_q_heads(),
-                handle.incr_attention_metadata->num_q_heads(),
+                handle.incr_attention_metadata->num_kv_heads(),
                 kPagesize);
           });
         } else {
@@ -351,7 +351,7 @@ void RequestManager::load_batch_config_task(
               static_cast<int32_t *>(kv_indptr_h),
               batch_size,
               handle.incr_attention_metadata->num_q_heads(),
-              handle.incr_attention_metadata->num_q_heads(),
+              handle.incr_attention_metadata->num_kv_heads(),
               handle.incr_attention_metadata->head_dim(),
               kPagesize);
         }
@@ -499,7 +499,7 @@ void RequestManager::load_batch_config_task(
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
             handle.tree_search_attention_metadata->num_q_heads(),
-            handle.tree_search_attention_metadata->num_q_heads(),
+            handle.tree_search_attention_metadata->num_kv_heads(),
             handle.tree_search_attention_metadata->head_dim(),
             kPagesize);
       }
@@ -665,7 +665,7 @@ void RequestManager::load_batch_config_task(
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
             handle.tree_verify_attention_metadata->num_q_heads(),
-            handle.tree_verify_attention_metadata->num_q_heads(),
+            handle.tree_verify_attention_metadata->num_kv_heads(),
             handle.tree_verify_attention_metadata->head_dim(),
             kPagesize);
       }

From 880309e89f997d77920b2cbdfad516b3eba6aa0b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 15:54:31 -0700
Subject: [PATCH 399/667] feat: update flashinfer version

---
 deps/flashinfer                              |   2 +-
 include/flexflow/config.h                    |  31 +-
 src/ops/attention_impl.cu                    | 324 ++++++++++++++++---
 src/ops/inc_multihead_self_attention.cu      |   8 +-
 src/ops/spec_inc_multihead_self_attention.cu |   9 +-
 src/ops/tree_inc_multihead_self_attention.cu |   9 +-
 src/runtime/request_manager.cu               |  49 ++-
 7 files changed, 358 insertions(+), 74 deletions(-)

diff --git a/deps/flashinfer b/deps/flashinfer
index 457eb7893..be6bf5bb2 160000
--- a/deps/flashinfer
+++ b/deps/flashinfer
@@ -1 +1 @@
-Subproject commit 457eb7893f3fbf3d7bd087a5f0e111261cf2a5b2
+Subproject commit be6bf5bb26f1f1b3edf094d903544600c574ee09
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index ba239237e..1168478f5 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -18,6 +18,7 @@
 #include "ffconst.h"
 #include "flexflow/batch_config.h"
 #include "legion.h"
+#include <cstddef>
 #include <cstring>
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include <cublas_v2.h>
@@ -104,7 +105,11 @@ class AttentionMetaData {
     qk_indptr = nullptr;
     custom_mask = nullptr;
     workspace = nullptr;
-    workspace_block = 0;
+    workspace_size = 0;
+    float_workspace = nullptr;
+    float_workspace_size = 0;
+    int_workspace = nullptr;
+    int_workspace_size = 0;
     mem_size_ = 0;
     enabled_ = false;
   }
@@ -119,7 +124,11 @@ class AttentionMetaData {
     qk_indptr = rhs.qk_indptr;
     custom_mask = rhs.custom_mask;
     workspace = rhs.workspace;
-    workspace_block = rhs.workspace_block;
+    workspace_size = rhs.workspace_size;
+    float_workspace = rhs.float_workspace;
+    float_workspace_size = rhs.float_workspace_size;
+    int_workspace = rhs.int_workspace;
+    int_workspace_size = rhs.int_workspace_size;
     mem_size_ = rhs.mem_size_;
     enabled_ = rhs.enabled_;
     decode_handler_collections = rhs.decode_handler_collections;
@@ -143,11 +152,14 @@ class AttentionMetaData {
                                      BatchConfig::max_sequence_length()) +
                                 7) /
                                8);
-    workspace_block = 16 * 1024 * 1024; // 16MB
+
+    float_workspace_size = 32 * 1024 * 1024; // 32MB
+    int_workspace_size = 8 * 1024 * 1024;    // 8MB
+    workspace_size = float_workspace_size + int_workspace_size; // float + int workspace
 
     mem_size_ = sizeof(int32_t) * indices_size +
                 sizeof(uint8_t) * custom_mask_size +
-                workspace_block * BatchConfig::max_requests_per_batch();
+                workspace_size * BatchConfig::max_requests_per_batch();
     return mem_size_;
   }
 
@@ -160,6 +172,8 @@ class AttentionMetaData {
       qk_indptr = nullptr;
       custom_mask = nullptr;
       workspace = nullptr;
+      float_workspace = nullptr;
+      int_workspace = nullptr;
       return;
     }
     assert(size >= mem_size() &&
@@ -187,6 +201,9 @@ class AttentionMetaData {
     workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
                                     sizeof(int32_t) * indices_size +
                                     sizeof(uint8_t) * custom_mask_size);
+    float_workspace = workspace;
+    int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
+                                        float_workspace_size);
   }
 
   void set_num_q_heads(uint32_t const num_q_heads) {
@@ -226,7 +243,11 @@ class AttentionMetaData {
   int32_t *qk_indptr;
   uint8_t *custom_mask;
   void *workspace;
-  size_t workspace_block;
+  size_t workspace_size;
+  void * float_workspace;
+  size_t float_workspace_size;
+  void * int_workspace;
+  size_t int_workspace_size;
 
   size_t mem_size_;
 
diff --git a/src/ops/attention_impl.cu b/src/ops/attention_impl.cu
index 4ee9528d4..f3cc8df92 100644
--- a/src/ops/attention_impl.cu
+++ b/src/ops/attention_impl.cu
@@ -22,7 +22,8 @@ namespace flashinfer {
 
 // warp_layout_literal[] = {
 //   "WarpLayout::k4x1x2",
-//   "WarpLayout::k4x1x1"
+//   "WarpLayout::k4x1x1",
+//   "WarpLayout::k1x4x1",
 // }
 // head_dim[] = {64, 128, 256};
 
@@ -33,12 +34,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            64,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -46,8 +47,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -61,6 +61,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -72,12 +73,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            128,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -85,8 +86,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -100,6 +100,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -111,12 +112,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            256,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -124,8 +125,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -139,6 +139,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -150,12 +151,12 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            64,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -163,8 +164,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -178,6 +178,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -189,12 +190,12 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            128,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -202,8 +203,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -217,6 +217,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -228,12 +229,129 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            256,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCustom,
                                            half,
                                            half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCustom,
+                                           half,
+                                           half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -241,8 +359,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -256,6 +373,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -269,12 +387,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            64,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -282,8 +400,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -297,6 +414,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -308,12 +426,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            128,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -321,8 +439,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -336,6 +453,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -347,12 +465,12 @@ template cudaError_t
                                            WarpLayout::k4x1x2,
                                            256,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -360,8 +478,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -375,6 +492,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -386,12 +504,12 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            64,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -399,8 +517,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -414,6 +531,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -425,12 +543,12 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            128,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -438,8 +556,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -453,6 +570,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -464,12 +582,129 @@ template cudaError_t
                                            WarpLayout::k4x1x1,
                                            256,
                                            LogitsPostHook::kNone,
-                                           QKVLayout::kNHD,
                                            PosEncodingMode::kNone,
                                            false,
                                            MaskMode::kCausal,
                                            half,
                                            half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           64,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           128,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           half,
+                                           int32_t>(
+        half *q,
+        int32_t *request_indices,
+        int32_t *q_tile_indices,
+        int32_t *kv_tile_indices,
+        int32_t *q_indptr,
+        int32_t *q_offset,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
+        uint8_t *custom_mask,
+        int32_t *qk_indptr,
+        int32_t *o_indptr,
+        half *o,
+        half *tmp_v,
+        float *tmp_s,
+        float *lse,
+        int32_t *merge_indptr,
+        bool *block_valid_mask,
+        int32_t *kv_chunk_size_ptr,
+        uint32_t total_num_rows,
+        uint32_t num_qo_heads,
+        uint32_t padded_batch_size,
+        int32_t window_left,
+        float logits_soft_cap,
+        float sm_scale,
+        float rope_scale,
+        float rope_theta,
+        cudaStream_t stream);
+
+template cudaError_t
+    BatchPrefillWithPagedKVCacheDispatched<PageStorage::kIndices,
+                                           WarpLayout::k1x4x1,
+                                           256,
+                                           LogitsPostHook::kNone,
+                                           PosEncodingMode::kNone,
+                                           false,
+                                           MaskMode::kCausal,
+                                           half,
+                                           half,
+                                           half,
                                            int32_t>(
         half *q,
         int32_t *request_indices,
@@ -477,8 +712,7 @@ template cudaError_t
         int32_t *kv_tile_indices,
         int32_t *q_indptr,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         uint8_t *custom_mask,
         int32_t *qk_indptr,
         int32_t *o_indptr,
@@ -492,6 +726,7 @@ template cudaError_t
         uint32_t total_num_rows,
         uint32_t num_qo_heads,
         uint32_t padded_batch_size,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -503,7 +738,6 @@ template cudaError_t
     BatchDecodeWithPagedKVCacheDispatched<64,
                                           PageStorage::kIndices,
                                           LogitsPostHook::kNone,
-                                          QKVLayout::kNHD,
                                           PosEncodingMode::kNone,
                                           half,
                                           half,
@@ -511,8 +745,7 @@ template cudaError_t
                                           int32_t>(
         half *q,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         kv_partition_info_t<int32_t> kv_partition_info,
         half *o,
         half *tmp_v,
@@ -521,6 +754,7 @@ template cudaError_t
         bool *block_valid_mask,
         uint32_t padded_batch_size,
         uint32_t num_qo_heads,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -531,7 +765,6 @@ template cudaError_t
     BatchDecodeWithPagedKVCacheDispatched<128,
                                           PageStorage::kIndices,
                                           LogitsPostHook::kNone,
-                                          QKVLayout::kNHD,
                                           PosEncodingMode::kNone,
                                           half,
                                           half,
@@ -539,8 +772,7 @@ template cudaError_t
                                           int32_t>(
         half *q,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         kv_partition_info_t<int32_t> kv_partition_info,
         half *o,
         half *tmp_v,
@@ -549,6 +781,7 @@ template cudaError_t
         bool *block_valid_mask,
         uint32_t padded_batch_size,
         uint32_t num_qo_heads,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
@@ -559,7 +792,6 @@ template cudaError_t
     BatchDecodeWithPagedKVCacheDispatched<256,
                                           PageStorage::kIndices,
                                           LogitsPostHook::kNone,
-                                          QKVLayout::kNHD,
                                           PosEncodingMode::kNone,
                                           half,
                                           half,
@@ -567,8 +799,7 @@ template cudaError_t
                                           int32_t>(
         half *q,
         int32_t *q_offset,
-        paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t>
-            paged_kv,
+        paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv,
         kv_partition_info_t<int32_t> kv_partition_info,
         half *o,
         half *tmp_v,
@@ -577,6 +808,7 @@ template cudaError_t
         bool *block_valid_mask,
         uint32_t padded_batch_size,
         uint32_t num_qo_heads,
+        int32_t window_left,
         float logits_soft_cap,
         float sm_scale,
         float rope_scale,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 8cd624dae..a191a41f6 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -83,11 +83,12 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
   half *q = static_cast<half *>(m->queryTmp),
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+  paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
+      QKVLayout::kNHD,
       kv,
       m->handle.incr_attention_metadata->kv_indices,
       m->handle.incr_attention_metadata->kv_indptr,
@@ -144,12 +145,12 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
           BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                         HEAD_DIM,
                                                         LogitsPostHook::kNone,
-                                                        QKVLayout::kNHD,
                                                         PosEncodingMode::kNone,
                                                         false,
                                                         MaskMode::kCausal,
                                                         half,
                                                         half,
+                                                        half,
                                                         int32_t>(
               static_cast<BatchPrefillHandler *>(handler),
               q,
@@ -161,6 +162,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
@@ -171,7 +173,6 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
           BatchDecodeWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                        HEAD_DIM,
                                                        LogitsPostHook::kNone,
-                                                       QKVLayout::kNHD,
                                                        PosEncodingMode::kNone,
                                                        half,
                                                        half,
@@ -184,6 +185,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 2ea10b61c..f09ab5ed8 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -81,11 +81,12 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   half *q = static_cast<half *>(m->queryTmp),
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+  paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
+      QKVLayout::kNHD,
       kv,
       m->handle.tree_search_attention_metadata->kv_indices,
       m->handle.tree_search_attention_metadata->kv_indptr,
@@ -144,12 +145,12 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
           BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                         HEAD_DIM,
                                                         LogitsPostHook::kNone,
-                                                        QKVLayout::kNHD,
                                                         PosEncodingMode::kNone,
                                                         false,
                                                         MaskMode::kCausal,
                                                         half,
                                                         half,
+                                                        half,
                                                         int32_t>(
               handler,
               q,
@@ -161,6 +162,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
@@ -171,12 +173,12 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
           BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                         HEAD_DIM,
                                                         LogitsPostHook::kNone,
-                                                        QKVLayout::kNHD,
                                                         PosEncodingMode::kNone,
                                                         false,
                                                         MaskMode::kCustom,
                                                         half,
                                                         half,
+                                                        half,
                                                         int32_t>(
               handler,
               q,
@@ -188,6 +190,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d3e55f94f..e687728b8 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -167,11 +167,12 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   half *q = static_cast<half *>(m->queryTmp),
        *kv = static_cast<half *>(m->keyCache),
        *o = static_cast<half *>(m->outputTmp);
-  paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> paged_kv(
+  paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,
       kPagesize,
       head_dim,
       batch_size,
+      QKVLayout::kNHD,
       kv,
       m->handle.tree_verify_attention_metadata->kv_indices,
       m->handle.tree_verify_attention_metadata->kv_indptr,
@@ -230,12 +231,12 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
           BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                         HEAD_DIM,
                                                         LogitsPostHook::kNone,
-                                                        QKVLayout::kNHD,
                                                         PosEncodingMode::kNone,
                                                         false,
                                                         MaskMode::kCausal,
                                                         half,
                                                         half,
+                                                        half,
                                                         int32_t>(
               handler,
               q,
@@ -247,6 +248,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
@@ -257,12 +259,12 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
           BatchPrefillWithPagedKVCacheWrapperDispatched<PageStorage::kIndices,
                                                         HEAD_DIM,
                                                         LogitsPostHook::kNone,
-                                                        QKVLayout::kNHD,
                                                         PosEncodingMode::kNone,
                                                         false,
                                                         MaskMode::kCustom,
                                                         half,
                                                         half,
+                                                        half,
                                                         int32_t>(
               handler,
               q,
@@ -274,6 +276,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
               o,
               /*lse=*/nullptr,
               num_q_heads,
+              /*window_left=*/-1,
               /*logits_soft_cap=*/0.f,
               sm_scale,
               /*rope_scale=*/1.f,
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 27abbe935..76de89705 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -308,7 +308,6 @@ void RequestManager::load_batch_config_task(
             handler->BeginForwardDispatched<HEAD_DIM,
                                             PageStorage::kIndices,
                                             LogitsPostHook::kNone,
-                                            QKVLayout::kNHD,
                                             PosEncodingMode::kNone,
                                             half,
                                             half,
@@ -316,10 +315,16 @@ void RequestManager::load_batch_config_task(
                                             int32_t>(
                 static_cast<void *>(
                     static_cast<char *>(
-                        handle.incr_attention_metadata->workspace) +
-                    handle.incr_attention_metadata->workspace_block *
+                        handle.incr_attention_metadata->float_workspace) +
+                    handle.incr_attention_metadata->workspace_size *
                         batch_size),
-                handle.incr_attention_metadata->workspace_block,
+                handle.incr_attention_metadata->float_workspace_size,
+                static_cast<void *>(
+                    static_cast<char *>(
+                        handle.incr_attention_metadata->int_workspace) +
+                    handle.incr_attention_metadata->workspace_size *
+                        batch_size),
+                handle.incr_attention_metadata->int_workspace_size,
                 static_cast<int32_t *>(kv_indptr_h),
                 static_cast<int32_t *>(kv_last_page_len_h),
                 batch_size,
@@ -343,10 +348,16 @@ void RequestManager::load_batch_config_task(
           handler->BeginForward<half, int32_t>(
               static_cast<void *>(
                   static_cast<char *>(
-                      handle.incr_attention_metadata->workspace) +
-                  handle.incr_attention_metadata->workspace_block *
+                      handle.incr_attention_metadata->float_workspace) +
+                  handle.incr_attention_metadata->workspace_size *
+                      batch_size),
+              handle.incr_attention_metadata->float_workspace_size,
+              static_cast<void *>(
+                  static_cast<char *>(
+                      handle.incr_attention_metadata->int_workspace) +
+                  handle.incr_attention_metadata->workspace_size *
                       batch_size),
-              handle.incr_attention_metadata->workspace_block,
+              handle.incr_attention_metadata->int_workspace_size,
               static_cast<int32_t *>(q_indptr_h),
               static_cast<int32_t *>(kv_indptr_h),
               batch_size,
@@ -491,10 +502,16 @@ void RequestManager::load_batch_config_task(
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.tree_search_attention_metadata->workspace) +
-                handle.tree_search_attention_metadata->workspace_block *
+                    handle.incr_attention_metadata->float_workspace) +
+                handle.incr_attention_metadata->workspace_size *
                     batch_size),
-            handle.tree_search_attention_metadata->workspace_block,
+            handle.incr_attention_metadata->float_workspace_size,
+            static_cast<void *>(
+                static_cast<char *>(
+                    handle.incr_attention_metadata->int_workspace) +
+                handle.incr_attention_metadata->workspace_size *
+                    batch_size),
+            handle.incr_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
@@ -657,10 +674,16 @@ void RequestManager::load_batch_config_task(
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.tree_verify_attention_metadata->workspace) +
-                handle.tree_verify_attention_metadata->workspace_block *
+                    handle.incr_attention_metadata->float_workspace) +
+                handle.incr_attention_metadata->workspace_size *
+                    batch_size),
+            handle.incr_attention_metadata->float_workspace_size,
+            static_cast<void *>(
+                static_cast<char *>(
+                    handle.incr_attention_metadata->int_workspace) +
+                handle.incr_attention_metadata->workspace_size *
                     batch_size),
-            handle.tree_verify_attention_metadata->workspace_block,
+            handle.incr_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,

From e75137b3125e549879d109a3a07f9ab140748b35 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 16:08:43 -0700
Subject: [PATCH 400/667] chore: minor

---
 src/runtime/request_manager.cu | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 76de89705..b4ec4ab09 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -502,16 +502,16 @@ void RequestManager::load_batch_config_task(
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.incr_attention_metadata->float_workspace) +
-                handle.incr_attention_metadata->workspace_size *
+                    handle.tree_search_attention_metadata->float_workspace) +
+                handle.tree_search_attention_metadata->workspace_size *
                     batch_size),
-            handle.incr_attention_metadata->float_workspace_size,
+            handle.tree_search_attention_metadata->float_workspace_size,
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.incr_attention_metadata->int_workspace) +
-                handle.incr_attention_metadata->workspace_size *
+                    handle.tree_search_attention_metadata->int_workspace) +
+                handle.tree_search_attention_metadata->workspace_size *
                     batch_size),
-            handle.incr_attention_metadata->int_workspace_size,
+            handle.tree_search_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,
@@ -674,16 +674,16 @@ void RequestManager::load_batch_config_task(
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.incr_attention_metadata->float_workspace) +
-                handle.incr_attention_metadata->workspace_size *
+                    handle.tree_verify_attention_metadata->float_workspace) +
+                handle.tree_verify_attention_metadata->workspace_size *
                     batch_size),
-            handle.incr_attention_metadata->float_workspace_size,
+            handle.tree_verify_attention_metadata->float_workspace_size,
             static_cast<void *>(
                 static_cast<char *>(
-                    handle.incr_attention_metadata->int_workspace) +
-                handle.incr_attention_metadata->workspace_size *
+                    handle.tree_verify_attention_metadata->int_workspace) +
+                handle.tree_verify_attention_metadata->workspace_size *
                     batch_size),
-            handle.incr_attention_metadata->int_workspace_size,
+            handle.tree_verify_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
             batch_size,

From e34bde90d1f1d1ce95be6dcc95b5ae77cd760db6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 20:14:14 -0700
Subject: [PATCH 401/667] fix: reserve enough space for batch_handler

---
 include/flexflow/config.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 1168478f5..ce6d85705 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -153,8 +153,8 @@ class AttentionMetaData {
                                 7) /
                                8);
 
-    float_workspace_size = 32 * 1024 * 1024; // 32MB
-    int_workspace_size = 8 * 1024 * 1024;    // 8MB
+    float_workspace_size = 128 * 1024 * 1024; // 128 MB
+    int_workspace_size = 8 * 1024 * 1024;    // 8 MB
     workspace_size = float_workspace_size + int_workspace_size; // float + int workspace
 
     mem_size_ = sizeof(int32_t) * indices_size +

From 739a333183813237467e883c0e371429e2a8bbdd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 21:28:25 -0700
Subject: [PATCH 402/667] feat: significantly reduce memory consumption of
 batch_handler

---
 include/flexflow/config.h      |  2 +-
 src/runtime/request_manager.cu | 40 +++++++---------------------------
 2 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index ce6d85705..4c247fa73 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -159,7 +159,7 @@ class AttentionMetaData {
 
     mem_size_ = sizeof(int32_t) * indices_size +
                 sizeof(uint8_t) * custom_mask_size +
-                workspace_size * BatchConfig::max_requests_per_batch();
+                workspace_size;
     return mem_size_;
   }
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index b4ec4ab09..06925e3a7 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -314,16 +314,10 @@ void RequestManager::load_batch_config_task(
                                             half,
                                             int32_t>(
                 static_cast<void *>(
-                    static_cast<char *>(
-                        handle.incr_attention_metadata->float_workspace) +
-                    handle.incr_attention_metadata->workspace_size *
-                        batch_size),
+                        handle.incr_attention_metadata->float_workspace),
                 handle.incr_attention_metadata->float_workspace_size,
                 static_cast<void *>(
-                    static_cast<char *>(
-                        handle.incr_attention_metadata->int_workspace) +
-                    handle.incr_attention_metadata->workspace_size *
-                        batch_size),
+                        handle.incr_attention_metadata->int_workspace),
                 handle.incr_attention_metadata->int_workspace_size,
                 static_cast<int32_t *>(kv_indptr_h),
                 static_cast<int32_t *>(kv_last_page_len_h),
@@ -347,16 +341,10 @@ void RequestManager::load_batch_config_task(
           handler->SetCUDAStream(stream);
           handler->BeginForward<half, int32_t>(
               static_cast<void *>(
-                  static_cast<char *>(
-                      handle.incr_attention_metadata->float_workspace) +
-                  handle.incr_attention_metadata->workspace_size *
-                      batch_size),
+                      handle.incr_attention_metadata->float_workspace),
               handle.incr_attention_metadata->float_workspace_size,
               static_cast<void *>(
-                  static_cast<char *>(
-                      handle.incr_attention_metadata->int_workspace) +
-                  handle.incr_attention_metadata->workspace_size *
-                      batch_size),
+                      handle.incr_attention_metadata->int_workspace),
               handle.incr_attention_metadata->int_workspace_size,
               static_cast<int32_t *>(q_indptr_h),
               static_cast<int32_t *>(kv_indptr_h),
@@ -501,16 +489,10 @@ void RequestManager::load_batch_config_task(
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
-                static_cast<char *>(
-                    handle.tree_search_attention_metadata->float_workspace) +
-                handle.tree_search_attention_metadata->workspace_size *
-                    batch_size),
+                    handle.tree_search_attention_metadata->float_workspace),
             handle.tree_search_attention_metadata->float_workspace_size,
             static_cast<void *>(
-                static_cast<char *>(
-                    handle.tree_search_attention_metadata->int_workspace) +
-                handle.tree_search_attention_metadata->workspace_size *
-                    batch_size),
+                    handle.tree_search_attention_metadata->int_workspace),
             handle.tree_search_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
@@ -673,16 +655,10 @@ void RequestManager::load_batch_config_task(
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
-                static_cast<char *>(
-                    handle.tree_verify_attention_metadata->float_workspace) +
-                handle.tree_verify_attention_metadata->workspace_size *
-                    batch_size),
+                    handle.tree_verify_attention_metadata->float_workspace),
             handle.tree_verify_attention_metadata->float_workspace_size,
             static_cast<void *>(
-                static_cast<char *>(
-                    handle.tree_verify_attention_metadata->int_workspace) +
-                handle.tree_verify_attention_metadata->workspace_size *
-                    batch_size),
+                    handle.tree_verify_attention_metadata->int_workspace),
             handle.tree_verify_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),

From 3902b74eee9f369aea57a59402116d6df3a6c4a7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 21:33:41 -0700
Subject: [PATCH 403/667] chore: minor

---
 include/flexflow/ops/inc_multihead_self_attention.h     | 2 +-
 src/ops/inc_multihead_self_attention.cu                 | 6 ++----
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 2 +-
 src/ops/spec_inc_multihead_self_attention.cu            | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu            | 4 ++--
 5 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 2db071a56..5a90dd61b 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -184,7 +184,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   bool *position_bias;
   float scaling_factor;
   void *weight_ptr, *bias_ptr; // for weight offload
-  void *devQKVProjArray, *queryTmp, *keyCache, *valueCache;
+  void *devQKVProjArray, *queryTmp, *kvCache;
   half *outputTmp;
   void *qk_prods, *qk_prods_softmax;
   void *attn_heads;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index a191a41f6..fe1e842d2 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -81,7 +81,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
   //   }
 
   half *q = static_cast<half *>(m->queryTmp),
-       *kv = static_cast<half *>(m->keyCache),
+       *kv = static_cast<half *>(m->kvCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,
@@ -585,10 +585,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size *
                                                              size_of_dt);
     }
-    keyCache = gpu_mem_allocator.allocate_instance_untyped(
+    kvCache = gpu_mem_allocator.allocate_instance_untyped(
         (key_cache_size + value_cache_size) * size_of_dt);
-    valueCache = static_cast<void *>(static_cast<char *>(keyCache) +
-                                     key_cache_size * size_of_dt);
     outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
 
     token_infos =
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index dc81f9e6a..d3a88fcf3 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -434,7 +434,7 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                             0,
                             stream>>>(static_cast<DT *>(m->devQKVProjArray),
                                       static_cast<half *>(m->queryTmp),
-                                      static_cast<half *>(m->keyCache),
+                                      static_cast<half *>(m->kvCache),
                                       m->token_infos,
                                       m->request_infos,
                                       max_num_pages,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index f09ab5ed8..d9b7b95e3 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -79,7 +79,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   //   }
 
   half *q = static_cast<half *>(m->queryTmp),
-       *kv = static_cast<half *>(m->keyCache),
+       *kv = static_cast<half *>(m->kvCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index e687728b8..2a2d7da93 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -113,7 +113,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
-                         stream>>>(static_cast<half *>(m->keyCache),
+                         stream>>>(static_cast<half *>(m->kvCache),
                                    m->committed_token_infos,
                                    m->request_available,
                                    num_requests,
@@ -165,7 +165,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   //   }
 
   half *q = static_cast<half *>(m->queryTmp),
-       *kv = static_cast<half *>(m->keyCache),
+       *kv = static_cast<half *>(m->kvCache),
        *o = static_cast<half *>(m->outputTmp);
   paged_kv_t<PageStorage::kIndices, half, int32_t> paged_kv(
       num_kv_heads,

From ce40f7e8e8ef45916be434ca5fd73571643968e0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 21:37:08 -0700
Subject: [PATCH 404/667] style: format code

---
 include/flexflow/config.h                     |  12 +-
 include/flexflow/model.h                      |  34 +++---
 inference/models/llama.cc                     |  66 +++++------
 src/c/flexflow_c.cc                           |  34 +++---
 src/ops/inc_multihead_self_attention.cc       | 109 ++++++++----------
 src/ops/inc_multihead_self_attention.cpp      |  34 +++---
 src/ops/inc_multihead_self_attention.cu       |  32 +++--
 .../inc_multihead_self_attention_kernels.cu   |  42 +++----
 src/ops/spec_inc_multihead_self_attention.cc  |  35 +++---
 src/ops/spec_inc_multihead_self_attention.cpp |  14 +--
 src/ops/spec_inc_multihead_self_attention.cu  |  17 ++-
 src/ops/tree_inc_multihead_self_attention.cc  |  41 +++----
 src/ops/tree_inc_multihead_self_attention.cpp |  17 +--
 src/ops/tree_inc_multihead_self_attention.cu  |  39 ++++---
 src/runtime/file_loader.cc                    |   2 +-
 src/runtime/request_manager.cu                |  57 ++++-----
 16 files changed, 285 insertions(+), 300 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 4c247fa73..0e9325f09 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -154,12 +154,12 @@ class AttentionMetaData {
                                8);
 
     float_workspace_size = 128 * 1024 * 1024; // 128 MB
-    int_workspace_size = 8 * 1024 * 1024;    // 8 MB
-    workspace_size = float_workspace_size + int_workspace_size; // float + int workspace
+    int_workspace_size = 8 * 1024 * 1024;     // 8 MB
+    workspace_size =
+        float_workspace_size + int_workspace_size; // float + int workspace
 
     mem_size_ = sizeof(int32_t) * indices_size +
-                sizeof(uint8_t) * custom_mask_size +
-                workspace_size;
+                sizeof(uint8_t) * custom_mask_size + workspace_size;
     return mem_size_;
   }
 
@@ -244,9 +244,9 @@ class AttentionMetaData {
   uint8_t *custom_mask;
   void *workspace;
   size_t workspace_size;
-  void * float_workspace;
+  void *float_workspace;
   size_t float_workspace_size;
-  void * int_workspace;
+  void *int_workspace;
   size_t int_workspace_size;
 
   size_t mem_size_;
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 8c8b90ef8..948feb364 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -762,23 +762,23 @@ class FFModel {
       bool position_bias = false,
       char const *name = NULL);
   Tensor groupquery_self_attention(Tensor const input,
-                                       int embed_dim,
-                                       int num_q_heads,
-                                       int num_kv_heads,
-                                       int kdim = 0,
-                                       int vdim = 0,
-                                       float dropout = 0.0f,
-                                       bool bias = false,
-                                       bool add_bias_kv = false,
-                                       bool add_zero_attn = false,
-                                       DataType data_type = DT_NONE,
-                                       Initializer *kernel_initializer = NULL,
-                                       bool apply_rotary_embedding = false,
-                                       bool scaling_query = false,
-                                       float scaling_factor = 1.0f,
-                                       bool qk_prod_scaling = true,
-                                       bool position_bias = false,
-                                       char const *name = NULL);
+                                   int embed_dim,
+                                   int num_q_heads,
+                                   int num_kv_heads,
+                                   int kdim = 0,
+                                   int vdim = 0,
+                                   float dropout = 0.0f,
+                                   bool bias = false,
+                                   bool add_bias_kv = false,
+                                   bool add_zero_attn = false,
+                                   DataType data_type = DT_NONE,
+                                   Initializer *kernel_initializer = NULL,
+                                   bool apply_rotary_embedding = false,
+                                   bool scaling_query = false,
+                                   float scaling_factor = 1.0f,
+                                   bool qk_prod_scaling = true,
+                                   bool position_bias = false,
+                                   char const *name = NULL);
   Tensor
       spec_inc_multiquery_self_attention(Tensor const input,
                                          int embed_dim,
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index a18a909f9..a1f4d370f 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -183,47 +183,47 @@ void LLAMA::create_llama_model(FFModel &ff,
     token = token_ff_norm[0];
     Tensor ff_norm = token_ff_norm[1];
 
-    Tensor w1 = ff.dense(
-        ff_norm,
-        llama_config.intermediate_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
+    Tensor w1 =
+        ff.dense(ff_norm,
+                 llama_config.intermediate_size,
+                 AC_MODE_NONE,
+                 false,
+                 DT_NONE,
+                 nullptr,
+                 nullptr,
+                 nullptr,
+                 REG_MODE_NONE,
+                 0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w1")
                      .c_str());
 
-    Tensor w3 = ff.dense(
-        ff_norm,
-        llama_config.intermediate_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
+    Tensor w3 =
+        ff.dense(ff_norm,
+                 llama_config.intermediate_size,
+                 AC_MODE_NONE,
+                 false,
+                 DT_NONE,
+                 nullptr,
+                 nullptr,
+                 nullptr,
+                 REG_MODE_NONE,
+                 0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w3")
                      .c_str());
 
     Tensor multi = ff.sigmoid_silu_multi(w1, w3);
 
-    w2 = ff.dense(
-        multi,
-        llama_config.hidden_size,
-        AC_MODE_NONE,
-        false,
-        DT_NONE,
-        nullptr,
-        nullptr,
-        nullptr,
-        REG_MODE_NONE,
-        0.0f,
+    w2 =
+        ff.dense(multi,
+                 llama_config.hidden_size,
+                 AC_MODE_NONE,
+                 false,
+                 DT_NONE,
+                 nullptr,
+                 nullptr,
+                 nullptr,
+                 REG_MODE_NONE,
+                 0.0f,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w2")
                      .c_str());
   }
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index e371a0cdf..d086d6d16 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1339,23 +1339,23 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
   Tensor tensor = handle->groupquery_self_attention(input,
-                                                        embed_dim,
-                                                        num_q_heads,
-                                                        num_kv_heads,
-                                                        kdim,
-                                                        vdim,
-                                                        dropout,
-                                                        bias,
-                                                        add_bias_kv,
-                                                        add_zero_attn,
-                                                        data_type,
-                                                        kernel_initializer,
-                                                        apply_rotary_embedding,
-                                                        scaling_query,
-                                                        scaling_factor,
-                                                        qk_prod_scaling,
-                                                        position_bias,
-                                                        name);
+                                                    embed_dim,
+                                                    num_q_heads,
+                                                    num_kv_heads,
+                                                    kdim,
+                                                    vdim,
+                                                    dropout,
+                                                    bias,
+                                                    add_bias_kv,
+                                                    add_zero_attn,
+                                                    data_type,
+                                                    kernel_initializer,
+                                                    apply_rotary_embedding,
+                                                    scaling_query,
+                                                    scaling_factor,
+                                                    qk_prod_scaling,
+                                                    position_bias,
+                                                    name);
   return FFCObjectWrapper::wrap(tensor);
 }
 
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index bff398104..c35a07a4e 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -72,43 +72,43 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                              bool position_bias,
                                              char const *name) {
   return groupquery_self_attention(input,
-                                       embed_dim,
-                                       num_heads,
-                                       num_heads,
-                                       kdim,
-                                       vdim,
-                                       dropout,
-                                       qkv_bias,
-                                       final_bias,
-                                       add_zero_attn,
-                                       data_type,
-                                       kernel_initializer,
-                                       apply_rotary_embedding,
-                                       scaling_query,
-                                       scaling_factor,
-                                       qk_prod_scaling,
-                                       position_bias,
-                                       name);
+                                   embed_dim,
+                                   num_heads,
+                                   num_heads,
+                                   kdim,
+                                   vdim,
+                                   dropout,
+                                   qkv_bias,
+                                   final_bias,
+                                   add_zero_attn,
+                                   data_type,
+                                   kernel_initializer,
+                                   apply_rotary_embedding,
+                                   scaling_query,
+                                   scaling_factor,
+                                   qk_prod_scaling,
+                                   position_bias,
+                                   name);
 }
 
 Tensor FFModel::groupquery_self_attention(const Tensor input,
-                                              int embed_dim,
-                                              int num_q_heads,
-                                              int num_kv_heads,
-                                              int kdim,
-                                              int vdim,
-                                              float dropout,
-                                              bool qkv_bias,
-                                              bool final_bias,
-                                              bool add_zero_attn,
-                                              DataType data_type,
-                                              Initializer *kernel_initializer,
-                                              bool apply_rotary_embedding,
-                                              bool scaling_query,
-                                              float scaling_factor,
-                                              bool qk_prod_scaling,
-                                              bool position_bias,
-                                              char const *name) {
+                                          int embed_dim,
+                                          int num_q_heads,
+                                          int num_kv_heads,
+                                          int kdim,
+                                          int vdim,
+                                          float dropout,
+                                          bool qkv_bias,
+                                          bool final_bias,
+                                          bool add_zero_attn,
+                                          DataType data_type,
+                                          Initializer *kernel_initializer,
+                                          bool apply_rotary_embedding,
+                                          bool scaling_query,
+                                          float scaling_factor,
+                                          bool qk_prod_scaling,
+                                          bool position_bias,
+                                          char const *name) {
   if (data_type == DT_NONE) {
     data_type = input->data_type;
   }
@@ -147,8 +147,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qk_dim = kdim, v_dim = kdim,
-      o_dim = embed_dim;
+  int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim;
   int hidden_size = input->dims[0];
   int qParas = qk_dim * hidden_size;
   int kParas = qk_dim * hidden_size;
@@ -178,10 +177,8 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
   }
   if (qkv_bias || final_bias) {
     // q, k, v, o
-    int qkv_bias_size =
-        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? o_dim : 0)};
+    int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -315,13 +312,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
-      quantization_type(_quantization_type), offload(_offload),
-      tensor_parallelism_degree(_tensor_parallelism_degree) {
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias), quantization_type(_quantization_type),
+      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
   numOutputs = 1;
@@ -367,8 +363,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
         CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
@@ -429,13 +424,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
-      quantization_type(_quantization_type), offload(_offload),
-      tensor_parallelism_degree(_tensor_parallelism_degree)
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias), quantization_type(_quantization_type),
+      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -480,8 +474,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
         CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 1d04953e1..123b2ee05 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -245,13 +245,13 @@ __global__ void store_kv_cache(DT const *devQKVProjArray,
 
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        int shard_id,
-                        DT const *input_ptr,
-                        DT const *weight_ptr,
-                        DT *output_ptr,
-                        DT const *bias_ptr,
-                        hipStream_t stream) {
+                 BatchConfig const *bc,
+                 int shard_id,
+                 DT const *input_ptr,
+                 DT const *weight_ptr,
+                 DT *output_ptr,
+                 DT const *bias_ptr,
+                 hipStream_t stream) {
 
   checkCUDA(hipblasSetStream(m->handle.blas, stream));
   checkCUDNN(miopenSetStream(m->handle.dnn, stream));
@@ -377,9 +377,9 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
-                             GenericTensorAccessorR const weight,
-                             DataType data_type,
-                             hipStream_t stream) {
+                      GenericTensorAccessorR const weight,
+                      DataType data_type,
+                      hipStream_t stream) {
   // additional processing for weight uploading
   // Note that we update weight_ptr and bias_ptr when uploading weight and
   // bias
@@ -459,13 +459,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m,
                            stream));
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
 
   // phase 2: Update key/val cache
   update_kv_cache_kernel<DT>(m, bc, stream);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index fe1e842d2..83ff630a6 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -63,8 +63,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
-  float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
+  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -249,13 +248,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
 
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
   // phase 2: Update key/val cache
   update_qkv_cache<DT>(m, bc, stream);
 
@@ -466,8 +465,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   }
   // biasSize = _bias ? o_dim * size_of_dt * 4 : 0;
 
-  int qkv_bias_size =
-      qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+  int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
   int final_bias_size = o_dim;
   biasSize =
       (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0);
@@ -497,9 +495,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   // allocate memory for the seqArray and reserve space
   {
     int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
-    size_t qkv_max_proj_size = max_tokens_per_batch * (qk_dim * num_q_heads +
-                                                       qk_dim * num_q_heads +
-                                                       v_dim * num_q_heads);
+    size_t qkv_max_proj_size =
+        max_tokens_per_batch *
+        (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads);
     size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0,
            qk_prod_size = 0;
     // assert((BatchConfig::max_sequence_length() +
@@ -532,9 +530,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
     size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim;
-    size_t complex_size = (max_tokens_per_batch * (qk_dim * num_q_heads +
-                                                   qk_dim * num_q_heads)) /
-                          2;
+    size_t complex_size =
+        (max_tokens_per_batch * (qk_dim * num_q_heads + qk_dim * num_q_heads)) /
+        2;
     size_t totalSize =
         (qkv_max_proj_size + query_tmp_size + key_cache_size +
          value_cache_size + 2 * qk_prod_size + attn_heads_size) *
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index d3a88fcf3..b93d5a877 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -141,8 +141,7 @@ __global__ void
                                   int k_block_size,
                                   int q_array_size) {
   CUDA_KERNEL_LOOP(
-      i,
-      num_tokens * (qk_dim * num_q_heads + qk_dim * num_kv_heads) / 2) {
+      i, num_tokens * (qk_dim * num_q_heads + qk_dim * num_kv_heads) / 2) {
     // create complex number
     bool q_tensor = i < (q_array_size / 2);
     int proj_size = q_tensor ? qk_dim : qk_dim;
@@ -231,13 +230,13 @@ __global__ void
 
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        int shard_id,
-                        DT const *input_ptr,
-                        DT const *weight_ptr,
-                        DT *output_ptr,
-                        DT const *bias_ptr,
-                        cudaStream_t stream) {
+                 BatchConfig const *bc,
+                 int shard_id,
+                 DT const *input_ptr,
+                 DT const *weight_ptr,
+                 DT *output_ptr,
+                 DT const *bias_ptr,
+                 cudaStream_t stream) {
 
   checkCUDA(cublasSetStream(m->handle.blas, stream));
   checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
@@ -404,17 +403,18 @@ __global__ void
 
   if (offset < kv_hidden_size) {
     size_t to_k_idx = get_k_entry_offset(
-              req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
-          to_v_idx = get_v_entry_offset(
-              req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
+           to_v_idx = get_v_entry_offset(
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads; // temporary hard code
     int const kv_offset = offset / head_dim * stride * head_dim +
-                      offset % head_dim; // temporary hard code
-    kCache_ptr[to_k_idx + offset] =
-        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
+                          offset % head_dim; // temporary hard code
+    kCache_ptr[to_k_idx + offset] = static_cast<half>(
+        devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
     kCache_ptr[to_v_idx + offset] =
-        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size +
+                                          temp_kv_hidden_size + kv_offset]);
   }
 }
 
@@ -495,8 +495,8 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
     // matrix A: output projection weight
     // matrix A's layout: [v_dim * num_heads, o_dim]
     DT const *A = weight_ptr + m->hidden_size * (m->qk_dim * m->num_q_heads +
-                                           m->qk_dim * m->num_q_heads +
-                                           m->v_dim * m->num_q_heads);
+                                                 m->qk_dim * m->num_q_heads +
+                                                 m->v_dim * m->num_q_heads);
     // matrix B: attn heads
     // matrix B's layout: [v_dim * num_heads, num_new_tokens]
     DT const *B = static_cast<DT *>(m->attn_heads);
@@ -540,9 +540,9 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
-                             GenericTensorAccessorR const weight,
-                             DataType data_type,
-                             cudaStream_t stream) {
+                      GenericTensorAccessorR const weight,
+                      DataType data_type,
+                      cudaStream_t stream) {
   // additional processing for weight uploading
   // Note that we update weight_ptr and bias_ptr when uploading weight and
   // bias
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 9e6da5238..cd937f165 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -145,8 +145,7 @@ Tensor
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qk_dim = kdim, v_dim = kdim,
-      o_dim = embed_dim;
+  int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim;
   int hidden_size = input->dims[0];
   int qParas = qk_dim * hidden_size;
   int kParas = qk_dim * hidden_size;
@@ -166,10 +165,8 @@ Tensor
   }
   if (qkv_bias || final_bias) {
     // q, k, v, o
-    int qkv_bias_size =
-        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? o_dim : 0)};
+    int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -287,11 +284,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) {
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
@@ -330,8 +327,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                                  CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
@@ -389,11 +385,11 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias)
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -432,8 +428,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                                  CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 02d917165..df16d2979 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -496,13 +496,13 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
       stream));
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
   // phase 2: Update key/val cache
   update_kv_cache_kernel<DT>(m, bc, stream);
 
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index d9b7b95e3..5010851b2 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -61,8 +61,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
-  float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
+  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -244,13 +243,13 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
 
   // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
   // phase 2: Update key/val cache
   update_qkv_cache<DT>(m, bc, stream);
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 647c1a42d..4e00bf0ef 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -149,8 +149,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
         numdims, dims, data_type, li, 0, true /*create_grad*/);
   }
   // Compute weight size
-  int qk_dim = kdim, v_dim = kdim,
-      o_dim = embed_dim;
+  int qk_dim = kdim, v_dim = kdim, o_dim = embed_dim;
   int hidden_size = input->dims[0];
   int qParas = qk_dim * hidden_size;
   int kParas = qk_dim * hidden_size;
@@ -178,10 +177,8 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   }
   if (qkv_bias || final_bias) {
     // q, k, v, o
-    int qkv_bias_size =
-        qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
-    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) +
-                   (final_bias ? o_dim : 0)};
+    int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+    int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0)};
     li->weights[1] = create_weight_legion_ordering(1,
                                                    dims,
                                                    data_type,
@@ -312,13 +309,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
-      quantization_type(_quantization_type), offload(_offload),
-      tensor_parallelism_degree(_tensor_parallelism_degree) {
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias), quantization_type(_quantization_type),
+      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
@@ -365,8 +361,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
@@ -427,13 +422,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
       apply_rotary_embedding(_apply_rotary_embedding),
-      hidden_size(_input->dims[0].size), qk_dim(_kdim),
-      v_dim(_vdim), o_dim(_embed_dim),
-      qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size),
-      scaling_query(_scaling_query), scaling_factor(_scaling_factor),
-      qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias),
-      quantization_type(_quantization_type), offload(_offload),
-      tensor_parallelism_degree(_tensor_parallelism_degree)
+      hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
+      o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
+      kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
+      scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
+      position_bias(_position_bias), quantization_type(_quantization_type),
+      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -477,8 +471,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
         CHOSEN_SYNC_TYPE);
     if (qkv_bias || final_bias) {
       ParallelTensorShape bias_shape = _input->get_shape();
-      int qkv_bias_size =
-          qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
+      int qkv_bias_size = qk_dim * num_q_heads + (qk_dim + v_dim) * num_q_heads;
       bias_shape.dims[0].size =
           (qkv_bias ? qkv_bias_size : 0) + (final_bias ? o_dim : 0);
       bias_shape.dims[1].size = bias_shape.dims[2].size = 1;
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index c148eaf33..ee37c425a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -75,7 +75,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                    hipStream_t stream) {
   int num_tokens_to_commit = bc->num_tokens_to_commit;
   if (num_tokens_to_commit > 0) {
-    int parallelism = m->local_hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
+    int parallelism =
+        m->local_hidden_size * KV_WEIGHT_NUM * num_tokens_to_commit;
     hipLaunchKernelGGL(
         HIP_KERNEL_NAME(commit_tokens_kernel<DT>),
         GET_BLOCKS(parallelism),
@@ -491,13 +492,13 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                            stream));
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
 
   // phase 2: No need to update key/val cache
   // IncMultiHeadSelfAttention::update_kv_cache_kernel(
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 2a2d7da93..cb545ec84 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -80,14 +80,20 @@ __global__ void commit_tokens_kernel(
       int const req_id = committedTokenInfos[i].request_index;
       int const tok_id = committedTokenInfos[i].token_depth;
 
-      size_t from_k_idx = get_k_entry_offset(
-                 req_id, index_in_kv_cache, max_num_pages, num_kv_heads, head_dim),
-             from_v_idx = get_v_entry_offset(
-                 req_id, index_in_kv_cache, max_num_pages, num_kv_heads, head_dim);
-      size_t to_k_idx =
-                 get_k_entry_offset(req_id, tok_id, max_num_pages, num_kv_heads, head_dim),
-             to_v_idx =
-                 get_v_entry_offset(req_id, tok_id, max_num_pages, num_kv_heads, head_dim);
+      size_t from_k_idx = get_k_entry_offset(req_id,
+                                             index_in_kv_cache,
+                                             max_num_pages,
+                                             num_kv_heads,
+                                             head_dim),
+             from_v_idx = get_v_entry_offset(req_id,
+                                             index_in_kv_cache,
+                                             max_num_pages,
+                                             num_kv_heads,
+                                             head_dim);
+      size_t to_k_idx = get_k_entry_offset(
+                 req_id, tok_id, max_num_pages, num_kv_heads, head_dim),
+             to_v_idx = get_v_entry_offset(
+                 req_id, tok_id, max_num_pages, num_kv_heads, head_dim);
       assert(to_k_idx <= from_k_idx);
 
       kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
@@ -147,8 +153,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
   uint32_t const num_kv_heads = m->num_kv_heads;
   uint32_t const head_dim = m->qk_dim;
   uint32_t const batch_size = bc->num_active_requests();
-  float const sm_scale =
-      (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
+  float const sm_scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qk_dim) : 1.0f;
 
   //   cudaEventCreate(&t_start);
   //   cudaEventCreate(&t_end);
@@ -388,13 +393,13 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   }
   // Implement kernel to compute KQV for input tokens
   compute_qkv(m,
-                     bc,
-                     shard_id,
-                     input_ptr,
-                     weight_ptr,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
+              bc,
+              shard_id,
+              input_ptr,
+              weight_ptr,
+              static_cast<DT *>(m->devQKVProjArray),
+              bias_ptr,
+              stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index b11f13b35..8588b8934 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
+#include "flexflow/utils/file_loader.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
-#include "flexflow/utils/file_loader.h"
 
 #include <vector>
 using namespace std;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 06925e3a7..bb027d586 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -296,36 +296,37 @@ void RequestManager::load_batch_config_task(
           if (handle.incr_attention_metadata->decode_handler_collections.count(
                   batch_size) == 0) {
             handle.incr_attention_metadata
-                ->decode_handler_collections[batch_size] =
-                static_cast<void *>(new flashinfer::BatchDecodeHandler(true, batch_size));
+                ->decode_handler_collections[batch_size] = static_cast<void *>(
+                new flashinfer::BatchDecodeHandler(true, batch_size));
           }
           handler = static_cast<BatchDecodeHandler *>(
               handle.incr_attention_metadata
                   ->decode_handler_collections[batch_size]);
 
           handler->SetCUDAStream(stream);
-          DISPATCH_HEADDIM(handle.incr_attention_metadata->head_dim(), HEAD_DIM, {
-            handler->BeginForwardDispatched<HEAD_DIM,
-                                            PageStorage::kIndices,
-                                            LogitsPostHook::kNone,
-                                            PosEncodingMode::kNone,
-                                            half,
-                                            half,
-                                            half,
-                                            int32_t>(
-                static_cast<void *>(
+          DISPATCH_HEADDIM(
+              handle.incr_attention_metadata->head_dim(), HEAD_DIM, {
+                handler->BeginForwardDispatched<HEAD_DIM,
+                                                PageStorage::kIndices,
+                                                LogitsPostHook::kNone,
+                                                PosEncodingMode::kNone,
+                                                half,
+                                                half,
+                                                half,
+                                                int32_t>(
+                    static_cast<void *>(
                         handle.incr_attention_metadata->float_workspace),
-                handle.incr_attention_metadata->float_workspace_size,
-                static_cast<void *>(
+                    handle.incr_attention_metadata->float_workspace_size,
+                    static_cast<void *>(
                         handle.incr_attention_metadata->int_workspace),
-                handle.incr_attention_metadata->int_workspace_size,
-                static_cast<int32_t *>(kv_indptr_h),
-                static_cast<int32_t *>(kv_last_page_len_h),
-                batch_size,
-                handle.incr_attention_metadata->num_q_heads(),
-                handle.incr_attention_metadata->num_kv_heads(),
-                kPagesize);
-          });
+                    handle.incr_attention_metadata->int_workspace_size,
+                    static_cast<int32_t *>(kv_indptr_h),
+                    static_cast<int32_t *>(kv_last_page_len_h),
+                    batch_size,
+                    handle.incr_attention_metadata->num_q_heads(),
+                    handle.incr_attention_metadata->num_kv_heads(),
+                    kPagesize);
+              });
         } else {
           BatchPrefillHandler *handler = nullptr;
           if (handle.incr_attention_metadata->prompt_handler_collections.count(
@@ -341,10 +342,10 @@ void RequestManager::load_batch_config_task(
           handler->SetCUDAStream(stream);
           handler->BeginForward<half, int32_t>(
               static_cast<void *>(
-                      handle.incr_attention_metadata->float_workspace),
+                  handle.incr_attention_metadata->float_workspace),
               handle.incr_attention_metadata->float_workspace_size,
               static_cast<void *>(
-                      handle.incr_attention_metadata->int_workspace),
+                  handle.incr_attention_metadata->int_workspace),
               handle.incr_attention_metadata->int_workspace_size,
               static_cast<int32_t *>(q_indptr_h),
               static_cast<int32_t *>(kv_indptr_h),
@@ -489,10 +490,10 @@ void RequestManager::load_batch_config_task(
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
-                    handle.tree_search_attention_metadata->float_workspace),
+                handle.tree_search_attention_metadata->float_workspace),
             handle.tree_search_attention_metadata->float_workspace_size,
             static_cast<void *>(
-                    handle.tree_search_attention_metadata->int_workspace),
+                handle.tree_search_attention_metadata->int_workspace),
             handle.tree_search_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),
@@ -655,10 +656,10 @@ void RequestManager::load_batch_config_task(
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
-                    handle.tree_verify_attention_metadata->float_workspace),
+                handle.tree_verify_attention_metadata->float_workspace),
             handle.tree_verify_attention_metadata->float_workspace_size,
             static_cast<void *>(
-                    handle.tree_verify_attention_metadata->int_workspace),
+                handle.tree_verify_attention_metadata->int_workspace),
             handle.tree_verify_attention_metadata->int_workspace_size,
             static_cast<int32_t *>(q_indptr_h),
             static_cast<int32_t *>(kv_indptr_h),

From 60c1dbea5be72fcef1710c01d82047be47c21e66 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 21 Aug 2024 22:05:47 -0700
Subject: [PATCH 405/667] chore: minor

---
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index b93d5a877..57a02e6f8 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -407,9 +407,9 @@ __global__ void
            to_v_idx = get_v_entry_offset(
                req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
-    int const stride = num_q_heads / num_kv_heads; // temporary hard code
-    int const kv_offset = offset / head_dim * stride * head_dim +
-                          offset % head_dim; // temporary hard code
+    int const stride = num_q_heads / num_kv_heads;
+    int const kv_offset =
+        offset / head_dim * stride * head_dim + offset % head_dim;
     kCache_ptr[to_k_idx + offset] = static_cast<half>(
         devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
     kCache_ptr[to_v_idx + offset] =

From d29d155ab823812f8357b25a3bb80c717cc16dcf Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 23 Aug 2024 09:37:23 -0700
Subject: [PATCH 406/667] chore: separate attention meta into another header
 file

---
 include/flexflow/attention_config.h | 213 ++++++++++++++++++++++++++++
 include/flexflow/config.h           | 191 +------------------------
 2 files changed, 214 insertions(+), 190 deletions(-)
 create mode 100644 include/flexflow/attention_config.h

diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h
new file mode 100644
index 000000000..63b0112e7
--- /dev/null
+++ b/include/flexflow/attention_config.h
@@ -0,0 +1,213 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_
+#define _FLEXFLOW_ATTENTION_CONFIG_H_
+#include "flexflow/batch_config.h"
+
+namespace FlexFlow {
+
+constexpr uint32_t kPagesize = 64;
+#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
+  switch (head_dim) {                                                          \
+    case 64: {                                                                 \
+      constexpr size_t HEAD_DIM = 64;                                          \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 128: {                                                                \
+      constexpr size_t HEAD_DIM = 128;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    case 256: {                                                                \
+      constexpr size_t HEAD_DIM = 256;                                         \
+      __VA_ARGS__                                                              \
+      break;                                                                   \
+    }                                                                          \
+    default: {                                                                 \
+      std::ostringstream err_msg;                                              \
+      err_msg << "Unsupported head_dim: " << head_dim;                         \
+      throw std::invalid_argument(err_msg.str());                              \
+    }                                                                          \
+  }
+
+class AttentionMetaData {
+public:
+  AttentionMetaData() {
+    num_q_heads_ = 0;
+    num_kv_heads_ = 0;
+    head_dim_ = 0;
+    q_indptr = nullptr;
+    kv_indptr = nullptr;
+    kv_indices = nullptr;
+    kv_last_page_len = nullptr;
+    qk_indptr = nullptr;
+    custom_mask = nullptr;
+    workspace = nullptr;
+    workspace_size = 0;
+    float_workspace = nullptr;
+    float_workspace_size = 0;
+    int_workspace = nullptr;
+    int_workspace_size = 0;
+    mem_size_ = 0;
+    enabled_ = false;
+  }
+  AttentionMetaData(AttentionMetaData const &rhs) {
+    num_q_heads_ = rhs.num_q_heads_;
+    num_kv_heads_ = rhs.num_kv_heads_;
+    head_dim_ = rhs.head_dim_;
+    q_indptr = rhs.q_indptr;
+    kv_indptr = rhs.kv_indptr;
+    kv_indices = rhs.kv_indices;
+    kv_last_page_len = rhs.kv_last_page_len;
+    qk_indptr = rhs.qk_indptr;
+    custom_mask = rhs.custom_mask;
+    workspace = rhs.workspace;
+    workspace_size = rhs.workspace_size;
+    float_workspace = rhs.float_workspace;
+    float_workspace_size = rhs.float_workspace_size;
+    int_workspace = rhs.int_workspace;
+    int_workspace_size = rhs.int_workspace_size;
+    mem_size_ = rhs.mem_size_;
+    enabled_ = rhs.enabled_;
+    decode_handler_collections = rhs.decode_handler_collections;
+    prompt_handler_collections = rhs.prompt_handler_collections;
+  }
+
+  size_t mem_size() {
+    if (mem_size_ > 0) {
+      return mem_size_;
+    }
+    size_t batch_size = BatchConfig::max_requests_per_batch();
+    size_t max_num_pages =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize;
+    size_t indices_size = std::max(
+        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
+    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
+                              ((BatchConfig::max_spec_tree_token_num() *
+                                    (BatchConfig::max_spec_tree_token_num() +
+                                     BatchConfig::max_sequence_length()) +
+                                7) /
+                               8);
+
+    float_workspace_size = 128 * 1024 * 1024; // 128 MB
+    int_workspace_size = 8 * 1024 * 1024;     // 8 MB
+    workspace_size =
+        float_workspace_size + int_workspace_size; // float + int workspace
+
+    mem_size_ = sizeof(int32_t) * indices_size +
+                sizeof(uint8_t) * custom_mask_size + workspace_size;
+    return mem_size_;
+  }
+
+  void assign_address(void *ptr, int size) {
+    if (ptr == nullptr) {
+      q_indptr = nullptr;
+      kv_indptr = nullptr;
+      kv_indices = nullptr;
+      kv_last_page_len = nullptr;
+      qk_indptr = nullptr;
+      custom_mask = nullptr;
+      workspace = nullptr;
+      float_workspace = nullptr;
+      int_workspace = nullptr;
+      return;
+    }
+    assert(size >= mem_size() &&
+           "Insufficient memory size for attention metadata");
+    size_t batch_size = BatchConfig::max_requests_per_batch();
+    size_t max_num_pages =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize;
+    size_t indices_size = std::max(
+        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
+    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
+                              ((BatchConfig::max_spec_tree_token_num() *
+                                    (BatchConfig::max_spec_tree_token_num() +
+                                     BatchConfig::max_sequence_length()) +
+                                7) /
+                               8);
+
+    q_indptr = static_cast<int32_t *>(ptr);
+    kv_indptr = q_indptr + batch_size + 1;
+    kv_indices = kv_indptr + batch_size + 1;
+    kv_last_page_len = kv_indices + max_num_pages * batch_size;
+    qk_indptr = kv_last_page_len + batch_size + 1;
+    custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
+    workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
+                                    sizeof(int32_t) * indices_size +
+                                    sizeof(uint8_t) * custom_mask_size);
+    float_workspace = workspace;
+    int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
+                                        float_workspace_size);
+  }
+
+  void set_num_q_heads(uint32_t const num_q_heads) {
+    num_q_heads_ = num_q_heads;
+  }
+  void set_num_kv_heads(uint32_t const num_kv_heads) {
+    num_kv_heads_ = num_kv_heads;
+  }
+  void set_head_dim(uint32_t const head_dim) {
+    head_dim_ = head_dim;
+  }
+  uint32_t num_q_heads() const {
+    return num_q_heads_;
+  }
+  uint32_t num_kv_heads() const {
+    return num_kv_heads_;
+  }
+  uint32_t head_dim() const {
+    return head_dim_;
+  }
+
+  void set_enabled(bool const enabled) {
+    enabled_ = enabled;
+  }
+  bool enabled() const {
+    return enabled_;
+  }
+
+  uint32_t num_q_heads_;
+  uint32_t num_kv_heads_;
+  uint32_t head_dim_;
+
+  int32_t *q_indptr;
+  int32_t *kv_indptr;
+  int32_t *kv_indices;
+  int32_t *kv_last_page_len;
+  int32_t *qk_indptr;
+  uint8_t *custom_mask;
+  void *workspace;
+  size_t workspace_size;
+  void *float_workspace;
+  size_t float_workspace_size;
+  void *int_workspace;
+  size_t int_workspace_size;
+
+  size_t mem_size_;
+
+  // batchsize -> handler
+  bool enabled_;
+  std::unordered_map<int, void *> decode_handler_collections;
+  std::unordered_map<int, void *> prompt_handler_collections;
+};
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_ATTENTION_CONFIG_H_
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 0e9325f09..0e15fc089 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -16,6 +16,7 @@
 #ifndef _FLEXFLOW_CONFIG_H_
 #define _FLEXFLOW_CONFIG_H_
 #include "ffconst.h"
+#include "flexflow/attention_config.h"
 #include "flexflow/batch_config.h"
 #include "legion.h"
 #include <cstddef>
@@ -67,196 +68,6 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
 
 class FFConfig;
 
-constexpr uint32_t kPagesize = 64;
-#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
-  switch (head_dim) {                                                          \
-    case 64: {                                                                 \
-      constexpr size_t HEAD_DIM = 64;                                          \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 128: {                                                                \
-      constexpr size_t HEAD_DIM = 128;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    case 256: {                                                                \
-      constexpr size_t HEAD_DIM = 256;                                         \
-      __VA_ARGS__                                                              \
-      break;                                                                   \
-    }                                                                          \
-    default: {                                                                 \
-      std::ostringstream err_msg;                                              \
-      err_msg << "Unsupported head_dim: " << head_dim;                         \
-      throw std::invalid_argument(err_msg.str());                              \
-    }                                                                          \
-  }
-
-class AttentionMetaData {
-public:
-  AttentionMetaData() {
-    num_q_heads_ = 0;
-    num_kv_heads_ = 0;
-    head_dim_ = 0;
-    q_indptr = nullptr;
-    kv_indptr = nullptr;
-    kv_indices = nullptr;
-    kv_last_page_len = nullptr;
-    qk_indptr = nullptr;
-    custom_mask = nullptr;
-    workspace = nullptr;
-    workspace_size = 0;
-    float_workspace = nullptr;
-    float_workspace_size = 0;
-    int_workspace = nullptr;
-    int_workspace_size = 0;
-    mem_size_ = 0;
-    enabled_ = false;
-  }
-  AttentionMetaData(AttentionMetaData const &rhs) {
-    num_q_heads_ = rhs.num_q_heads_;
-    num_kv_heads_ = rhs.num_kv_heads_;
-    head_dim_ = rhs.head_dim_;
-    q_indptr = rhs.q_indptr;
-    kv_indptr = rhs.kv_indptr;
-    kv_indices = rhs.kv_indices;
-    kv_last_page_len = rhs.kv_last_page_len;
-    qk_indptr = rhs.qk_indptr;
-    custom_mask = rhs.custom_mask;
-    workspace = rhs.workspace;
-    workspace_size = rhs.workspace_size;
-    float_workspace = rhs.float_workspace;
-    float_workspace_size = rhs.float_workspace_size;
-    int_workspace = rhs.int_workspace;
-    int_workspace_size = rhs.int_workspace_size;
-    mem_size_ = rhs.mem_size_;
-    enabled_ = rhs.enabled_;
-    decode_handler_collections = rhs.decode_handler_collections;
-    prompt_handler_collections = rhs.prompt_handler_collections;
-  }
-
-  size_t mem_size() {
-    if (mem_size_ > 0) {
-      return mem_size_;
-    }
-    size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t max_num_pages =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize;
-    size_t indices_size = std::max(
-        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
-    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
-                              ((BatchConfig::max_spec_tree_token_num() *
-                                    (BatchConfig::max_spec_tree_token_num() +
-                                     BatchConfig::max_sequence_length()) +
-                                7) /
-                               8);
-
-    float_workspace_size = 128 * 1024 * 1024; // 128 MB
-    int_workspace_size = 8 * 1024 * 1024;     // 8 MB
-    workspace_size =
-        float_workspace_size + int_workspace_size; // float + int workspace
-
-    mem_size_ = sizeof(int32_t) * indices_size +
-                sizeof(uint8_t) * custom_mask_size + workspace_size;
-    return mem_size_;
-  }
-
-  void assign_address(void *ptr, int size) {
-    if (ptr == nullptr) {
-      q_indptr = nullptr;
-      kv_indptr = nullptr;
-      kv_indices = nullptr;
-      kv_last_page_len = nullptr;
-      qk_indptr = nullptr;
-      custom_mask = nullptr;
-      workspace = nullptr;
-      float_workspace = nullptr;
-      int_workspace = nullptr;
-      return;
-    }
-    assert(size >= mem_size() &&
-           "Insufficient memory size for attention metadata");
-    size_t batch_size = BatchConfig::max_requests_per_batch();
-    size_t max_num_pages =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize;
-    size_t indices_size = std::max(
-        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
-    size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
-                              ((BatchConfig::max_spec_tree_token_num() *
-                                    (BatchConfig::max_spec_tree_token_num() +
-                                     BatchConfig::max_sequence_length()) +
-                                7) /
-                               8);
-
-    q_indptr = static_cast<int32_t *>(ptr);
-    kv_indptr = q_indptr + batch_size + 1;
-    kv_indices = kv_indptr + batch_size + 1;
-    kv_last_page_len = kv_indices + max_num_pages * batch_size;
-    qk_indptr = kv_last_page_len + batch_size + 1;
-    custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
-    workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
-                                    sizeof(int32_t) * indices_size +
-                                    sizeof(uint8_t) * custom_mask_size);
-    float_workspace = workspace;
-    int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
-                                        float_workspace_size);
-  }
-
-  void set_num_q_heads(uint32_t const num_q_heads) {
-    num_q_heads_ = num_q_heads;
-  }
-  void set_num_kv_heads(uint32_t const num_kv_heads) {
-    num_kv_heads_ = num_kv_heads;
-  }
-  void set_head_dim(uint32_t const head_dim) {
-    head_dim_ = head_dim;
-  }
-  uint32_t num_q_heads() const {
-    return num_q_heads_;
-  }
-  uint32_t num_kv_heads() const {
-    return num_kv_heads_;
-  }
-  uint32_t head_dim() const {
-    return head_dim_;
-  }
-
-  void set_enabled(bool const enabled) {
-    enabled_ = enabled;
-  }
-  bool enabled() const {
-    return enabled_;
-  }
-
-  uint32_t num_q_heads_;
-  uint32_t num_kv_heads_;
-  uint32_t head_dim_;
-
-  int32_t *q_indptr;
-  int32_t *kv_indptr;
-  int32_t *kv_indices;
-  int32_t *kv_last_page_len;
-  int32_t *qk_indptr;
-  uint8_t *custom_mask;
-  void *workspace;
-  size_t workspace_size;
-  void *float_workspace;
-  size_t float_workspace_size;
-  void *int_workspace;
-  size_t int_workspace_size;
-
-  size_t mem_size_;
-
-  // batchsize -> handler
-  bool enabled_;
-  std::unordered_map<int, void *> decode_handler_collections;
-  std::unordered_map<int, void *> prompt_handler_collections;
-};
-
 struct FFHandler {
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   cudnnHandle_t dnn;

From bc4d9f7a27de67940518c4f4ad917829adde1bf5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 23 Aug 2024 22:04:42 -0700
Subject: [PATCH 407/667] feat: avoid patch query

---
 config/config.linux | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.linux b/config/config.linux
index 873f74783..15e9c8821 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,7 +111,7 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
-patch -p0 $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/../config/raft.patch
+patch -p0 --batch $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/../config/raft.patch
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc

From e41f37410df1fa359355478d9677abe0d67b00a1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 25 Aug 2024 11:55:44 -0700
Subject: [PATCH 408/667] chore: separate apply_pos_encoding from compute_qkv

---
 .../inc_multihead_self_attention_kernels.h    |   6 +
 src/ops/inc_multihead_self_attention.cu       |   6 +
 .../inc_multihead_self_attention_kernels.cu   | 166 +++++++++---------
 src/ops/spec_inc_multihead_self_attention.cu  |   6 +
 src/ops/tree_inc_multihead_self_attention.cu  |   5 +
 5 files changed, 109 insertions(+), 80 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 29d2cd1dd..d8f70db8b 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -54,6 +54,12 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                  DT const *bias_ptr,
                  ffStream_t stream);
 
+template <typename DT>
+void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
+                        BatchConfig const *bc,
+                        DT *output_ptr,
+                        cudaStream_t stream);
+
 template <typename DT>
 void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                       BatchConfig const *bc,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 83ff630a6..a10561169 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -255,6 +255,12 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
               static_cast<DT *>(m->devQKVProjArray),
               bias_ptr,
               stream);
+
+  apply_pos_encoding(m,
+                     bc,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     stream);
+
   // phase 2: Update key/val cache
   update_qkv_cache<DT>(m, bc, stream);
 
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 57a02e6f8..206f134d4 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -175,59 +175,6 @@ __global__ void
   }
 }
 
-template <typename DT>
-__global__ void
-    apply_rotary_embedding_hf(DT *input_ptr,
-                              cuFloatComplex *complex_input,
-                              BatchConfig::PerTokenInfo const *tokenInfos,
-                              int qk_dim,
-                              int num_tokens,
-                              size_t q_array_size,
-                              int hidden_size) {
-  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
-    // create complex number
-    bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qk_dim : qk_dim;
-    int real_i = q_tensor ? i : i - q_array_size / 2;
-
-    int token_idx = real_i / (hidden_size / 2);
-    int idx = real_i % (proj_size / 2);
-    int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2);
-
-    int real_part_index = idx + head_idx * proj_size +
-                          token_idx * hidden_size * QKV_WEIGHT_NUM +
-                          hidden_size * (q_tensor ? 0 : 1);
-    int complex_part_index = real_part_index + (proj_size / 2);
-
-    // complex_input[i] = {input_ptr[real_part_index],
-    //                     input_ptr[complex_part_index]};
-    cuFloatComplex cii = {input_ptr[real_part_index],
-                          input_ptr[complex_part_index]};
-
-    // get the freq_cis: shape 1 * (qk_dim/2) = 1 * 64
-    // apply a Cartesian coordinate transformation
-    // multiple with input & /copy back to q/k
-
-    // get position of token
-
-    // size_t pos = id_map[token_idx].token_position;
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
-
-    // float before_real = complex_input[i].x, before_complex =
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
-    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
-
-    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    // input_ptr[real_part_index] = complex_input[i].x;
-    // input_ptr[complex_part_index] = complex_input[i].y;
-
-    cii = cuCmulf(cii, complex_pos);
-    input_ptr[real_part_index] = cii.x;
-    input_ptr[complex_part_index] = cii.y;
-  }
-}
-
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                  BatchConfig const *bc,
@@ -312,7 +259,6 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 
   int num_tokens = bc->num_active_tokens();
   int parallelism = m->qk_dim * num_tokens * m->num_q_heads;
-  size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
 
   // Step 2: apply bias for QKV, or scale the query
   if (*m->qkv_bias) {
@@ -341,35 +287,83 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                                      m->scaling_factor,
                                      m->local_hidden_size);
   }
+}
 
-  //   checkCUDA(cudaEventCreate(&t_start));
-  //   checkCUDA(cudaEventCreate(&t_end));
-  //   checkCUDA(cudaEventRecord(t_start, stream));
+template <typename DT>
+__global__ void
+    apply_pos_encoding_kernel(DT *input_ptr,
+                              cuFloatComplex *complex_input,
+                              BatchConfig::PerTokenInfo const *tokenInfos,
+                              int qk_dim,
+                              int num_tokens,
+                              size_t q_array_size,
+                              int hidden_size) {
+  CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
+    // create complex number
+    bool q_tensor = i < (q_array_size / 2);
+    int proj_size = q_tensor ? qk_dim : qk_dim;
+    int real_i = q_tensor ? i : i - q_array_size / 2;
+
+    int token_idx = real_i / (hidden_size / 2);
+    int idx = real_i % (proj_size / 2);
+    int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2);
+
+    int real_part_index = idx + head_idx * proj_size +
+                          token_idx * hidden_size * QKV_WEIGHT_NUM +
+                          hidden_size * (q_tensor ? 0 : 1);
+    int complex_part_index = real_part_index + (proj_size / 2);
+
+    // complex_input[i] = {input_ptr[real_part_index],
+    //                     input_ptr[complex_part_index]};
+    cuFloatComplex cii = {input_ptr[real_part_index],
+                          input_ptr[complex_part_index]};
+
+    // get the freq_cis: shape 1 * (qk_dim/2) = 1 * 64
+    // apply a Cartesian coordinate transformation
+    // multiple with input & /copy back to q/k
+
+    // get position of token
+
+    // size_t pos = id_map[token_idx].token_position;
+    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
+
+    // float before_real = complex_input[i].x, before_complex =
+    int pos_i = real_i % (proj_size / 2);
+    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
+    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
+
+    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
+    // input_ptr[real_part_index] = complex_input[i].x;
+    // input_ptr[complex_part_index] = complex_input[i].y;
 
-  // Step 3: apply rotary embedding if needed
-  if (*m->apply_rotary_embedding) {
-    /*q&k*/
-    parallelism = num_tokens * m->local_hidden_size;
-    apply_rotary_embedding_hf<<<GET_BLOCKS(parallelism),
-                                min(CUDA_NUM_THREADS, parallelism),
-                                0,
-                                stream>>>(output_ptr,
-                                          m->complex_input,
-                                          m->token_infos,
-                                          m->qk_dim,
-                                          num_tokens,
-                                          q_array_size,
-                                          m->local_hidden_size);
+    cii = cuCmulf(cii, complex_pos);
+    input_ptr[real_part_index] = cii.x;
+    input_ptr[complex_part_index] = cii.y;
   }
-  //   checkCUDA(cudaEventRecord(t_end, stream));
-  //   checkCUDA(cudaEventSynchronize(t_end));
-  //   elapsed = 0;
-  //   checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  //   cudaEventDestroy(t_start);
-  //   cudaEventDestroy(t_end);
-  //   if (bc->inference_mode == TREE_VERIFY_MODE and device == 0) {
-  //     std::cout << "Rotary time: " << elapsed << " ms\n";
-  //   }
+}
+
+template <typename DT>
+void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
+                        BatchConfig const *bc,
+                        DT *output_ptr,
+                        cudaStream_t stream) {
+  // apply rotary embedding if needed
+  if (!*m->apply_rotary_embedding) {
+    return;
+  }
+  int num_tokens = bc->num_active_tokens();
+  int parallelism = num_tokens * m->local_hidden_size;
+  size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
+  apply_pos_encoding_kernel<<<GET_BLOCKS(parallelism),
+                              min(CUDA_NUM_THREADS, parallelism),
+                              0,
+                              stream>>>(output_ptr,
+                                        m->complex_input,
+                                        m->token_infos,
+                                        m->qk_dim,
+                                        num_tokens,
+                                        q_array_size,
+                                        m->local_hidden_size);
 }
 
 template <typename DT>
@@ -635,6 +629,18 @@ template void Kernels::IncMultiHeadAttention::compute_qkv<half>(
     half const *bias_ptr,
     cudaStream_t stream);
 
+template void Kernels::IncMultiHeadAttention::apply_pos_encoding<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    float *output_ptr,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::apply_pos_encoding<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    half *output_ptr,
+    cudaStream_t stream);
+
 template void Kernels::IncMultiHeadAttention::update_qkv_cache<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 5010851b2..cbef406e2 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -250,6 +250,12 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
               static_cast<DT *>(m->devQKVProjArray),
               bias_ptr,
               stream);
+
+  apply_pos_encoding(m,
+                     bc,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     stream);
+
   // phase 2: Update key/val cache
   update_qkv_cache<DT>(m, bc, stream);
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index cb545ec84..aa129bd00 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -401,6 +401,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
+  apply_pos_encoding(m,
+                     bc,
+                     static_cast<DT *>(m->devQKVProjArray),
+                     stream);
+
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
   //   elapsed = 0;

From 5783cf184f90a4257667a4adb4c9441d22924f15 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 10:52:49 -0700
Subject: [PATCH 409/667] chore: remove unused ptr

---
 .../flexflow/ops/inc_multihead_self_attention.h |  1 -
 src/ops/inc_multihead_self_attention.cu         | 17 ++---------------
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 5a90dd61b..dc70f6720 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -186,7 +186,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   void *weight_ptr, *bias_ptr; // for weight offload
   void *devQKVProjArray, *queryTmp, *kvCache;
   half *outputTmp;
-  void *qk_prods, *qk_prods_softmax;
   void *attn_heads;
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index a10561169..689fa1c23 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -504,8 +504,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t qkv_max_proj_size =
         max_tokens_per_batch *
         (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads);
-    size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0,
-           qk_prod_size = 0;
+    size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0;
     // assert((BatchConfig::max_sequence_length() +
     //         BatchConfig::max_spec_tree_token_num()) %
     //            kPagesize ==
@@ -527,8 +526,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_q_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
-        qk_prod_size = BatchConfig::max_sequence_length() * max_num_pages *
-                       kPagesize * num_q_heads;
         break;
       }
       default:
@@ -541,7 +538,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         2;
     size_t totalSize =
         (qkv_max_proj_size + query_tmp_size + key_cache_size +
-         value_cache_size + 2 * qk_prod_size + attn_heads_size) *
+         value_cache_size + attn_heads_size) *
             size_of_dt +
         output_tmp_size * data_type_size(DT_HALF) +
         complex_size * sizeof(cuFloatComplex); // more components will
@@ -607,12 +604,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       //     gpu_mem_allocator.allocate_reserved<BatchConfig::PerTokenInfo>(
       //         tokeninfo_size);
       // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size;
-      qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size *
-                                                             size_of_dt);
-      // offset += qk_prod_size * size_of_dt;
-      qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped(
-          qk_prod_size * size_of_dt);
-      // offset += qk_prod_size * size_of_dt;
       attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size *
                                                                size_of_dt);
       // offset += attn_heads_size * size_of_dt;
@@ -626,10 +617,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       // token_infos =
       //     gpu_mem_allocator.allocate_instance<BatchConfig::PerTokenInfo>(
       //         tokeninfo_size);
-      qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size *
-                                                             size_of_dt);
-      qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped(
-          qk_prod_size * size_of_dt);
       attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size *
                                                                size_of_dt);
       complex_input =

From ea580f7314796bb830b99289a8a245662b3f411d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 10:54:29 -0700
Subject: [PATCH 410/667] fix: memory pointer alignment

---
 include/flexflow/batch_config.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index fef5d0b73..a3b298bfc 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -50,7 +50,7 @@ class BatchConfig {
   // Maximum possible values for different parameters
   // These maximum values are used for copying BatchConfig
   // across workers
-  inline static int const MAX_NUM_REQUESTS = 64;
+  inline static int const MAX_NUM_REQUESTS = 8;
   inline static int const MAX_NUM_TOKENS = 1024;
   inline static int const MAX_SPEC_TREE_TOKEN_NUM = 128;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 4;
@@ -69,6 +69,7 @@ class BatchConfig {
     int first_token_index_in_request = -1;
     int first_token_offset_in_batch = -1;
     int num_tokens_in_batch = 0;
+    int padding = 0; // Padding for memory pointer alignment
   };
 
   struct PerTokenInfo {

From be93e5cce0b2d71b071b7f7f0e031c0854f30e1a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 14:53:26 -0700
Subject: [PATCH 411/667] chore: minor smplification

---
 include/flexflow/attention_config.h          | 15 ++++++++------
 src/ops/inc_multihead_self_attention.cu      |  5 ++---
 src/ops/tree_inc_multihead_self_attention.cu |  5 ++---
 src/runtime/request_manager.cu               | 21 +++++++++-----------
 4 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h
index 63b0112e7..7144b7ab3 100644
--- a/include/flexflow/attention_config.h
+++ b/include/flexflow/attention_config.h
@@ -20,6 +20,11 @@
 namespace FlexFlow {
 
 constexpr uint32_t kPagesize = 64;
+
+inline int round_up_pages(int const num_elements) {
+  return (num_elements + kPagesize - 1) / kPagesize;
+}
+
 #define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                              \
   switch (head_dim) {                                                          \
     case 64: {                                                                 \
@@ -93,9 +98,8 @@ class AttentionMetaData {
     }
     size_t batch_size = BatchConfig::max_requests_per_batch();
     size_t max_num_pages =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize;
+        round_up_pages(BatchConfig::max_spec_tree_token_num() +
+                       BatchConfig::max_sequence_length());
     size_t indices_size = std::max(
         (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
@@ -132,9 +136,8 @@ class AttentionMetaData {
            "Insufficient memory size for attention metadata");
     size_t batch_size = BatchConfig::max_requests_per_batch();
     size_t max_num_pages =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize;
+        round_up_pages(BatchConfig::max_spec_tree_token_num() +
+                       BatchConfig::max_sequence_length());
     size_t indices_size = std::max(
         (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
     size_t custom_mask_size = BatchConfig::max_requests_per_batch() *
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 689fa1c23..4d2963b9d 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -510,9 +510,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     //            kPagesize ==
     //        0);
     size_t max_num_pages =
-        (BatchConfig::max_sequence_length() +
-         BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-        kPagesize;
+        round_up_pages(BatchConfig::max_sequence_length() +
+                       BatchConfig::max_spec_tree_token_num());
     switch (infer_mode) {
       case INC_DECODING_MODE:
       case TREE_SEARCH_MODE:
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index aa129bd00..586957886 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -111,9 +111,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   //   cudaEventRecord(t_start, stream);
 
   int const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
+      round_up_pages(BatchConfig::max_sequence_length() +
+                     BatchConfig::max_spec_tree_token_num());
   int const num_requests = bc->num_active_requests();
   int parallelism = m->num_kv_heads * m->qk_dim * num_requests;
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index bb027d586..04f18b1c3 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -246,9 +246,8 @@ void RequestManager::load_batch_config_task(
             sizeof(BatchConfig::requestsInfo));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
-            (BatchConfig::max_sequence_length() +
-             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-            kPagesize;
+            round_up_pages(BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
 
         int parallelism = batch_size;
         prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
@@ -285,7 +284,7 @@ void RequestManager::load_batch_config_task(
                     .first_token_index_in_request;
             q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
             kv_indptr_h[indptr_idx + 1] =
-                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+                kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
             kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
             indptr_idx++;
           }
@@ -392,9 +391,8 @@ void RequestManager::load_batch_config_task(
                 sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
-            (BatchConfig::max_sequence_length() +
-             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-            kPagesize;
+            round_up_pages(BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
 
         int parallelism = batch_size;
         prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
@@ -482,7 +480,7 @@ void RequestManager::load_batch_config_task(
                     .first_token_index_in_request;
             q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
             kv_indptr_h[indptr_idx + 1] =
-                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+                kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
             indptr_idx++;
           }
         }
@@ -558,9 +556,8 @@ void RequestManager::load_batch_config_task(
                 sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
-            (BatchConfig::max_sequence_length() +
-             BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-            kPagesize;
+            round_up_pages(BatchConfig::max_sequence_length() +
+                           BatchConfig::max_spec_tree_token_num());
 
         int parallelism = batch_size;
         prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
@@ -648,7 +645,7 @@ void RequestManager::load_batch_config_task(
                     .first_token_index_in_request;
             q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
             kv_indptr_h[indptr_idx + 1] =
-                kv_indptr_h[indptr_idx] + (kv_len + kPagesize - 1) / kPagesize;
+                kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
             indptr_idx++;
           }
         }

From b6bcd4e1881f5be4ef6a86157a3f9b1f15f6c222 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 15:49:23 -0700
Subject: [PATCH 412/667] feat: StreamingCacheInfo

---
 include/flexflow/batch_config.h | 30 ++++++++++++++++++++
 src/runtime/batch_config.cc     | 49 +++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index a3b298bfc..638a99d9e 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -27,6 +27,32 @@ class InferenceResult;
 using BatchConfigFuture = Legion::Future;
 using InferenceResultFuture = Legion::Future;
 
+/*
+ * StreamingCacheInfo is a class that manages the streaming kv cache for
+ * attention operator (https://arxiv.org/abs/2309.17453), and we use it in draft
+ * model. It matains a fixed-content *sink* cache and a fixed-size *window*
+ * cache. The *sink* cache is the foremost part of the original kv cache, while
+ * the *window* cache is the backmost part of the original kv cache and is
+ * rolling updated. The information is per-request.
+ * Note that the position encoding of the q&k alters each iteration (relative
+ * position), so we store the *pre-pos-encoding* kv value in the cache.
+ */
+class StreamingCacheInfo {
+public:
+  StreamingCacheInfo();
+  StreamingCacheInfo(int sink_cache_size, int window_cache_size);
+  StreamingCacheInfo(StreamingCacheInfo const &other);
+
+  void update_cache(int len);
+  void reset_cache();
+
+public:
+  int sink_cache_size, window_cache_size;
+  // the meta info of the window cache, commit_len helps to determine if we fill
+  // up the window.
+  int window_back, commit_len;
+};
+
 class BatchConfig {
 public:
   using RequestGuid = size_t;
@@ -57,6 +83,9 @@ class BatchConfig {
   inline static int const MAX_TREE_DEPTH = 16;
   inline static int const MAX_TREE_WIDTH = 64;
   inline static int const MAX_K_LOGITS = 16;
+  // The Constants for the Streaming KVCache
+  inline static int const SINK_SIZE = 4;
+  inline static int const STREAMING_MAX_POS = 2048;
 
   int num_tokens = 0;
   int num_available_requests = 0;
@@ -151,6 +180,7 @@ class BatchConfig {
 
   BitMask causalMask[MAX_NUM_REQUESTS];
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
+  StreamingCacheInfo streamingCacheInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
   CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
   bool request_available[MAX_NUM_REQUESTS];
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index d74f8084c..a2f78a73a 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -16,6 +16,7 @@
 #include "flexflow/batch_config.h"
 #include "flexflow/request_manager.h"
 #include "legion.h"
+#include <algorithm>
 #include <cassert>
 #include <climits>
 
@@ -48,6 +49,7 @@ BatchConfig::BatchConfig(BatchConfig const &rhs) {
     if (rhs.request_available[request_idx]) {
       request_available[request_idx] = true;
       requestsInfo[request_idx] = rhs.requestsInfo[request_idx];
+      streamingCacheInfo[request_idx] = rhs.streamingCacheInfo[request_idx];
       causalMask[request_idx] = rhs.causalMask[request_idx];
     }
   }
@@ -155,6 +157,24 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
     }
   }
 
+  // Streaming cache info
+  if (bc.inference_mode == TREE_SEARCH_MODE) {
+    os << "Streaming cache info:\n";
+    for (int i = 0; i < bc.max_requests_per_batch(); i++) {
+      if (bc.request_available[i]) {
+        os << "  Request " << i << ":\n";
+        os << "    Sink cache size: "
+           << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+        os << "    Window cache size: "
+           << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+        os << "    Window back: " << bc.streamingCacheInfo[i].window_back
+           << std::endl;
+        os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
+           << std::endl;
+      }
+    }
+  }
+
   // Per-token info
   os << "Per-token info:\n";
   for (int i = 0; i < bc.num_tokens; i++) {
@@ -232,4 +252,33 @@ InferenceResult::InferenceResult(InferenceResult const &other) {
             gumbel_logits);
 }
 
+StreamingCacheInfo::StreamingCacheInfo() : StreamingCacheInfo(0, 0) {}
+
+StreamingCacheInfo::StreamingCacheInfo(int sink_cache_size,
+                                       int window_cache_size)
+    : sink_cache_size(sink_cache_size), window_cache_size(window_cache_size),
+      window_back(0), commit_len(0) {}
+
+StreamingCacheInfo::StreamingCacheInfo(StreamingCacheInfo const &other)
+    : sink_cache_size(other.sink_cache_size),
+      window_cache_size(other.window_cache_size),
+      window_back(other.window_back), commit_len(other.commit_len) {}
+
+// For draft model, we only update the cache when prefill or
+// commit the verified result from target model
+void StreamingCacheInfo::update_cache(int len) {
+  commit_len += len;
+  if (commit_len <= sink_cache_size + window_cache_size) {
+    window_back = std::max(0, commit_len - sink_cache_size);
+  } else {
+    commit_len = sink_cache_size + window_cache_size;
+    window_back = (window_back + len - 1) % window_cache_size + 1;
+  }
+}
+
+void StreamingCacheInfo::reset_cache() {
+  window_back = 0;
+  commit_len = 0;
+}
+
 }; // namespace FlexFlow

From f2634a969d9dc8672664d317078b11118700f811 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 20:48:32 -0700
Subject: [PATCH 413/667] feat: add streamingCache-related meta params

---
 include/flexflow/batch_config.h               |  2 +-
 include/flexflow/flexflow_c.h                 |  4 +++
 include/flexflow/model.h                      |  4 +++
 .../ops/inc_multihead_self_attention.h        | 14 ++++++--
 .../ops/inc_multihead_self_attention_params.h |  2 +-
 .../ops/spec_inc_multihead_self_attention.h   |  3 ++
 ...spec_inc_multihead_self_attention_params.h |  1 +
 inference/incr_decoding/incr_decoding.cc      |  1 +
 inference/models/falcon.cc                    |  2 ++
 inference/models/llama.cc                     |  3 ++
 inference/models/llama.h                      |  1 +
 inference/models/starcoder.cc                 |  1 +
 inference/spec_infer/spec_infer.cc            |  2 ++
 python/flexflow/core/flexflow_cffi.py         |  4 +++
 src/c/flexflow_c.cc                           |  8 +++++
 src/ops/inc_multihead_self_attention.cc       | 22 +++++++++++--
 src/ops/inc_multihead_self_attention.cu       | 33 +++++++++++++++----
 src/ops/spec_inc_multihead_self_attention.cc  | 20 +++++++++--
 src/ops/spec_inc_multihead_self_attention.cu  |  3 +-
 src/ops/tree_inc_multihead_self_attention.cu  |  3 +-
 20 files changed, 113 insertions(+), 20 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 638a99d9e..0ef446e35 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -85,7 +85,7 @@ class BatchConfig {
   inline static int const MAX_K_LOGITS = 16;
   // The Constants for the Streaming KVCache
   inline static int const SINK_SIZE = 4;
-  inline static int const STREAMING_MAX_POS = 2048;
+  inline static int const MAX_STREAMING_POS = 2048;
 
   int num_tokens = 0;
   int num_available_requests = 0;
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index ddf9c7e8a..9bc2c6973 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -448,6 +448,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name);
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
@@ -468,6 +469,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name);
 
 flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
@@ -509,6 +511,7 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name);
 
 flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
@@ -530,6 +533,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name);
 
 flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 948feb364..6618fdaf8 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -724,6 +724,7 @@ class FFModel {
                                       float scaling_factor = 1.0f,
                                       bool qk_prod_scaling = true,
                                       bool position_bias = false,
+                                      bool streaming_cache = false,
                                       char const *name = NULL);
   Tensor
       spec_inc_multihead_self_attention(Tensor const input,
@@ -742,6 +743,7 @@ class FFModel {
                                         float scaling_factor = 1.0f,
                                         bool qk_prod_scaling = true,
                                         bool position_bias = false,
+                                        bool streaming_cache = false,
                                         char const *name = NULL);
   Tensor inc_multihead_self_attention_verify(
       Tensor const input,
@@ -778,6 +780,7 @@ class FFModel {
                                    float scaling_factor = 1.0f,
                                    bool qk_prod_scaling = true,
                                    bool position_bias = false,
+                                   bool streaming_cache = false,
                                    char const *name = NULL);
   Tensor
       spec_inc_multiquery_self_attention(Tensor const input,
@@ -797,6 +800,7 @@ class FFModel {
                                          float scaling_factor = 1.0f,
                                          bool qk_prod_scaling = true,
                                          bool position_bias = false,
+                                         bool streaming_cache = false,
                                          char const *name = NULL);
   Tensor inc_multiquery_self_attention_verify(
       Tensor const input,
diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index dc70f6720..f0b764cf7 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -47,6 +47,7 @@ class IncMultiHeadSelfAttention : public Op {
                             bool allocate_weights,
                             DataType _quantization_type,
                             bool _offload,
+                            bool _streaming_cache,
                             int _tensor_parallelism_degree,
                             char const *name);
   IncMultiHeadSelfAttention(FFModel &model,
@@ -69,6 +70,7 @@ class IncMultiHeadSelfAttention : public Op {
                             bool allocate_weights,
                             DataType _quantization_type,
                             bool _offload,
+                            bool _streaming_cache,
                             int _tensor_parallelism_degree,
                             char const *name);
   IncMultiHeadSelfAttention(FFModel &model,
@@ -131,7 +133,7 @@ class IncMultiHeadSelfAttention : public Op {
   int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
-  bool offload;
+  bool offload, streaming_cache;
 };
 
 class IncMultiHeadSelfAttentionMeta : public OpMeta {
@@ -165,7 +167,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
                                 int _num_q_heads,
                                 int _num_kv_heads,
                                 DataType _quantization_type,
-                                bool _offload);
+                                bool _offload,
+                                bool _streaming_cache);
   ~IncMultiHeadSelfAttentionMeta(void);
 
 public:
@@ -184,8 +187,13 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   bool *position_bias;
   float scaling_factor;
   void *weight_ptr, *bias_ptr; // for weight offload
-  void *devQKVProjArray, *queryTmp, *kvCache;
+  void *devQKVProjArray, *queryTmp;
   half *outputTmp;
+  void *kvCache;
+  bool streaming_cache;
+  // When enable Streaming cache, we alter relative position each iteration, so
+  // we need below memory buffer for storing the pre-pos-encoding key value.
+  void *streamingPrePosEnc;
   void *attn_heads;
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h
index 58681069e..7c259a0a9 100644
--- a/include/flexflow/ops/inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -15,7 +15,7 @@ struct IncMultiHeadSelfAttentionParams {
   bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
       scaling_query, qk_prod_scaling, position_bias;
   DataType quantization_type;
-  bool offload;
+  bool offload, streaming_cache;
   char name[MAX_OPNAME];
   bool is_valid(ParallelTensorShape const &) const;
 };
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index 617263a05..b08e161c5 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -42,6 +42,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _qk_prod_scaling,
                                 bool _position_bias,
                                 bool allocate_weights,
+                                bool _streaming_cache,
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 ParallelTensor const _input,
@@ -61,6 +62,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _qk_prod_scaling,
                                 bool _position_bias,
                                 bool allocate_weights,
+                                bool _streaming_cache,
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 SpecIncMultiHeadSelfAttention const &other,
@@ -124,6 +126,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
       qk_prod_scaling, position_bias;
   int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
+  bool streaming_cache;
 };
 
 class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta {
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
index 1461224ba..2def2a51c 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -13,6 +13,7 @@ struct SpecIncMultiHeadSelfAttentionParams {
   float dropout, scaling_factor;
   bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
       scaling_query, qk_prod_scaling, position_bias;
+  bool streaming_cache;
   char name[MAX_OPNAME];
   bool is_valid(ParallelTensorShape const &) const;
 };
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index adf51aa30..82d31bf2a 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -237,6 +237,7 @@ void FlexFlow::top_level_task(Task const *task,
                               weights_filepath,
                               INC_DECODING_MODE,
                               generationConfig,
+                              false,
                               use_full_precision);
   } else if (model_type == ModelType::OPT) {
     OPT::create_opt_model(model,
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index a9805bf8e..24c63ea0e 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -116,6 +116,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
             false,   /*position_bias*/
+            false,   /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
@@ -166,6 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
             false,   /*position_bias*/
+            false,  /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index a1f4d370f..64e54ae6b 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -25,6 +25,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                                std::string const &weight_file_path,
                                InferenceMode mode,
                                GenerationConfig generation_config,
+                               bool streaming_cache,
                                bool use_full_precision) {
   // do not apply cpu offload in beam search model.
   LLAMAConfig llama_config(model_config_file_path);
@@ -112,6 +113,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
             false,   /*position_bias*/
+            streaming_cache,
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
@@ -160,6 +162,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
             false,   /*position_bias*/
+            streaming_cache,   /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/models/llama.h b/inference/models/llama.h
index 1a6a9114e..a5b2c4a40 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -86,6 +86,7 @@ class LLAMA {
                                  std::string const &weight_file_path,
                                  InferenceMode mode,
                                  GenerationConfig generation_config,
+                                 bool streaming_cache,
                                  bool use_full_precision = false);
 };
 
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 8251ef71c..55faec3a7 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -124,6 +124,7 @@ void STARCODER::create_starcoder_model(
             1.0f,                        /*scaling factor*/
             true,                        /*qk_prod_scaling*/
             false,                       /*position_bias*/
+            false,                      /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 1cdb2e8e9..e57528ad4 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -371,6 +371,7 @@ void FlexFlow::top_level_task(Task const *task,
                               model_metadata.llm_weights_path,
                               TREE_VERIFY_MODE,
                               generationConfig,
+                              false,
                               use_full_precision);
   } else if (model_metadata.llm_model_type == ModelType::OPT) {
     OPT::create_opt_model(tree_model,
@@ -418,6 +419,7 @@ void FlexFlow::top_level_task(Task const *task,
                                 model_metadata.ssm_model_weights_paths[ssm_id],
                                 TREE_SEARCH_MODE,
                                 generationConfig,
+                                false,
                                 use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) {
       OPT::create_opt_model(beam_model,
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 403f2cba5..dcdda6698 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -2784,6 +2784,7 @@ def spec_inc_multihead_self_attention(
         scaling_factor=1.0,
         qk_prod_scaling=True,
         position_bias=False,
+        streaming_cache=False,
         name=None,
     ):
         """Defines the MultiHead Attention operation as described in Attention Is All You Need
@@ -2864,6 +2865,7 @@ def spec_inc_multihead_self_attention(
             scaling_factor,
             qk_prod_scaling,
             position_bias,
+            streaming_cache,
             c_name,
         )
         self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name)
@@ -2991,6 +2993,7 @@ def groupquery_self_attention(
         scaling_factor=1.0,
         qk_prod_scaling=True,
         position_bias=False,
+        streaming_cache=False,
         name=None,
     ):
         """Defines the multi-query head attention, which allows a different number of Q and KV heads,
@@ -3075,6 +3078,7 @@ def groupquery_self_attention(
             scaling_factor,
             qk_prod_scaling,
             position_bias,
+            streaming_cache,
             c_name,
         )
         self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name)
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index d086d6d16..a398b54ca 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1201,6 +1201,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1222,6 +1223,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
                                                        scaling_factor,
                                                        qk_prod_scaling,
                                                        position_bias,
+                                                       streaming_cache,
                                                        name);
   return FFCObjectWrapper::wrap(tensor);
 }
@@ -1244,6 +1246,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1266,6 +1269,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
                                                 scaling_factor,
                                                 qk_prod_scaling,
                                                 position_bias,
+                                                streaming_cache,
                                                 name);
   return FFCObjectWrapper::wrap(tensor);
 }
@@ -1333,6 +1337,7 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1355,6 +1360,7 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
                                                     scaling_factor,
                                                     qk_prod_scaling,
                                                     position_bias,
+                                                    streaming_cache,
                                                     name);
   return FFCObjectWrapper::wrap(tensor);
 }
@@ -1378,6 +1384,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     float scaling_factor,
     bool qk_prod_scaling,
     bool position_bias,
+    bool streaming_cache,
     char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor input = FFCObjectWrapper::unwrap(input_);
@@ -1401,6 +1408,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
                                                  scaling_factor,
                                                  qk_prod_scaling,
                                                  position_bias,
+                                                 streaming_cache,
                                                  name);
   return FFCObjectWrapper::wrap(tensor);
 }
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index c35a07a4e..54d71ea0b 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -70,6 +70,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                              float scaling_factor,
                                              bool qk_prod_scaling,
                                              bool position_bias,
+                                             bool streaming_cache,
                                              char const *name) {
   return groupquery_self_attention(input,
                                    embed_dim,
@@ -88,6 +89,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                    scaling_factor,
                                    qk_prod_scaling,
                                    position_bias,
+                                   streaming_cache,
                                    name);
 }
 
@@ -108,6 +110,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
                                           float scaling_factor,
                                           bool qk_prod_scaling,
                                           bool position_bias,
+                                          bool streaming_cache,
                                           char const *name) {
   if (data_type == DT_NONE) {
     data_type = input->data_type;
@@ -204,6 +207,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);
   li->add_int_property("offload", offload);
+  li->add_int_property("streaming_cache", streaming_cache);
   li->add_int_property("tensor_parallelism_degree",
                        config.tensor_parallelism_degree);
   layers.push_back(li);
@@ -249,6 +253,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer(
   DataType quantization_type = (DataType)value;
   layer->get_int_property("offload", value);
   bool offload = (bool)value;
+  layer->get_int_property("streaming_cache", value);
+  bool streaming_cache = (bool)value;
   layer->get_int_property("tensor_parallelism_degree", value);
   int tensor_parallelism_degree = (int)value;
 
@@ -272,6 +278,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer(
                                        false /*allocate_weights*/,
                                        quantization_type,
                                        offload,
+                                       streaming_cache,
                                        tensor_parallelism_degree,
                                        layer->name);
 }
@@ -297,6 +304,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     bool allocate_weights,
     DataType _quantization_type,
     bool _offload,
+    bool _streaming_cache,
     int _tensor_parallelism_degree,
     char const *name)
     // Initializer* _bias_initializer)
@@ -317,7 +325,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
       position_bias(_position_bias), quantization_type(_quantization_type),
-      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree) {
+      offload(_offload), streaming_cache(_streaming_cache),
+      tensor_parallelism_degree(_tensor_parallelism_degree) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
   numOutputs = 1;
@@ -408,6 +417,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     bool allocate_weights,
     DataType _quantization_type,
     bool _offload,
+    bool _streaming_cache,
     int _tensor_parallelism_degree,
     char const *name)
     // Initializer* _bias_initializer)
@@ -429,7 +439,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
       position_bias(_position_bias), quantization_type(_quantization_type),
-      offload(_offload), tensor_parallelism_degree(_tensor_parallelism_degree)
+      offload(_offload), streaming_cache(_streaming_cache),
+      tensor_parallelism_degree(_tensor_parallelism_degree)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -526,6 +537,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
                                 allocate_weights,
                                 other.quantization_type,
                                 other.offload,
+                                other.streaming_cache,
                                 other.tensor_parallelism_degree,
                                 other.name) {}
 
@@ -555,6 +567,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
                                 allocate_weights,
                                 params.quantization_type,
                                 params.offload,
+                                params.streaming_cache,
                                 params.tensor_parallelism_degree,
                                 params.name) {}
 
@@ -897,7 +910,8 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs,
          lhs.scaling_query == rhs.scaling_query &&
          lhs.scaling_factor == rhs.scaling_factor &&
          lhs.qk_prod_scaling == rhs.qk_prod_scaling &&
-         lhs.position_bias == rhs.position_bias;
+         lhs.position_bias == rhs.position_bias &&
+         lhs.streaming_cache == rhs.streaming_cache;
 }
 
 IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const {
@@ -919,6 +933,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const {
   params.tensor_parallelism_degree = this->tensor_parallelism_degree,
   params.quantization_type = this->quantization_type;
   params.offload = this->offload;
+  params.streaming_cache = this->streaming_cache;
   params.num_kv_heads = this->num_kv_heads;
   if (this->name != nullptr) {
     strcpy(params.name, this->name);
@@ -950,6 +965,7 @@ size_t hash<FlexFlow::IncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.position_bias);
   hash_combine(key, params.quantization_type);
   hash_combine(key, params.offload);
+  hash_combine(key, params.streaming_cache);
   hash_combine(key, params.tensor_parallelism_degree);
   return key;
 }
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 4d2963b9d..073d7d3b6 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -415,7 +415,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                     _num_q_heads,
                                     _num_kv_heads,
                                     attn->quantization_type,
-                                    attn->offload) {}
+                                    attn->offload,
+                                    attn->streaming_cache) {}
 
 IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     FFHandler handler,
@@ -440,7 +441,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     int _num_q_heads,
     int _num_kv_heads,
     DataType _quantization_type,
-    bool _offload)
+    bool _offload,
+    bool _streaming_cache)
     : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
@@ -453,6 +455,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
   size_t size_of_dt = data_type_size(attn->data_type);
   quantization_type = _quantization_type;
   offload = _offload;
+  streaming_cache = _streaming_cache;
 
   global_num_q_heads = _global_num_q_heads;
   global_num_kv_heads = _global_num_kv_heads;
@@ -505,6 +508,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         max_tokens_per_batch *
         (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads);
     size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0;
+    size_t streaming_pre_pos_enc_size = 0;
     // assert((BatchConfig::max_sequence_length() +
     //         BatchConfig::max_spec_tree_token_num()) %
     //            kPagesize ==
@@ -525,6 +529,17 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_q_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
+        if (streaming_cache) {
+          size_t max_position_pages = round_up_pages(
+              BatchConfig::MAX_STREAMING_POS);
+          key_cache_size = num_kv_heads * qk_dim *
+                           BatchConfig::max_requests_per_batch() *
+                            max_position_pages * kPagesize;
+          value_cache_size = num_kv_heads * v_dim *
+                             BatchConfig::max_requests_per_batch() *
+                             max_position_pages * kPagesize;
+          streaming_pre_pos_enc_size = key_cache_size + value_cache_size;
+        }
         break;
       }
       default:
@@ -537,7 +552,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         2;
     size_t totalSize =
         (qkv_max_proj_size + query_tmp_size + key_cache_size +
-         value_cache_size + attn_heads_size) *
+         value_cache_size + streaming_pre_pos_enc_size + attn_heads_size) *
             size_of_dt +
         output_tmp_size * data_type_size(DT_HALF) +
         complex_size * sizeof(cuFloatComplex); // more components will
@@ -547,18 +562,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       size_t totalSharedSize =
           infer_mode == TREE_VERIFY_MODE
               ? totalSize - (query_tmp_size + key_cache_size +
-                             value_cache_size + qkv_max_proj_size) *
+                             value_cache_size + streaming_pre_pos_enc_size + qkv_max_proj_size) *
                                 size_of_dt
               : totalSize -
-                    (query_tmp_size + key_cache_size + value_cache_size) *
+                    (query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size) *
                         size_of_dt;
 
       size_t instance_size =
           size_of_dt *
           (infer_mode == TREE_VERIFY_MODE
-               ? query_tmp_size + key_cache_size + value_cache_size +
+               ? query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size +
                      qkv_max_proj_size
-               : query_tmp_size + key_cache_size + value_cache_size);
+               : query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size);
 
       if (quantization_type != DT_NONE) {
         totalSharedSize += quantized_weightSize;
@@ -587,6 +602,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     }
     kvCache = gpu_mem_allocator.allocate_instance_untyped(
         (key_cache_size + value_cache_size) * size_of_dt);
+    if (streaming_pre_pos_enc_size > 0) {
+      streamingPrePosEnc = gpu_mem_allocator.allocate_instance_untyped(
+          streaming_pre_pos_enc_size * size_of_dt);
+    }
     outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
 
     token_infos =
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index cd937f165..5817bd1c4 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -69,6 +69,7 @@ Tensor
                                                float scaling_factor,
                                                bool qk_prod_scaling,
                                                bool position_bias,
+                                               bool streaming_cache,
                                                char const *name) {
   return spec_inc_multiquery_self_attention(input,
                                             embed_dim,
@@ -87,6 +88,7 @@ Tensor
                                             scaling_factor,
                                             qk_prod_scaling,
                                             position_bias,
+                                            streaming_cache,
                                             name);
 }
 
@@ -108,6 +110,7 @@ Tensor
                                                 float scaling_factor,
                                                 bool qk_prod_scaling,
                                                 bool position_bias,
+                                                bool streaming_cache,
                                                 char const *name) {
   if (data_type == DT_NONE) {
     data_type = input->data_type;
@@ -190,6 +193,7 @@ Tensor
   li->add_float_property("scaling_factor", scaling_factor);
   li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
+  li->add_int_property("streaming_cache", streaming_cache);
   layers.push_back(li);
   return li->outputs[0];
 }
@@ -229,6 +233,8 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   bool qk_prod_scaling = (bool)value;
   layer->get_int_property("position_bias", value);
   bool position_bias = (bool)value;
+  layer->get_int_property("streaming_cache", value);
+  bool streaming_cache = (bool)value;
 
   return new SpecIncMultiHeadSelfAttention(model,
                                            layer->layer_guid,
@@ -248,6 +254,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
                                            qk_prod_scaling,
                                            position_bias,
                                            false /*allocate_weights*/,
+                                           streaming_cache,
                                            layer->name);
 }
 
@@ -270,6 +277,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _qk_prod_scaling,
     bool _position_bias,
     bool allocate_weights,
+    bool _streaming_cache,
     char const *name)
     // Initializer* _bias_initializer)
     : Op(model,
@@ -288,7 +296,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias) {
+      position_bias(_position_bias) , streaming_cache(_streaming_cache) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
@@ -370,6 +378,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _qk_prod_scaling,
     bool _position_bias,
     bool allocate_weights,
+    bool _streaming_cache,
     char const *name)
     // Initializer* _bias_initializer)
     : Op(model,
@@ -389,7 +398,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias)
+      position_bias(_position_bias), streaming_cache(_streaming_cache)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -478,6 +487,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     other.qk_prod_scaling,
                                     other.position_bias,
                                     allocate_weights,
+                                    other.streaming_cache,
                                     other.name) {}
 
 SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
@@ -504,6 +514,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     params.qk_prod_scaling,
                                     params.position_bias,
                                     allocate_weights,
+                                    params.streaming_cache,
                                     params.name) {}
 
 void SpecIncMultiHeadSelfAttention::init_inference(
@@ -825,7 +836,8 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs,
          lhs.scaling_query == rhs.scaling_query &&
          lhs.scaling_factor == rhs.scaling_factor &&
          lhs.qk_prod_scaling == rhs.qk_prod_scaling &&
-         lhs.position_bias == rhs.position_bias;
+         lhs.position_bias == rhs.position_bias &&
+         lhs.streaming_cache == rhs.streaming_cache;
 }
 
 SpecIncMultiHeadSelfAttentionParams
@@ -846,6 +858,7 @@ SpecIncMultiHeadSelfAttentionParams
   params.scaling_factor = this->scaling_factor;
   params.qk_prod_scaling = this->qk_prod_scaling;
   params.position_bias = this->position_bias;
+  params.streaming_cache = this->streaming_cache;
   if (this->name != nullptr) {
     strcpy(params.name, this->name);
   }
@@ -874,6 +887,7 @@ size_t hash<FlexFlow::SpecIncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.scaling_factor);
   hash_combine(key, params.qk_prod_scaling);
   hash_combine(key, params.position_bias);
+  hash_combine(key, params.streaming_cache);
   return key;
 }
 }; // namespace std
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index cbef406e2..5ee745a7a 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -394,7 +394,8 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                     _num_q_heads,
                                     _num_kv_heads,
                                     DT_NONE,
-                                    false) {
+                                    false,
+                                    attn->streaming_cache) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   checkCUDNN(cudnnSetStream(handler.dnn, stream));
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 586957886..ddd738a0d 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -617,7 +617,8 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                                     _num_q_heads,
                                     _num_kv_heads,
                                     attn->quantization_type,
-                                    attn->offload),
+                                    attn->offload,
+                                    false),
       num_active_tokens(0) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));

From 828b1b87cb60f6ac06b43437e1d373b01887d298 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 21:09:24 -0700
Subject: [PATCH 414/667] chore: more acurate definition

---
 include/flexflow/batch_config.h         |  3 ++
 src/ops/inc_multihead_self_attention.cu | 42 ++++++++++++++-----------
 src/runtime/batch_config.cc             |  4 +++
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 0ef446e35..8a89f9c31 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -67,6 +67,7 @@ class BatchConfig {
   static int max_verify_tokens_per_batch();
   static int max_spec_tree_token_num();
   static int max_sequence_length();
+  static int get_max_tree_depth();
   friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
   void print() const;
   void save_to_file(std::string const &filename) const;
@@ -83,8 +84,10 @@ class BatchConfig {
   inline static int const MAX_TREE_DEPTH = 16;
   inline static int const MAX_TREE_WIDTH = 64;
   inline static int const MAX_K_LOGITS = 16;
+
   // The Constants for the Streaming KVCache
   inline static int const SINK_SIZE = 4;
+  // size_SINK + size_WINDOW + depth_DRAFT shouldn't exceed this value
   inline static int const MAX_STREAMING_POS = 2048;
 
   int num_tokens = 0;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 073d7d3b6..6ab7ca36a 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -256,10 +256,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding(m,
-                     bc,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     stream);
+  apply_pos_encoding(m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   // phase 2: Update key/val cache
   update_qkv_cache<DT>(m, bc, stream);
@@ -530,15 +527,22 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
         if (streaming_cache) {
-          size_t max_position_pages = round_up_pages(
-              BatchConfig::MAX_STREAMING_POS);
+          size_t max_post_pos_enc_pages =
+              round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                             BatchConfig::get_max_tree_depth() +
+                             BatchConfig::max_spec_tree_token_num());
           key_cache_size = num_kv_heads * qk_dim *
                            BatchConfig::max_requests_per_batch() *
-                            max_position_pages * kPagesize;
+                           max_post_pos_enc_pages * kPagesize;
           value_cache_size = num_kv_heads * v_dim *
                              BatchConfig::max_requests_per_batch() *
-                             max_position_pages * kPagesize;
-          streaming_pre_pos_enc_size = key_cache_size + value_cache_size;
+                             max_post_pos_enc_pages * kPagesize;
+          streaming_pre_pos_enc_size =
+              num_kv_heads * (qk_dim + v_dim) *
+              BatchConfig::max_requests_per_batch() *
+              round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                             BatchConfig::get_max_tree_depth()) *
+              kPagesize;
         }
         break;
       }
@@ -561,19 +565,21 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       // assert that we have enough reserved work space left
       size_t totalSharedSize =
           infer_mode == TREE_VERIFY_MODE
-              ? totalSize - (query_tmp_size + key_cache_size +
-                             value_cache_size + streaming_pre_pos_enc_size + qkv_max_proj_size) *
-                                size_of_dt
-              : totalSize -
-                    (query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size) *
-                        size_of_dt;
+              ? totalSize -
+                    (query_tmp_size + key_cache_size + value_cache_size +
+                     streaming_pre_pos_enc_size + qkv_max_proj_size) *
+                        size_of_dt
+              : totalSize - (query_tmp_size + key_cache_size +
+                             value_cache_size + streaming_pre_pos_enc_size) *
+                                size_of_dt;
 
       size_t instance_size =
           size_of_dt *
           (infer_mode == TREE_VERIFY_MODE
-               ? query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size +
-                     qkv_max_proj_size
-               : query_tmp_size + key_cache_size + value_cache_size + streaming_pre_pos_enc_size);
+               ? query_tmp_size + key_cache_size + value_cache_size +
+                     streaming_pre_pos_enc_size + qkv_max_proj_size
+               : query_tmp_size + key_cache_size + value_cache_size +
+                     streaming_pre_pos_enc_size);
 
       if (quantization_type != DT_NONE) {
         totalSharedSize += quantized_weightSize;
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index a2f78a73a..e437e6c1e 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -103,6 +103,10 @@ int BatchConfig::max_spec_tree_token_num() {
   return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
 }
 
+int BatchConfig::get_max_tree_depth() {
+  return RequestManager::get_request_manager()->get_max_tree_depth();
+}
+
 // Overloading the << operator for the Bitset class
 std::ostream &operator<<(std::ostream &os,
                          BatchConfig::BitMask::Bitset const &bitset) {

From 7e7122959497b2c8c434b2da29d9f4e69950a2f6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 27 Aug 2024 22:47:23 -0700
Subject: [PATCH 415/667] chore: minor

---
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 206f134d4..affedda79 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -420,9 +420,8 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->local_hidden_size * num_new_tokens;
   int const max_num_pages =
-      (BatchConfig::max_sequence_length() +
-       BatchConfig::max_spec_tree_token_num() + kPagesize - 1) /
-      kPagesize;
+      round_up_pages(BatchConfig::max_sequence_length() +
+                     BatchConfig::max_spec_tree_token_num());
   update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
                             min(CUDA_NUM_THREADS, parallelism),
                             0,

From a2041ea3fbe8e8fbbafce12ac12011858246f06a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 28 Aug 2024 11:39:15 -0700
Subject: [PATCH 416/667] feat: add streamingCacheInfo

---
 include/flexflow/batch_config.h               |  2 +
 include/flexflow/config.h                     |  6 ++-
 .../ops/inc_multihead_self_attention.h        |  5 ++-
 include/flexflow/request_manager.h            | 17 ++++++++
 inference/incr_decoding/incr_decoding.cc      | 14 +++++--
 inference/spec_infer/spec_infer.cc            | 14 +++++--
 src/ops/inc_multihead_self_attention.cu       |  4 ++
 src/ops/tree_inc_multihead_self_attention.cu  |  1 +
 src/runtime/batch_config.cc                   | 12 +++++-
 src/runtime/request_manager.cc                | 42 +++++++++++++++++++
 src/runtime/request_manager.cu                | 14 ++++++-
 11 files changed, 119 insertions(+), 12 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 8a89f9c31..a5eecddaf 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -43,6 +43,8 @@ class StreamingCacheInfo {
   StreamingCacheInfo(int sink_cache_size, int window_cache_size);
   StreamingCacheInfo(StreamingCacheInfo const &other);
 
+  StreamingCacheInfo &operator=(StreamingCacheInfo const &other);
+
   void update_cache(int len);
   void reset_cache();
 
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 0e15fc089..be23d8d5e 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -85,8 +85,10 @@ struct FFHandler {
 
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-      sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
-      sizeof(BatchConfig::committed_tokens) + sizeof(int);
+      sizeof(BatchConfig::request_available) +
+      sizeof(BatchConfig::streamingCacheInfo) +
+      sizeof(BatchConfig::causalMask) + sizeof(BatchConfig::committed_tokens) +
+      sizeof(int);
 
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index f0b764cf7..246784dea 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_H
 
 #include "flexflow/accessor.h"
+#include "flexflow/batch_config.h"
 #include "flexflow/device.h"
 #include "flexflow/fftype.h"
 #include "flexflow/inference.h"
@@ -192,13 +193,15 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   void *kvCache;
   bool streaming_cache;
   // When enable Streaming cache, we alter relative position each iteration, so
-  // we need below memory buffer for storing the pre-pos-encoding key value.
+  // we need below memory buffer for storing the pre-pos-encoding key value in
+  // sink and window.
   void *streamingPrePosEnc;
   void *attn_heads;
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
   BatchConfig::PerRequestInfo *request_infos;
   bool *request_available;
+  StreamingCacheInfo *streaming_cache_infos;
   DataType quantization_type;
   bool offload;
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5c6f6b6e0..472b43b52 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -120,6 +120,20 @@ struct Request {
         : from_index(from_index), to_index(to_index), token_id(token_id) {}
   };
   std::vector<CommittedToken> committed_tokens;
+
+  // Enabling Streaming KVCache means we doesn't store the whole KV sequence of
+  // the tokens in a request. Instead, we only store the sink cache (a few
+  // foremost tokens) and the window cache (rolling-updated backmost tokens
+  // through decoding). Currently, we only use streaming cache in the *draft
+  // model* calculation.
+  // - Maintain the streaming cache: During inference, we
+  // first fill up the sink cache then the window cache. After the window cache
+  // is full, we move back to the beginning of the window cache and commit the
+  // tokens in replace there.
+  // - When to update the streaming cache:
+  // 1. Prefilling phase
+  // 2. Committing phase after the target model verification
+  StreamingCacheInfo streaming_cache_info;
 };
 
 class TokenTreeNode {
@@ -244,6 +258,7 @@ class RequestManager {
   int get_max_tree_width();
   void set_max_tree_width(int max_tree_width);
   void set_speculative_sampling(bool speculative_sampling);
+  void set_streaming_cache(bool streaming_cache);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -318,6 +333,8 @@ class RequestManager {
   DecodingMode decoding_mode;
   PrefillModel prefill_model;
   bool speculative_sampling = false;
+  // specify if enable streaming cache for incremental decoding or draft model
+  bool streaming_cache = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 82d31bf2a..09663a22b 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -48,7 +48,8 @@ void parse_input_args(char **argv,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_sequence_length,
-                      int &sampling_seed) {
+                      int &sampling_seed,
+                      bool &enable_streaming_cache) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -110,6 +111,10 @@ void parse_input_args(char **argv,
       sampling_seed = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--enable-streaming-cache")) {
+      enable_streaming_cache = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -144,6 +149,7 @@ void FlexFlow::top_level_task(Task const *task,
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
+  bool enable_streaming_cache = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -160,7 +166,8 @@ void FlexFlow::top_level_task(Task const *task,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_sequence_length,
-                   sampling_seed);
+                   sampling_seed,
+                   enable_streaming_cache);
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -226,6 +233,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tree_depth(8);
   rm->set_max_tree_width(16);
   rm->set_verbose(verbose);
+  rm->set_streaming_cache(enable_streaming_cache);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_id, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
@@ -237,7 +245,7 @@ void FlexFlow::top_level_task(Task const *task,
                               weights_filepath,
                               INC_DECODING_MODE,
                               generationConfig,
-                              false,
+                              enable_streaming_cache,
                               use_full_precision);
   } else if (model_type == ModelType::OPT) {
     OPT::create_opt_model(model,
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index e57528ad4..5bdc0cd12 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -69,7 +69,8 @@ void parse_input_args(char **argv,
                       int &expansion_degree,
                       bool &spec_sampling,
                       bool &do_sample,
-                      int &sampling_seed) {
+                      int &sampling_seed,
+                      bool &enable_streaming_cache) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -153,6 +154,10 @@ void parse_input_args(char **argv,
       do_sample = true;
       continue;
     }
+    if (!strcmp(argv[i], "--enable-streaming-cache")) {
+      enable_streaming_cache = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -317,6 +322,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool spec_sampling = false;
   bool do_sample = false;
   int sampling_seed = 0;
+  bool enable_streaming_cache = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -336,7 +342,8 @@ void FlexFlow::top_level_task(Task const *task,
                    expansion_degree,
                    spec_sampling,
                    do_sample,
-                   sampling_seed);
+                   sampling_seed,
+                   enable_streaming_cache);
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -356,6 +363,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
+  rm->set_streaming_cache(enable_streaming_cache);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
                          model_metadata.eos_token_id,
@@ -419,7 +427,7 @@ void FlexFlow::top_level_task(Task const *task,
                                 model_metadata.ssm_model_weights_paths[ssm_id],
                                 TREE_SEARCH_MODE,
                                 generationConfig,
-                                false,
+                                enable_streaming_cache,
                                 use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) {
       OPT::create_opt_model(beam_model,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 6ab7ca36a..d717eb9a4 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -622,6 +622,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     request_available = reinterpret_cast<bool *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo));
+    streaming_cache_infos = reinterpret_cast<StreamingCacheInfo *>(
+        reinterpret_cast<char *>(handler.batch_config_metadata) +
+        sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
+        sizeof(BatchConfig::request_available));
 
     if (offload) {
       // token_infos =
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index ddd738a0d..1b302e32c 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -638,6 +638,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
             sizeof(BatchConfig::request_available) +
+            sizeof(BatchConfig::streamingCacheInfo) +
             sizeof(BatchConfig::causalMask));
     num_tokens_to_commit = reinterpret_cast<int *>(
         reinterpret_cast<char *>(committed_token_infos) +
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index e437e6c1e..ea07902bf 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -268,8 +268,18 @@ StreamingCacheInfo::StreamingCacheInfo(StreamingCacheInfo const &other)
       window_cache_size(other.window_cache_size),
       window_back(other.window_back), commit_len(other.commit_len) {}
 
+StreamingCacheInfo &
+    StreamingCacheInfo::operator=(StreamingCacheInfo const &other) {
+  sink_cache_size = other.sink_cache_size;
+  window_cache_size = other.window_cache_size;
+  window_back = other.window_back;
+  commit_len = other.commit_len;
+  return *this;
+}
+
 // For draft model, we only update the cache when prefill or
-// commit the verified result from target model
+// commit the verified result from target model;
+// For incremental decoding, we update the cache both in prefill and decoding
 void StreamingCacheInfo::update_cache(int len) {
   commit_len += len;
   if (commit_len <= sink_cache_size + window_cache_size) {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3a6619b37..a576ec43a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -206,6 +206,10 @@ void RequestManager::set_speculative_sampling(bool speculative_sampling_) {
   speculative_sampling = speculative_sampling_;
 }
 
+void RequestManager::set_streaming_cache(bool streaming_cache_) {
+  streaming_cache = streaming_cache_;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -301,6 +305,9 @@ RequestManager::RequestGuid
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
+    request.streaming_cache_info = StreamingCacheInfo(
+        BatchConfig::SINK_SIZE,
+        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
   }
 
   pending_request_queue.push(request);
@@ -360,6 +367,9 @@ RequestManager::RequestGuid
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
+    request.streaming_cache_info = StreamingCacheInfo(
+        BatchConfig::SINK_SIZE,
+        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
   }
 
   pending_request_queue.push(request);
@@ -724,6 +734,10 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
+  if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
+    prefill_request->streaming_cache_info.update_cache(
+        prefill_request->num_tokens_in_batch);
+  }
 
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
@@ -768,6 +782,9 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
     request.llm_cache_size++;
+    if (streaming_cache) {
+      request.streaming_cache_info.update_cache(1);
+    }
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
 
@@ -800,6 +817,10 @@ void RequestManager::update_ssm_prefill_results(
   // request_manager_status is PREFILLING and the prefill_model is SSM.
   // There's no results to update, but we should update ssm_cache_size.
   prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
+  if (streaming_cache) {
+    prefill_request->streaming_cache_info.update_cache(
+        prefill_request->num_tokens_in_batch);
+  }
 
   profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
 }
@@ -881,6 +902,9 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       get_max_tokens_per_batch(),
       (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
 
+  // Copy the streaming cache info
+  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
+
   prefill_request->first_token_offset_in_batch = 0;
   prefill_request->num_tokens_in_batch =
       bc.requestsInfo[request_index].num_tokens_in_batch;
@@ -935,6 +959,9 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
       get_max_tokens_per_batch(),
       (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
 
+  // Copy the streaming cache info
+  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
+
   prefill_request->first_token_offset_in_batch = 0;
   prefill_request->num_tokens_in_batch =
       bc.requestsInfo[request_index].num_tokens_in_batch;
@@ -992,6 +1019,9 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens;
     bc.requestsInfo[request_index].num_tokens_in_batch = 1;
 
+    // Copy the streaming cache info
+    bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+
     request.first_token_offset_in_batch = bc.num_tokens;
     request.num_tokens_in_batch = 1;
 
@@ -1100,6 +1130,9 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
 
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+
     if (profiling_requests[guid].ssm_decoding_steps == 0) {
       profiling_requests[guid].start_decoding_time =
           Realm::Clock::current_time_in_microseconds();
@@ -1190,6 +1223,9 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
     // Copy the causal mask, it should already been updated by
     // update_ssm_inference_results
     new_bc.causalMask[request_index] = request.causal_mask;
+
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
   }
 
   if (verbose) {
@@ -1292,6 +1328,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // Create the causal mask for the large model based on the small model
     // causal mask.
     new_bc.causalMask[request_index] = create_llm_bitmask(guid);
+
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
   }
 
   if (verbose) {
@@ -1430,6 +1469,9 @@ bool RequestManager::update_ssm_inference_results(
 
     if (current_ssm_step == 1) {
       request.ssm_cache_size = request.tokens.size();
+      if (streaming_cache) {
+        request.streaming_cache_info.update_cache(request.num_tokens_in_batch);
+      }
     }
 
     if (current_ssm_step == 1) {
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 04f18b1c3..c98d54727 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -231,6 +231,14 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::request_available);
 
+  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
+                                total_copy_size,
+                            &(batch_config->streamingCacheInfo),
+                            sizeof(BatchConfig::streamingCacheInfo),
+                            cudaMemcpyHostToDevice,
+                            stream));
+  total_copy_size += sizeof(BatchConfig::streamingCacheInfo);
+
   // load attention metadata
   if (batch_config->get_mode() == INC_DECODING_MODE) {
     if (handle.incr_attention_metadata->enabled()) {
@@ -388,7 +396,8 @@ void RequestManager::load_batch_config_task(
                 static_cast<char *>(handle.batch_config_metadata) +
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
-                sizeof(BatchConfig::request_available));
+                sizeof(BatchConfig::request_available)) +
+                sizeof(BatchConfig::streamingCacheInfo);
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +
@@ -553,7 +562,8 @@ void RequestManager::load_batch_config_task(
                 static_cast<char *>(handle.batch_config_metadata) +
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
-                sizeof(BatchConfig::request_available));
+                sizeof(BatchConfig::request_available)) +
+                sizeof(BatchConfig::streamingCacheInfo);
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +

From 694cedfad51ce1e845c43d374e353285cb69e5e2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 28 Aug 2024 15:17:11 -0700
Subject: [PATCH 417/667] feat: apply_pos_encoding & update_qkv_cache, add
 offset control

---
 .../inc_multihead_self_attention_kernels.cu   | 50 +++++++++++++++----
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index affedda79..4f9d5e62f 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -12,9 +12,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "flexflow/batch_config.h"
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
+#include "flashinfer/pos_enc.cuh"
+#include "flexflow/attention_config.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/kernels/decompress_kernels.h"
@@ -28,6 +31,9 @@ namespace FlexFlow {
 using Legion::coord_t;
 using Legion::Memory;
 
+using flashinfer::BatchQKApplyLlama31Rotary;
+using flashinfer::BatchQKApplyRotary;
+
 #define WARP_SIZE 32
 
 namespace Kernels {
@@ -293,11 +299,14 @@ template <typename DT>
 __global__ void
     apply_pos_encoding_kernel(DT *input_ptr,
                               cuFloatComplex *complex_input,
+                              BatchConfig::PerRequestInfo const *requestInfos,
                               BatchConfig::PerTokenInfo const *tokenInfos,
                               int qk_dim,
                               int num_tokens,
                               size_t q_array_size,
-                              int hidden_size) {
+                              int hidden_size,
+                              bool streaming_cache,
+                              StreamingCacheInfo const *streaming_cache_infos) {
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
     // create complex number
     bool q_tensor = i < (q_array_size / 2);
@@ -327,6 +336,13 @@ __global__ void
     // size_t pos = id_map[token_idx].token_position;
     size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
+    // relative position should be calculated based on current streaming size
+    if (streaming_cache) {
+      int req_idx = tokenInfos[token_idx].request_index;
+      pos += streaming_cache_infos[req_idx].commit_len -
+             requestInfos[req_idx].first_token_index_in_request;
+    }
+
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
     float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
@@ -342,6 +358,10 @@ __global__ void
   }
 }
 
+// Apply position embedding for qk.
+// Note that this is only used for tokens in the current batch.
+// For other Key tokens like in streaming cache, we nned other kernel to apply
+// the position embedding.
 template <typename DT>
 void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
@@ -359,25 +379,30 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                               0,
                               stream>>>(output_ptr,
                                         m->complex_input,
+                                        m->request_infos,
                                         m->token_infos,
                                         m->qk_dim,
                                         num_tokens,
                                         q_array_size,
-                                        m->local_hidden_size);
+                                        m->local_hidden_size,
+                                        m->streaming_cache,
+                                        m->streaming_cache_infos);
 }
 
 template <typename DT>
 __global__ void
     update_qkv_cache_kernel(DT *devQKVProjArray,
                             half *qTmp_ptr,
-                            half *kCache_ptr,
+                            half *kvCache_ptr,
                             BatchConfig::PerTokenInfo const *tokenInfos,
-                            BatchConfig::PerRequestInfo *request_infos,
+                            BatchConfig::PerRequestInfo *requestInfos,
                             int const max_num_pages,
                             int num_q_heads,
                             int num_kv_heads,
                             int head_dim,
-                            int num_new_tokens) {
+                            int num_new_tokens,
+                            bool streaming_cache,
+                            StreamingCacheInfo const *streaming_cache_infos) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
@@ -389,7 +414,12 @@ __global__ void
   }
 
   int const req_idx = tokenInfos[token_idx].request_index;
-  int const token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+  int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  if (streaming_cache) {
+    token_abs_idx += streaming_cache_infos[req_idx].commit_len -
+                     requestInfos[req_idx].first_token_index_in_request;
+  }
 
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
@@ -404,9 +434,9 @@ __global__ void
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
         offset / head_dim * stride * head_dim + offset % head_dim;
-    kCache_ptr[to_k_idx + offset] = static_cast<half>(
+    kvCache_ptr[to_k_idx + offset] = static_cast<half>(
         devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
-    kCache_ptr[to_v_idx + offset] =
+    kvCache_ptr[to_v_idx + offset] =
         static_cast<half>(devQKVProjArray[from_idx + q_hidden_size +
                                           temp_kv_hidden_size + kv_offset]);
   }
@@ -434,7 +464,9 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
                                       m->num_q_heads,
                                       m->num_kv_heads,
                                       m->qk_dim,
-                                      num_new_tokens);
+                                      num_new_tokens,
+                                      m->streaming_cache,
+                                      m->streaming_cache_infos);
 }
 
 template <typename DT>

From 810721e6fc9b03ed42619bc3626f65ae548b2855 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 28 Aug 2024 15:35:55 -0700
Subject: [PATCH 418/667] chore: minor rename

---
 include/flexflow/ops/inc_multihead_self_attention.h    |  2 +-
 .../ops/kernels/inc_multihead_self_attention_kernels.h |  2 +-
 src/ops/inc_multihead_self_attention.cu                |  4 ++--
 .../kernels/inc_multihead_self_attention_kernels.cu    | 10 +++++-----
 src/ops/spec_inc_multihead_self_attention.cu           |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu           |  2 +-
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 246784dea..8db1c072d 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -195,7 +195,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   // When enable Streaming cache, we alter relative position each iteration, so
   // we need below memory buffer for storing the pre-pos-encoding key value in
   // sink and window.
-  void *streamingPrePosEnc;
+  void *streamingPrePosEncBuf;
   void *attn_heads;
   char *quantized_weight_ptr;
   BatchConfig::PerTokenInfo *token_infos;
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index d8f70db8b..d36f0c123 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -61,7 +61,7 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                         cudaStream_t stream);
 
 template <typename DT>
-void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                       BatchConfig const *bc,
                       cudaStream_t stream);
 
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index d717eb9a4..119f588c8 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -259,7 +259,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
   apply_pos_encoding(m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   // phase 2: Update key/val cache
-  update_qkv_cache<DT>(m, bc, stream);
+  update_qkv_in_batch<DT>(m, bc, stream);
 
   // cudaEventRecord(t_end, stream);
   // checkCUDA(cudaEventSynchronize(t_end));
@@ -609,7 +609,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     kvCache = gpu_mem_allocator.allocate_instance_untyped(
         (key_cache_size + value_cache_size) * size_of_dt);
     if (streaming_pre_pos_enc_size > 0) {
-      streamingPrePosEnc = gpu_mem_allocator.allocate_instance_untyped(
+      streamingPrePosEncBuf = gpu_mem_allocator.allocate_instance_untyped(
           streaming_pre_pos_enc_size * size_of_dt);
     }
     outputTmp = gpu_mem_allocator.allocate_instance<half>(output_tmp_size);
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 4f9d5e62f..86a3ab3ff 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -391,7 +391,7 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 __global__ void
-    update_qkv_cache_kernel(DT *devQKVProjArray,
+    update_qkv_in_batch_kernel(DT *devQKVProjArray,
                             half *qTmp_ptr,
                             half *kvCache_ptr,
                             BatchConfig::PerTokenInfo const *tokenInfos,
@@ -443,7 +443,7 @@ __global__ void
 }
 
 template <typename DT>
-void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                       BatchConfig const *bc,
                       cudaStream_t stream) {
   // update the kv cache, compact the q array
@@ -452,7 +452,7 @@ void update_qkv_cache(IncMultiHeadSelfAttentionMeta const *m,
   int const max_num_pages =
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
-  update_qkv_cache_kernel<<<GET_BLOCKS(parallelism),
+  update_qkv_in_batch_kernel<<<GET_BLOCKS(parallelism),
                             min(CUDA_NUM_THREADS, parallelism),
                             0,
                             stream>>>(static_cast<DT *>(m->devQKVProjArray),
@@ -672,12 +672,12 @@ template void Kernels::IncMultiHeadAttention::apply_pos_encoding<half>(
     half *output_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_cache<float>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_cache<half>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream);
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 5ee745a7a..ea4c17a5b 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -257,7 +257,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
                      stream);
 
   // phase 2: Update key/val cache
-  update_qkv_cache<DT>(m, bc, stream);
+  update_qkv_in_batch<DT>(m, bc, stream);
 
   // phase 3: Compute attention score
   // 3 kernels for pahse 3: matmul1 - softmax - matmal2
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 1b302e32c..b1a420b2c 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -420,7 +420,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventRecord(t_start, stream);
 
   // Update key-val cache, compact q array
-  update_qkv_cache<DT>(m, bc, stream);
+  update_qkv_in_batch<DT>(m, bc, stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));

From ced5e34d21d8fc0c6f71a5a72144a96adf581881 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Thu, 29 Aug 2024 16:05:49 -0400
Subject: [PATCH 419/667] Modified the scheduling algorithm.

---
 include/flexflow/request_manager.h | 178 +++++++-----
 src/runtime/request_manager.cc     | 449 +++++++++++++++++------------
 2 files changed, 365 insertions(+), 262 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 5c6f6b6e0..95eee576b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -57,6 +57,77 @@ class InferenceManager {
   std::unordered_map<FFModel *, FileDataLoader *> model_weights_loaders;
 };
 
+class TokenTreeNode {
+public:
+  BatchConfig::TokenId id;
+  float log_accumulated_prob;
+  int parent_pos;
+  bool included = false;
+  bool gumbel = false;
+  float gumbel_logit = 0.0f;
+
+  TokenTreeNode(BatchConfig::TokenId id,
+                float log_accumulated_prob,
+                int parent_pos,
+                bool gumbel = false,
+                float gumbel_logit = 0.0f)
+      : id(id), log_accumulated_prob(log_accumulated_prob),
+        parent_pos(parent_pos), gumbel(gumbel), gumbel_logit(gumbel_logit) {}
+};
+
+bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
+               std::shared_ptr<TokenTreeNode> const &rhs);
+
+bool operator<=(std::shared_ptr<TokenTreeNode> const &lhs,
+                std::shared_ptr<TokenTreeNode> const &rhs);
+
+// A comparator for std::shared_ptr<TokenTreeNode>
+// This is used to sort the token tree nodes in ascending order
+struct SharedTokenTreeNodePtrLess {
+  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
+                  std::shared_ptr<TokenTreeNode> const &rhs) const {
+    if (lhs->gumbel) {
+      assert(rhs->gumbel);
+      return lhs->gumbel_logit < rhs->gumbel_logit;
+    }
+    return lhs->log_accumulated_prob < rhs->log_accumulated_prob;
+  }
+};
+
+// A comparator for std::shared_ptr<TokenTreeNode>
+// This is used in to sort the token tree nodes in descending order
+struct SharedTokenTreeNodePtrGreater {
+  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
+                  std::shared_ptr<TokenTreeNode> const &rhs) const {
+    if (lhs->gumbel) {
+      assert(rhs->gumbel);
+      return lhs->gumbel_logit > rhs->gumbel_logit;
+    }
+    return lhs->log_accumulated_prob > rhs->log_accumulated_prob;
+  }
+};
+
+class TokenTree {
+public:
+  std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  // The numebr of tokens in the tree that are not pruned
+  int tree_size = 0;
+  // The numebr of tokens in the tree including the pruned ones
+
+  void add_layer() {
+    tree_layers.emplace_back();
+  }
+
+  void clear() {
+    tree_layers.clear();
+    tree_size = 0;
+  }
+
+  TokenTree() : tree_size(0) {}
+};
+
+std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree);
+
 struct Request {
   enum Status {
     PENDING = 101,   // loading prompt
@@ -68,6 +139,8 @@ struct Request {
   int batch_index = -1;
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
+  double slo_ratio = 1.0;
+  double decode_latency_ms = 0.0;
 
   int first_token_offset_in_batch = 0;
   int num_tokens_in_batch = 0;
@@ -120,81 +193,33 @@ struct Request {
         : from_index(from_index), to_index(to_index), token_id(token_id) {}
   };
   std::vector<CommittedToken> committed_tokens;
-};
-
-class TokenTreeNode {
-public:
-  BatchConfig::TokenId id;
-  float log_accumulated_prob;
-  int parent_pos;
-  bool pruned = false;
-  bool gumbel = false;
-  float gumbel_logit = 0.0f;
-
-  TokenTreeNode(BatchConfig::TokenId id,
-                float log_accumulated_prob,
-                int parent_pos,
-                bool gumbel = false,
-                float gumbel_logit = 0.0f)
-      : id(id), log_accumulated_prob(log_accumulated_prob),
-        parent_pos(parent_pos), gumbel(gumbel), gumbel_logit(gumbel_logit) {}
-};
 
-bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
-               std::shared_ptr<TokenTreeNode> const &rhs);
+  std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                      std::vector<std::shared_ptr<TokenTreeNode>>,
+                      SharedTokenTreeNodePtrLess>
+      token_tree_nodes_pq;
 
-bool operator<=(std::shared_ptr<TokenTreeNode> const &lhs,
-                std::shared_ptr<TokenTreeNode> const &rhs);
-
-// A comparator for std::shared_ptr<TokenTreeNode>
-// This is used in to sort the token tree nodes in descending order
-struct CompareSharedTokenTreeNodePtr {
-  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
-                  std::shared_ptr<TokenTreeNode> const &rhs) const {
-    if (lhs->gumbel) {
-      assert(rhs->gumbel);
-      return lhs->gumbel_logit < rhs->gumbel_logit;
-    }
-    return lhs->log_accumulated_prob < rhs->log_accumulated_prob;
-  }
+  double get_length_weight();
+  void set_slo_ratio(double slo_ratio_);
+  double get_slo_ratio();
 };
 
 // A comparator for std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
 // This is used to sort the token tree nodes in ascending order
-struct CompareSharedTokenTreeNodePtrRequestGuidPair {
-  bool operator()(std::pair<std::shared_ptr<TokenTreeNode>,
-                            BatchConfig::RequestGuid> const &lhs,
-                  std::pair<std::shared_ptr<TokenTreeNode>,
-                            BatchConfig::RequestGuid> const &rhs) const {
+struct SharedTokenTreeNodePtrRequestGreater {
+  bool operator()(
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &lhs,
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &rhs) const {
     if (lhs.first->gumbel) {
       assert(rhs.first->gumbel);
-      return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
+      return lhs.first->gumbel_logit * lhs.second.get_length_weight() >
+             rhs.first->gumbel_logit * rhs.second.get_length_weight();
     }
-    return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
-  }
-};
-
-class TokenTree {
-public:
-  std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
-  // The numebr of tokens in the tree that are not pruned
-  int tree_size = 0;
-  // The numebr of tokens in the tree including the pruned ones
-
-  void add_layer() {
-    tree_layers.emplace_back();
+    return lhs.first->log_accumulated_prob * lhs.second.get_length_weight() >
+           rhs.first->log_accumulated_prob * rhs.second.get_length_weight();
   }
-
-  void clear() {
-    tree_layers.clear();
-    tree_size = 0;
-  }
-
-  TokenTree() : tree_size(0) {}
 };
 
-std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree);
-
 class RequestManager {
 public:
   enum State {
@@ -244,6 +269,14 @@ class RequestManager {
   int get_max_tree_width();
   void set_max_tree_width(int max_tree_width);
   void set_speculative_sampling(bool speculative_sampling);
+  void set_baseline_latency(double baseline_latency_ms);
+  double get_baseline_latency();
+  void set_ssm_spec_latency(double ssm_spec_latency_ms);
+  double get_ssm_spec_latency();
+  void set_llm_verify_latency(double llm_verify_latency_ms);
+  double get_llm_verify_latency();
+  void set_correction_factor(double correction_factor);
+  double get_correction_factor();
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -313,6 +346,12 @@ class RequestManager {
   int max_tree_depth;
   int max_tree_width;
   int k;
+  // Profile based latency
+  double baseline_latency_ms;
+  double ssm_spec_latency_ms;
+  double llm_verify_latency_ms;
+  double correction_factor = 1.05;
+
   State request_manager_status;
   BackgroundServerStatus background_server_status;
   DecodingMode decoding_mode;
@@ -345,14 +384,6 @@ class RequestManager {
   int num_available_requests = 0;
   int ssm_completed = true;
 
-  // This is a helper data structure to store help the pruning of the token
-  // trees across different requests.
-  // TODO: clear this in the first step of the speculation!
-  std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      CompareSharedTokenTreeNodePtrRequestGuidPair>
-      token_tree_node_pool;
   // rm state
   std::mutex rm_state_mutex;
 
@@ -436,8 +467,11 @@ class RequestManager {
   void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
                                    BatchConfig::TokenId token_id);
-  bool add_tokens_to_spec_token_tree(
+  void add_tokens_to_spec_token_tree(
       InferenceResult const &ssm_inference_result);
+  void prune_token_tree();
+  void add_tokens_toward_slo(RequestGuid guid, int &budget);
+  void add_tokens_toward_memory_occupancy(int budget);
   /* ---------- Spec Decoding Helper Functions ---------- */
   void renormalize(std::vector<std::pair<TokenId, float>> &D,
                    std::unordered_map<TokenId, float> &R,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3a6619b37..19700769d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -83,6 +83,18 @@ std::string LoadBytesFromFile(std::string const &path) {
   return data;
 }
 
+double Request::get_length_weight() {
+  double coeff_alpha = 128;
+  return log((double(tokens.size()) + coeff_alpha) / coeff_alpha);
+}
+
+void Request::set_slo_ratio(double slo_ratio_) {
+  slo_ratio = slo_ratio_;
+}
+double Request::get_slo_ratio() {
+  return slo_ratio;
+}
+
 RequestManager::RequestManager()
     : background_server_status(INITIALIZED), verbose(false),
       next_available_guid(1000000), num_processed_requests(0),
@@ -206,6 +218,38 @@ void RequestManager::set_speculative_sampling(bool speculative_sampling_) {
   speculative_sampling = speculative_sampling_;
 }
 
+void RequestManager::set_baseline_latency(double baseline_latency_ms_) {
+  baseline_latency_ms = baseline_latency_ms_;
+}
+
+double RequestManager::get_baseline_latency() {
+  return baseline_latency_ms;
+}
+
+void RequestManager::set_ssm_spec_latency(double ssm_spec_latency_ms_) {
+  ssm_spec_latency_ms = ssm_spec_latency_ms_;
+}
+
+double RequestManager::get_ssm_spec_latency() {
+  return ssm_spec_latency_ms;
+}
+
+void RequestManager::set_llm_verify_latency(double llm_verify_latency_ms_) {
+  llm_verify_latency_ms = llm_verify_latency_ms_;
+}
+
+double RequestManager::get_llm_verify_latency() {
+  return llm_verify_latency_ms;
+}
+
+void RequestManager::set_correction_factor(double correction_factor_) {
+  correction_factor = correction_factor_;
+}
+
+double RequestManager::get_correction_factor() {
+  return correction_factor;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -1270,7 +1314,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     int layer_index = 0;
     for (auto const &tree_layer : token_tree.tree_layers) {
       for (auto const &tree_node : tree_layer) {
-        if (tree_node->pruned == false) {
+        if (tree_node->included == true) {
           new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.tokens.size() - 1 + token_tree_index;
@@ -1343,18 +1387,11 @@ bool RequestManager::update_llm_verify_results(
     get_verify_results_greedy(llm_verify_result);
   }
 
-  profiling.llm_step_times.push_back(
-      (Realm::Clock::current_time_in_microseconds() -
-       profiling.llm_step_start) *
-      1e-3);
+  long long int current_time = Realm::Clock::current_time_in_microseconds();
+  profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) *
+                                     1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
 
-  // Clear the token tree node pool
-  token_tree_node_pool = std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      CompareSharedTokenTreeNodePtrRequestGuidPair>();
-
   bool request_completed = false;
 
   // Iterate over the requests
@@ -1372,6 +1409,9 @@ bool RequestManager::update_llm_verify_results(
       std::cout << request.speculative_token_trees[0];
     }
 
+    request.decode_latency_ms =
+        (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
+
     // Initialize the token tree for the request
     init_token_tree(guid);
     assert(!request.committed_tokens.empty() &&
@@ -1392,6 +1432,12 @@ bool RequestManager::update_llm_verify_results(
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index);
+    } else if (request.decode_latency_ms > request.tokens.size() *
+                                               baseline_latency_ms *
+                                               request.get_slo_ratio()) {
+      // The request violates the SLO, drop that request
+      request_completed = true;
+      request_complete_clean_up(request_index);
     } else {
       update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
     }
@@ -1409,14 +1455,10 @@ bool RequestManager::update_ssm_inference_results(
   assert(current_ssm_step >= 1 &&
          "The current speculation step should be no less than 1");
 
-  int num_branches = BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-  int result_index = 0;
-
   // Here we assume that the order of the tokens in the last
   // BatchConfig and hence the last InferenceResult is equal to
   // the order of the request in the last BatchConfig
-  bool all_request_last_layer_empty =
-      add_tokens_to_spec_token_tree(ssm_inference_result);
+  add_tokens_to_spec_token_tree(ssm_inference_result);
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -1441,16 +1483,16 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  if (all_request_last_layer_empty or
-      current_ssm_step == get_max_tree_depth()) {
+  if (current_ssm_step == get_max_tree_depth()) {
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
         (Realm::Clock::current_time_in_microseconds() -
          profiling.ssm_step_start) *
         1e-3);
     profiling.ssm_steps.push_back(current_ssm_step);
+    return true;
   }
-  return all_request_last_layer_empty;
+  return false;
 }
 
 /* --------- Bitmask Related Functions --------- */
@@ -1564,7 +1606,7 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   for (auto const &tree_layer : token_tree.tree_layers) {
     for (auto const &tree_node : tree_layer) {
       current_layer_abs_index.push_back(abs_index_in_tree);
-      if (tree_node->pruned == false) {
+      if (tree_node->included == true) {
         if (abs_index_in_tree == 0) {
           // The root token, set itself
           llm_bitmask.bit_mask[0].set_bit(0);
@@ -1720,7 +1762,7 @@ void RequestManager::get_verify_results_sample(
       // Iterate through the tokens in the current layer to find the candidate
       // tokens whose parent is the last accepted token
       for (auto const &node_ptr : tree_layer) {
-        if (node_ptr->pruned) {
+        if (!node_ptr->included) {
           // Don't increase current_token_index here
           current_token_index_in_layer++;
           continue;
@@ -1855,7 +1897,7 @@ void RequestManager::get_verify_results_greedy(
       int current_token_index_in_layer = 0;
 
       for (auto const &node_ptr : tree_layer) {
-        if (node_ptr->pruned) {
+        if (!node_ptr->included) {
           current_token_index_in_layer++;
           continue;
         }
@@ -2322,12 +2364,12 @@ void RequestManager::add_root_to_spec_token_tree(
   if (speculative_sampling) {
     node_ptr->gumbel = true;
   }
-  token_tree_node_pool.push(std::make_pair(node_ptr, guid));
   speculative_token_tree.tree_layers.front().push_back(node_ptr);
   speculative_token_tree.tree_size++;
+  request.token_tree_nodes_pq.push(node_ptr);
 }
 
-bool RequestManager::add_tokens_to_spec_token_tree(
+void RequestManager::add_tokens_to_spec_token_tree(
     InferenceResult const &ssm_inference_result) {
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
@@ -2342,218 +2384,245 @@ bool RequestManager::add_tokens_to_spec_token_tree(
 
     int parent_num = request.num_tokens_in_batch;
     if (parent_num == 0) {
-      // The request has no committed tokens, we don't need to add tokens to
-      // the token tree
       continue;
     }
     int result_offset = request.first_token_offset_in_batch *
                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-    int current_tree_size = request.causal_mask.tree_or_prompt_size;
-    int empty_slots_in_layer =
-        min(get_max_spec_tree_token_num() - current_tree_size,
-            get_max_tree_width()); // The number of empty slots
-
-    if (empty_slots_in_layer == 0) {
-      // The token tree is full, we don't need to add tokens to it
-      continue;
-    }
-
-    bool token_pool_full =
-        token_tree_node_pool.size() >= get_max_tokens_per_batch();
-
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();
-    std::set<std::shared_ptr<TokenTreeNode>, CompareSharedTokenTreeNodePtr>
-        tokens;
+    std::set<std::shared_ptr<TokenTreeNode>, SharedTokenTreeNodePtrLess> tokens;
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
-      if (!parent_ptr->pruned) {
-        // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
-        float parent_log_prob = parent_ptr->log_accumulated_prob;
-        int child_start_idx =
-            result_offset +
-            parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-        // TODO: rename child_probs to child_logits after change the output of
-        // argmax from prob to logprob
-        std::vector<std::pair<float, int>> child_probs(
-            BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
-        for (int child_pos = 0;
-             child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-             child_pos++) {
-          int result_idx = child_start_idx + child_pos;
-          if (!speculative_sampling) {
-            // TODO: the argmax will return log prob instead of prob
-            if (log(ssm_inference_result.probs[result_idx]) !=
-                -std::numeric_limits<float>::infinity()) {
-              child_probs[child_pos] = std::make_pair(
-                  log(ssm_inference_result.probs[result_idx]), result_idx);
-            }
-          } else {
-            // Use gumbel perturbed logits here
-            // TODO: handle the case when the child logit is -inf
+      // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
+      float parent_log_prob = parent_ptr->log_accumulated_prob;
+      int child_start_idx =
+          result_offset +
+          parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      // TODO: rename child_probs to child_logits after change the output of
+      // argmax from prob to logprob
+      std::vector<std::pair<float, int>> child_probs(
+          BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+      for (int child_pos = 0;
+           child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+           child_pos++) {
+        int result_idx = child_start_idx + child_pos;
+        if (!speculative_sampling) {
+          // TODO: the argmax will return log prob instead of prob
+          if (log(ssm_inference_result.probs[result_idx]) !=
+              -std::numeric_limits<float>::infinity()) {
             child_probs[child_pos] = std::make_pair(
-                ssm_inference_result.gumbel_logits[result_idx], result_idx);
+                log(ssm_inference_result.probs[result_idx]), result_idx);
           }
+        } else {
+          // Use gumbel perturbed logits here
+          // TODO: handle the case when the child logit is -inf
+          // TODO: this branch is not tested
+          child_probs[child_pos] = std::make_pair(
+              ssm_inference_result.gumbel_logits[result_idx], result_idx);
         }
-        // Sort in descending order
-        std::sort(child_probs.begin(),
-                  child_probs.end(),
-                  std::greater<std::pair<float, int>>());
+      }
+      // Sort in descending order
+      std::sort(child_probs.begin(),
+                child_probs.end(),
+                std::greater<std::pair<float, int>>());
+      if (speculative_sampling) {
+        // TODO: this branch is not tested
+        // Condition the gumbel perturbed logits on the maximum
+        gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
+      }
+
+      for (auto const &child_prob : child_probs) {
+        float logit = child_prob.first;
+        // The value used to compare between tokens
+        float accumulated_log_prob = logit + parent_log_prob;
+        float gumbel_logit = 0.0f;
+        float cmp_value;
         if (speculative_sampling) {
-          // Condition the gumbel perturbed logits on the maximum
-          gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
+          cmp_value = gumbel_logit = logit;
+        } else {
+          cmp_value = accumulated_log_prob;
         }
-
-        // for (int child_pos = 0;
-        //      child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-        //      child_pos++) {
-        for (auto const &child_prob : child_probs) {
-
-          //   int result_idx =
-          //       result_offset +
-          //       parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES +
-          //       child_pos;
-
-          float logit = child_prob.first;
-          // The value used to compare between tokens
-          float accumulated_log_prob = logit + parent_log_prob;
-          float gumbel_logit = 0.0f;
-          float cmp_value;
+        int result_idx = child_prob.second;
+
+        //   std::cout << "Probability at result index " << result_idx << ":
+        //   "
+        //             << ssm_inference_result.probs[result_idx] << "\t";
+        //   std::cout << "Token id: "
+        //             << ssm_inference_result.token_ids[result_idx] <<
+        //             std::endl;
+        assert(logit != -std::numeric_limits<float>::infinity() &&
+               "Child log probability should not be -inf.");
+
+        if (tokens.size() == max_tree_width and
+            cmp_value <= (speculative_sampling
+                              ? (*tokens.begin())->gumbel_logit
+                              : (*tokens.begin())->log_accumulated_prob)) {
+          // The current layer is full, and the new token has a lower compare
+          // value than the minimum node in tokens, we don't need to add the
+          // new token and the following tokens belong to the same parent to
+          // it, because the tokens are sorted by their compare value
+          break;
+        } else {
+          std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
           if (speculative_sampling) {
-            cmp_value = gumbel_logit = logit;
+            node_ptr = std::make_shared<TokenTreeNode>(
+                ssm_inference_result.token_ids[result_idx],
+                accumulated_log_prob,
+                parent_pos,
+                true,
+                gumbel_logit);
           } else {
-            cmp_value = accumulated_log_prob;
+            node_ptr = std::make_shared<TokenTreeNode>(
+                ssm_inference_result.token_ids[result_idx],
+                accumulated_log_prob,
+                parent_pos);
           }
-          int result_idx = child_prob.second;
-
-          //   std::cout << "Probability at result index " << result_idx << ":
-          //   "
-          //             << ssm_inference_result.probs[result_idx] << "\t";
-          //   std::cout << "Token id: "
-          //             << ssm_inference_result.token_ids[result_idx] <<
-          //             std::endl;
-          assert(logit != -std::numeric_limits<float>::infinity() &&
-                 "Child log probability should not be -inf.");
-
-          if (tokens.size() == empty_slots_in_layer and
-              cmp_value <= (speculative_sampling
-                                ? (*tokens.begin())->gumbel_logit
-                                : (*tokens.begin())->log_accumulated_prob)) {
-            // The token tree is full, and the new token has a lower compare
-            // value than the minimum node in the pool, we don't need to add the
-            // new token and the following tokens belong to the same parent to
-            // the tree, because the tokens are sorted by their compare value
-            break;
-          } else if (token_pool_full and
-                     cmp_value <=
-                         (speculative_sampling
-                              ? token_tree_node_pool.top().first->gumbel_logit
-                              : token_tree_node_pool.top()
-                                    .first->log_accumulated_prob)) {
-            // The token tree is not full, but the token pool is full, and the
-            // new token has a lower compare value than the minimum node
-            // in the pool, we don't need to add the new token and the
-            // following tokens belong to the same parent to the tree, because
-            // the tokens are sorted by their compare value
-            break;
-          } else {
-            std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
-            if (speculative_sampling) {
-              node_ptr = std::make_shared<TokenTreeNode>(
-                  ssm_inference_result.token_ids[result_idx],
-                  accumulated_log_prob,
-                  parent_pos,
-                  true,
-                  gumbel_logit);
-            } else {
-              node_ptr = std::make_shared<TokenTreeNode>(
-                  ssm_inference_result.token_ids[result_idx],
-                  accumulated_log_prob,
-                  parent_pos);
-            }
-            // if (tokens.size() == empty_slots_in_layer and
-            //     cmp_value > (speculative_sampling
-            //                      ? (*tokens.begin())->gumbel_logit
-            //                      : (*tokens.begin())->log_accumulated_prob))
-            //                      {
-            if (tokens.size() == empty_slots_in_layer and
-                *tokens.begin() < node_ptr) {
-              // The token tree is full, and the new token has a higher compare
-              // value than the minimum node in the pool, we need to remove the
-              // minimum node from the pool and add the new token to the tree
-              tokens.erase(tokens.begin());
-            }
-            tokens.insert(node_ptr);
+          if (tokens.size() == max_tree_width) {
+            // The current layer is full, and the new token has a higher compare
+            // value than the minimum node in tokens, we need to remove the
+            // minimum node from tokens and add the new token to it
+            tokens.erase(tokens.begin());
           }
+          tokens.insert(node_ptr);
         }
       }
       parent_pos++;
     }
 
-    // Now add all tokens in the set to the token tree, in descending order of
-    // their compare value
+    // Now add all tokens in the set to the token tree
     spec_token_tree.add_layer();
-    for (auto token_it = tokens.crbegin(); token_it != tokens.crend();
+    for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
          token_it++) {
-      token_pool_full =
-          token_tree_node_pool.size() == get_max_tokens_per_batch();
-      if (token_pool_full and (*token_it) <= token_tree_node_pool.top().first) {
-        //   if (token_pool_full and
-        //       token_tree_node_pool.top().first->log_accumulated_prob >=
-        //           (*token_it)->log_accumulated_prob) {
-        break;
-      } else if (token_pool_full) {
-        token_tree_node_pool.top().first->pruned = true;
-        all_requests[token_tree_node_pool.top().second]
-            .speculative_token_trees[0]
-            .tree_size--;
-        token_tree_node_pool.pop();
-      }
-
-      token_tree_node_pool.push(std::make_pair((*token_it), guid));
       spec_token_tree.tree_layers.back().push_back((*token_it));
       spec_token_tree.tree_size++;
+      request.token_tree_nodes_pq.push((*token_it));
     }
+
+    assert(spec_token_tree.tree_size <=
+               get_max_tree_width() * get_max_tree_depth() + 1 &&
+           "The size of the token tree should not exceed the maximum size.");
   }
+}
 
-  bool all_request_last_layer_empty = true;
+void RequestManager::prune_token_tree() {
+  // Each reqeust has at least one token
+  int budget = get_max_tokens_per_batch() - num_available_requests;
+  assert(budget >= 0);
 
+  std::vector<std::pair<double, int>> spare_latency_2_request_index;
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
-      // Request in this slot is unavailable
       continue;
     }
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    TokenTree &spec_token_tree = request.speculative_token_trees[0];
+    double spare_latency =
+        request.get_slo_ratio() * baseline_latency_ms * request.tokens.size() -
+        request.decode_latency_ms;
+    assert(spare_latency >= 0.0);
+    spare_latency_2_request_index.push_back(
+        std::make_pair(spare_latency, request_index));
+  }
+
+  // Sort the requests by spare latency in ascending order
+  std::sort(spare_latency_2_request_index.begin(),
+            spare_latency_2_request_index.end(),
+            std::less<std::pair<double, int>>());
+
+  for (auto const &spare_latency_request_index_pair :
+       spare_latency_2_request_index) {
+    int request_index = spare_latency_request_index_pair.second;
+    RequestGuid guid = guid_of_requests[request_index];
+    add_tokens_toward_slo(guid, budget);
+  }
+
+  assert(budget >= 0);
+  if (budget > 0) {
+  }
+}
+
+void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
+  Request &request = all_requests[guid];
+  double num_tokens_to_decode = (ssm_spec_latency_ms + llm_verify_latency_ms) *
+                                correction_factor /
+                                (baseline_latency_ms * request.get_slo_ratio());
+
+  double current_added = 1.0;
+  // Include the root of every token tree
+  request.token_tree_nodes_pq.top()->included = true;
+  request.token_tree_nodes_pq.pop();
+
+  while (budget > 0 and current_added < num_tokens_to_decode) {
+    if (request.token_tree_nodes_pq.empty()) {
+      break;
+    }
+    auto node_ptr = request.token_tree_nodes_pq.top();
+    request.token_tree_nodes_pq.pop();
+    node_ptr->included = true;
+    current_added += node_ptr->log_accumulated_prob;
+    budget--;
+  }
+}
+
+void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
+  // This is a helper data structure to store help the pruning of the token
+  // trees across different requests.
+  std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
+      SharedTokenTreeNodePtrRequestGreater>
+      global_token_tree_node_pq;
 
-    if (spec_token_tree.tree_layers.size() <= current_ssm_step) {
-      // This request has no token added in this layer, skip it
+  // Initialie the priority queue with the top element in each request's token
+  // tree
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
       continue;
     }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    if (request.token_tree_nodes_pq.empty()) {
+      continue;
+    }
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
+      request.token_tree_nodes_pq.pop();
+    }
+  }
 
-    std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
-        spec_token_tree.tree_layers.back();
-    for (auto it = last_layer.begin(); it != last_layer.end();) {
-      if ((*it)->pruned) {
-        it = last_layer.erase(it);
-        // spec_token_tree.tree_size--;
-      } else {
-        ++it;
-      }
+  // Perform dequeue and enqueue until the budget is used up
+  while (budget > 0 and !global_token_tree_node_pq.empty()) {
+    auto [node_ptr, request] = global_token_tree_node_pq.top();
+    global_token_tree_node_pq.pop();
+    node_ptr->included = true;
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
+      request.token_tree_nodes_pq.pop();
     }
-    all_request_last_layer_empty &= last_layer.empty();
+    budget--;
+  }
 
-    if (last_layer.empty()) {
-      spec_token_tree.tree_layers.pop_back();
+  // Clear the priority queue in each requests
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
     }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                        std::vector<std::shared_ptr<TokenTreeNode>>,
+                        SharedTokenTreeNodePtrLess>()
+        .swap(request.token_tree_nodes_pq);
   }
-  assert(token_tree_node_pool.size() <= get_max_tokens_per_batch() &&
-         "The token tree node pool should not exceed the maximum size.");
-  return all_request_last_layer_empty;
 }
 
 std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
@@ -2563,7 +2632,7 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
     os << "Layer: " << layer_idx << std::endl;
     int token_pos = 0;
     for (auto const &node : layer) {
-      if (!node->pruned) {
+      if (node->included) {
         os << "token pos: " << token_pos << "\ttoken id: " << node->id
            << "\tparent pos: " << node->parent_pos
            << "\tlog prob: " << node->log_accumulated_prob << std::endl;

From 70a4c2d269c527c45876e61dc2ecb188379f1c37 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 29 Aug 2024 17:20:32 -0700
Subject: [PATCH 420/667] feat: kernel implementation for streaming cache usage

---
 include/flexflow/batch_config.h               |   2 +-
 .../inc_multihead_self_attention_kernels.h    |  14 +-
 .../inc_multihead_self_attention_kernels.cu   | 299 +++++++++++++++---
 src/runtime/batch_config.cc                   |   2 +-
 4 files changed, 277 insertions(+), 40 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index a5eecddaf..9be933bd3 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -45,7 +45,7 @@ class StreamingCacheInfo {
 
   StreamingCacheInfo &operator=(StreamingCacheInfo const &other);
 
-  void update_cache(int len);
+  void commit_cache(int len);
   void reset_cache();
 
 public:
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index d36f0c123..526fe3b0d 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -62,8 +62,18 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                      BatchConfig const *bc,
-                      cudaStream_t stream);
+                         BatchConfig const *bc,
+                         cudaStream_t stream);
+
+template <typename DT>
+void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m,
+                                  BatchConfig const *bc,
+                                  cudaStream_t stream);
+
+template <typename DT>
+void commit_kv(IncMultiHeadSelfAttentionMeta const *m,
+               BatchConfig const *bc,
+               cudaStream_t stream);
 
 template <typename DT>
 void produce_output(IncMultiHeadSelfAttentionMeta const *m,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 86a3ab3ff..0967803fb 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 #include "flexflow/batch_config.h"
+#include <cassert>
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
@@ -358,6 +359,7 @@ __global__ void
   }
 }
 
+// [For the tokens in batch]
 // Apply position embedding for qk.
 // Note that this is only used for tokens in the current batch.
 // For other Key tokens like in streaming cache, we nned other kernel to apply
@@ -390,19 +392,19 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void
-    update_qkv_in_batch_kernel(DT *devQKVProjArray,
-                            half *qTmp_ptr,
-                            half *kvCache_ptr,
-                            BatchConfig::PerTokenInfo const *tokenInfos,
-                            BatchConfig::PerRequestInfo *requestInfos,
-                            int const max_num_pages,
-                            int num_q_heads,
-                            int num_kv_heads,
-                            int head_dim,
-                            int num_new_tokens,
-                            bool streaming_cache,
-                            StreamingCacheInfo const *streaming_cache_infos) {
+__global__ void update_qkv_in_batch_kernel(
+    DT *qkv_proj_array,
+    half *qTmp_ptr,
+    half *kvCache_ptr,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    BatchConfig::PerRequestInfo const *requestInfos,
+    int const max_num_pages,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int num_new_tokens,
+    bool streaming_cache,
+    StreamingCacheInfo const *streaming_cache_infos) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
@@ -423,7 +425,7 @@ __global__ void
 
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
-      static_cast<half>(devQKVProjArray[from_idx + offset]);
+      static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
     size_t to_k_idx = get_k_entry_offset(
@@ -434,39 +436,241 @@ __global__ void
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
         offset / head_dim * stride * head_dim + offset % head_dim;
-    kvCache_ptr[to_k_idx + offset] = static_cast<half>(
-        devQKVProjArray[from_idx + q_hidden_size + kv_offset]);
+    kvCache_ptr[to_k_idx + offset] =
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
     kvCache_ptr[to_v_idx + offset] =
-        static_cast<half>(devQKVProjArray[from_idx + q_hidden_size +
-                                          temp_kv_hidden_size + kv_offset]);
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
+                                         temp_kv_hidden_size + kv_offset]);
   }
 }
 
+// [For the tokens in batch]
+// Update the kv cache, and compact the q array.
+// Source: qkv projeciton array of tokens in the batch.
+// Destination: q&kv ptr took by the attention kernel.
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                      BatchConfig const *bc,
-                      cudaStream_t stream) {
-  // update the kv cache, compact the q array
+                         BatchConfig const *bc,
+                         cudaStream_t stream) {
   int num_new_tokens = bc->num_active_tokens();
   int parallelism = m->local_hidden_size * num_new_tokens;
   int const max_num_pages =
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
   update_qkv_in_batch_kernel<<<GET_BLOCKS(parallelism),
-                            min(CUDA_NUM_THREADS, parallelism),
-                            0,
-                            stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                      static_cast<half *>(m->queryTmp),
-                                      static_cast<half *>(m->kvCache),
-                                      m->token_infos,
-                                      m->request_infos,
-                                      max_num_pages,
-                                      m->num_q_heads,
-                                      m->num_kv_heads,
-                                      m->qk_dim,
-                                      num_new_tokens,
-                                      m->streaming_cache,
-                                      m->streaming_cache_infos);
+                               min(CUDA_NUM_THREADS, parallelism),
+                               0,
+                               stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                         static_cast<half *>(m->queryTmp),
+                                         static_cast<half *>(m->kvCache),
+                                         m->token_infos,
+                                         m->request_infos,
+                                         max_num_pages,
+                                         m->num_q_heads,
+                                         m->num_kv_heads,
+                                         m->qk_dim,
+                                         num_new_tokens,
+                                         m->streaming_cache,
+                                         m->streaming_cache_infos);
+}
+
+__global__ void update_kv_in_streaming_cache_kernel(
+    half *pre_pos_enc_buf,
+    half *kv_cache,
+    BatchConfig::PerRequestInfo const *requestInfos,
+    bool const *request_available,
+    int const max_num_pages_pre_pos_enc_buf,
+    int const max_num_pages_kv_cache,
+    int num_kv_heads,
+    int head_dim,
+    StreamingCacheInfo const *streaming_cache_infos,
+    uint32_t const max_num_requests) {
+  int const kv_hidden_size = num_kv_heads * head_dim;
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int token_idx = thread_idx / kv_hidden_size;
+  int const offset = thread_idx % kv_hidden_size;
+  int request_idx = 0;
+  while (token_idx >= 0 && request_idx < max_num_requests) {
+    if (request_available[request_idx]) {
+      token_idx -= streaming_cache_infos[request_idx].commit_len;
+    }
+    request_idx++;
+  }
+  if (token_idx >= 0) {
+    return;
+  }
+  request_idx--;
+  token_idx += streaming_cache_infos[request_idx].commit_len;
+
+  size_t from_k_idx = get_k_entry_offset(request_idx,
+                                         token_idx,
+                                         max_num_pages_pre_pos_enc_buf,
+                                         num_kv_heads,
+                                         head_dim),
+         from_v_idx = get_v_entry_offset(request_idx,
+                                         token_idx,
+                                         max_num_pages_pre_pos_enc_buf,
+                                         num_kv_heads,
+                                         head_dim);
+
+  // to_idx should consider the rolling property of the window cache
+  int to_idx = token_idx;
+  StreamingCacheInfo const &info = streaming_cache_infos[request_idx];
+  if (to_idx >= info.sink_cache_size &&
+      info.commit_len < info.sink_cache_size + info.window_cache_size) {
+    to_idx -= info.sink_cache_size;
+    to_idx = (to_idx + info.window_cache_size - info.window_back) %
+             info.window_cache_size;
+    to_idx += info.sink_cache_size;
+  }
+
+  size_t to_k_idx = get_k_entry_offset(request_idx,
+                                       to_idx,
+                                       max_num_pages_kv_cache,
+                                       num_kv_heads,
+                                       head_dim),
+         to_v_idx = get_v_entry_offset(request_idx,
+                                       to_idx,
+                                       max_num_pages_kv_cache,
+                                       num_kv_heads,
+                                       head_dim);
+
+  kv_cache[to_k_idx + offset] = pre_pos_enc_buf[from_k_idx + offset];
+  kv_cache[to_v_idx + offset] = pre_pos_enc_buf[from_v_idx + offset];
+}
+
+// [For the tokens in streaming cache]
+// Convert the out-of-order cache to in-order relative position.
+// Source: pre-pos-encoding kv values in the streaming cache.
+// Destination: kv ptr took by the attention kernel.
+template <typename DT>
+void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m,
+                                  BatchConfig const *bc,
+                                  cudaStream_t stream) {
+  assert(m->streaming_cache);
+  int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
+  int num_tokens = 0;
+  for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch();
+       req_idx++) {
+    if (!bc->request_available[req_idx]) {
+      continue;
+    }
+    num_tokens += bc->streamingCacheInfo[req_idx].commit_len;
+  }
+  int parallelism = kv_hidden_size * num_tokens;
+  int const max_num_pages_pre_pos_enc_buf = round_up_pages(
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+  int const max_num_pages_kv_cache = round_up_pages(
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
+      BatchConfig::max_spec_tree_token_num());
+
+  update_kv_in_streaming_cache_kernel<<<GET_BLOCKS(parallelism),
+                                        min(CUDA_NUM_THREADS, parallelism),
+                                        0,
+                                        stream>>>(
+      static_cast<half *>(m->streamingPrePosEncBuf),
+      static_cast<half *>(m->kvCache),
+      m->request_infos,
+      m->request_available,
+      max_num_pages_pre_pos_enc_buf,
+      max_num_pages_kv_cache,
+      m->num_kv_heads,
+      m->qk_dim,
+      m->streaming_cache_infos,
+      bc->max_requests_per_batch());
+}
+
+template <typename DT>
+__global__ void
+    commit_kv_kernel(DT const *qkv_proj_array,
+                     half *pre_pos_enc_buf,
+                     BatchConfig::PerTokenInfo const *tokenInfos,
+                     BatchConfig::PerRequestInfo const *requestInfos,
+                     int const max_num_pages,
+                     int num_q_heads,
+                     int num_kv_heads,
+                     int head_dim,
+                     StreamingCacheInfo const *streaming_cache_infos,
+                     int num_new_tokens) {
+  int const q_hidden_size = num_q_heads * head_dim;
+  int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
+  int const kv_hidden_size = num_kv_heads * head_dim;
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / kv_hidden_size;
+  int const offset = thread_idx % kv_hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+  int const request_idx = tokenInfos[token_idx].request_index;
+
+  StreamingCacheInfo const &info = streaming_cache_infos[request_idx];
+  int to_idx = tokenInfos[token_idx].abs_index_in_request -
+               requestInfos[request_idx].first_token_index_in_request +
+               info.commit_len;
+  // cases that get over the boundary:
+  // 1. commit_len < sink_cache_size: commit to sink, window, window_back is
+  // after commit_len.
+  // 2. sink_cache_size <= commit_len < sink_cache_size + window_cache_size:
+  // commit to window, window_back + sink_cache_size = commit_len, pointing to
+  // the same position.
+  // 3. commit_len >= sink_cache_size + window_cache_size: commit to window,
+  // window is full before this commit, window_back is pointing to the real
+  // position.
+  if (to_idx >= info.sink_cache_size + info.window_cache_size) {
+    to_idx = tokenInfos[token_idx].abs_index_in_request -
+             requestInfos[request_idx].first_token_index_in_request +
+             info.window_back;
+    if (info.commit_len < info.sink_cache_size) {
+      // For case 1, compensating for sink offset, because window_back is
+      // someway back from commit_len.
+      to_idx -= info.sink_cache_size - info.commit_len;
+    }
+    to_idx = info.sink_cache_size + to_idx % info.window_cache_size;
+  }
+  // TODO: For now don't consider the case that the commit tokens roll over the
+  // for more than once. In this case, we should only count the last tokens in
+  // the same window position.
+
+  size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
+  size_t to_k_idx = get_k_entry_offset(
+             request_idx, to_idx, max_num_pages, num_kv_heads, head_dim),
+         to_v_idx = get_v_entry_offset(
+             request_idx, to_idx, max_num_pages, num_kv_heads, head_dim);
+
+  pre_pos_enc_buf[to_k_idx + offset] =
+      static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + offset]);
+  pre_pos_enc_buf[to_v_idx + offset] = static_cast<half>(
+      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + offset]);
+}
+
+// [For the tokens in batch]
+// Commit the kv values to the streaming cache.
+// Source: qkv projeciton array of tokens in the batch.
+// Destination: pre-pos-encoding kv values in the streaming cache.
+template <typename DT>
+void commit_kv(IncMultiHeadSelfAttentionMeta const *m,
+               BatchConfig const *bc,
+               cudaStream_t stream) {
+  assert(m->streaming_cache);
+  int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
+  int const num_new_tokens = bc->num_active_tokens();
+  int parallelism = kv_hidden_size * num_new_tokens;
+  int const max_num_pages = round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                                           BatchConfig::get_max_tree_depth());
+
+  commit_kv_kernel<<<GET_BLOCKS(parallelism),
+                     min(CUDA_NUM_THREADS, parallelism),
+                     0,
+                     stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                               static_cast<half *>(m->streamingPrePosEncBuf),
+                               m->token_infos,
+                               m->request_infos,
+                               max_num_pages,
+                               m->num_q_heads,
+                               m->num_kv_heads,
+                               m->qk_dim,
+                               m->streaming_cache_infos,
+                               num_new_tokens);
 }
 
 template <typename DT>
@@ -503,7 +707,8 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
 #if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
+  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best
+  // performance
   cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
 #else
   cudaDataType_t compute_type = cublas_data_type;
@@ -682,6 +887,28 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     BatchConfig const *bc,
     cudaStream_t stream);
 
+template void
+    Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache<half>(
+        IncMultiHeadSelfAttentionMeta const *m,
+        BatchConfig const *bc,
+        cudaStream_t stream);
+
+template void
+    Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache<float>(
+        IncMultiHeadSelfAttentionMeta const *m,
+        BatchConfig const *bc,
+        cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::commit_kv<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::commit_kv<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
 template void Kernels::IncMultiHeadAttention::produce_output<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index ea07902bf..0dabefce4 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -280,7 +280,7 @@ StreamingCacheInfo &
 // For draft model, we only update the cache when prefill or
 // commit the verified result from target model;
 // For incremental decoding, we update the cache both in prefill and decoding
-void StreamingCacheInfo::update_cache(int len) {
+void StreamingCacheInfo::commit_cache(int len) {
   commit_len += len;
   if (commit_len <= sink_cache_size + window_cache_size) {
     window_back = std::max(0, commit_len - sink_cache_size);

From 686bdae54ba9a4e3f8381b6a31fa1320ef4d9ed9 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 30 Aug 2024 00:09:06 -0400
Subject: [PATCH 421/667] Removed an unused variable.

---
 include/flexflow/request_manager.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 95eee576b..0cbb1953a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -110,20 +110,13 @@ struct SharedTokenTreeNodePtrGreater {
 class TokenTree {
 public:
   std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
-  // The numebr of tokens in the tree that are not pruned
-  int tree_size = 0;
-  // The numebr of tokens in the tree including the pruned ones
-
   void add_layer() {
     tree_layers.emplace_back();
   }
 
   void clear() {
     tree_layers.clear();
-    tree_size = 0;
   }
-
-  TokenTree() : tree_size(0) {}
 };
 
 std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree);

From 7e10e1d155f836eca8aa29f018d85373c573a633 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Fri, 30 Aug 2024 15:57:43 -0400
Subject: [PATCH 422/667] Removed unused variable and added tree pruning.

---
 src/runtime/request_manager.cc | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 19700769d..c6c1b6cf4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -773,6 +773,14 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
+              << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
     prefill_completed = true;
 
     if (prefill_request->tokens.back() == eos_token_id) {
@@ -1327,7 +1335,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       }
       layer_index++;
     }
-    assert(token_tree_index == token_tree.tree_size);
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
@@ -1484,6 +1491,8 @@ bool RequestManager::update_ssm_inference_results(
 
   // Stop conditions
   if (current_ssm_step == get_max_tree_depth()) {
+    // Prune the token tree at the last step
+    prune_token_tree();
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
         (Realm::Clock::current_time_in_microseconds() -
@@ -1624,9 +1633,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
     parent_pos_2_abs_index.swap(current_layer_abs_index);
   }
 
-  // A sanity check
-  assert(abs_index_in_tree == token_tree.tree_size);
-
   // Maintain other fields of llm_bitmask
   llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size;
   // We don't need to set llm_bitmask.current_layer_size and
@@ -2365,7 +2371,6 @@ void RequestManager::add_root_to_spec_token_tree(
     node_ptr->gumbel = true;
   }
   speculative_token_tree.tree_layers.front().push_back(node_ptr);
-  speculative_token_tree.tree_size++;
   request.token_tree_nodes_pq.push(node_ptr);
 }
 
@@ -2495,13 +2500,8 @@ void RequestManager::add_tokens_to_spec_token_tree(
     for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
          token_it++) {
       spec_token_tree.tree_layers.back().push_back((*token_it));
-      spec_token_tree.tree_size++;
       request.token_tree_nodes_pq.push((*token_it));
     }
-
-    assert(spec_token_tree.tree_size <=
-               get_max_tree_width() * get_max_tree_depth() + 1 &&
-           "The size of the token tree should not exceed the maximum size.");
   }
 }
 

From 8face9cd4a35a38ec1c6f3aa7f5ca658aa44b106 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 30 Aug 2024 17:34:10 -0700
Subject: [PATCH 423/667] feat: implement position encoding for streaming cache

---
 .../inc_multihead_self_attention_kernels.h    |   6 +
 .../inc_multihead_self_attention_kernels.cu   | 113 +++++++++++++++---
 2 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 526fe3b0d..4ea246bd6 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -60,6 +60,12 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                         DT *output_ptr,
                         cudaStream_t stream);
 
+template <typename DT>
+void apply_pos_encoding_to_streaming_proj(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 0967803fb..3a8def47c 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -299,7 +299,6 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 template <typename DT>
 __global__ void
     apply_pos_encoding_kernel(DT *input_ptr,
-                              cuFloatComplex *complex_input,
                               BatchConfig::PerRequestInfo const *requestInfos,
                               BatchConfig::PerTokenInfo const *tokenInfos,
                               int qk_dim,
@@ -323,8 +322,6 @@ __global__ void
                           hidden_size * (q_tensor ? 0 : 1);
     int complex_part_index = real_part_index + (proj_size / 2);
 
-    // complex_input[i] = {input_ptr[real_part_index],
-    //                     input_ptr[complex_part_index]};
     cuFloatComplex cii = {input_ptr[real_part_index],
                           input_ptr[complex_part_index]};
 
@@ -334,7 +331,6 @@ __global__ void
 
     // get position of token
 
-    // size_t pos = id_map[token_idx].token_position;
     size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
     // relative position should be calculated based on current streaming size
@@ -344,15 +340,9 @@ __global__ void
              requestInfos[req_idx].first_token_index_in_request;
     }
 
-    // float before_real = complex_input[i].x, before_complex =
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
+    float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size));
     cuFloatComplex complex_pos = {cos(freq), sin(freq)};
 
-    // complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    // input_ptr[real_part_index] = complex_input[i].x;
-    // input_ptr[complex_part_index] = complex_input[i].y;
-
     cii = cuCmulf(cii, complex_pos);
     input_ptr[real_part_index] = cii.x;
     input_ptr[complex_part_index] = cii.y;
@@ -380,7 +370,6 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                               min(CUDA_NUM_THREADS, parallelism),
                               0,
                               stream>>>(output_ptr,
-                                        m->complex_input,
                                         m->request_infos,
                                         m->token_infos,
                                         m->qk_dim,
@@ -391,6 +380,90 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                                         m->streaming_cache_infos);
 }
 
+__global__ void apply_pos_encoding_to_streaming_proj_kernel(
+    half *kv_cache,
+    BatchConfig::PerRequestInfo const *requestInfos,
+    bool const *request_available,
+    int const max_num_pages,
+    int num_kv_heads,
+    int head_dim,
+    StreamingCacheInfo const *streaming_cache_infos,
+    uint32_t const max_num_requests) {
+  int const kv_hidden_size = num_kv_heads * head_dim;
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int token_idx = thread_idx / (kv_hidden_size / 2);
+  // Each complex is consist of (i, i + head_dim / 2) wuthin the same head.
+  int const head_idx = (thread_idx % (kv_hidden_size / 2)) / (head_dim / 2);
+  int const offset_in_head = thread_idx % (head_dim / 2);
+  // Get the corresponding request index and token index in the request.
+  int request_idx = 0;
+  while (token_idx >= 0 && request_idx < max_num_requests) {
+    if (request_available[request_idx]) {
+      token_idx -= streaming_cache_infos[request_idx].commit_len;
+    }
+    request_idx++;
+  }
+  if (token_idx >= 0) {
+    return;
+  }
+  request_idx--;
+  token_idx += streaming_cache_infos[request_idx].commit_len;
+
+  // Get the real and complex part index for the current complex.
+  int const real_part_idx =
+      get_k_entry_offset(
+          request_idx, token_idx, max_num_pages, num_kv_heads, head_dim) +
+      head_idx * head_dim + offset_in_head;
+  int const complex_part_idx = real_part_idx + head_dim / 2;
+
+  // Apply the rotary position encoding.
+  cuFloatComplex cii = {kv_cache[real_part_idx], kv_cache[complex_part_idx]};
+  size_t pos = token_idx;
+  float freq = pos * (1.0 / pow(10000.0, (float)2 * offset_in_head / head_dim));
+  cuFloatComplex complex_pos = {cos(freq), sin(freq)};
+  cii = cuCmulf(cii, complex_pos);
+  kv_cache[real_part_idx] = cii.x;
+  kv_cache[complex_part_idx] = cii.y;
+}
+
+// [For the tokens in streaming cache]
+// Apply position embedding for k projection in the streaming cache.
+// Note that before the position encoding, the projection is moved *in order* to
+// the kv memory took by the attention kernel. So our operation is applied where
+// kvCache points to.
+template <typename DT>
+void apply_pos_encoding_to_streaming_proj(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream) {
+  assert(m->streaming_cache);
+  int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
+  int num_tokens = 0;
+  for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch();
+       req_idx++) {
+    if (!bc->request_available[req_idx]) {
+      continue;
+    }
+    num_tokens += bc->streamingCacheInfo[req_idx].commit_len;
+  }
+  int parallelism = num_tokens * kv_hidden_size / 2;
+  int const max_num_pages = round_up_pages(
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
+      BatchConfig::max_spec_tree_token_num());
+  apply_pos_encoding_to_streaming_proj_kernel<<<GET_BLOCKS(parallelism),
+                                               min(CUDA_NUM_THREADS, parallelism),
+                                                0,
+                                                stream>>>(
+      static_cast<half *>(m->kvCache),
+      m->request_infos,
+      m->request_available,
+      max_num_pages,
+      m->num_kv_heads,
+      m->qk_dim,
+      m->streaming_cache_infos,
+      bc->max_requests_per_batch());
+}
+
 template <typename DT>
 __global__ void update_qkv_in_batch_kernel(
     DT *qkv_proj_array,
@@ -516,8 +589,8 @@ __global__ void update_kv_in_streaming_cache_kernel(
   // to_idx should consider the rolling property of the window cache
   int to_idx = token_idx;
   StreamingCacheInfo const &info = streaming_cache_infos[request_idx];
-  if (to_idx >= info.sink_cache_size &&
-      info.commit_len < info.sink_cache_size + info.window_cache_size) {
+  if (info.commit_len >= info.sink_cache_size + info.window_cache_size &&
+      to_idx >= info.sink_cache_size) {
     to_idx -= info.sink_cache_size;
     to_idx = (to_idx + info.window_cache_size - info.window_back) %
              info.window_cache_size;
@@ -877,6 +950,18 @@ template void Kernels::IncMultiHeadAttention::apply_pos_encoding<half>(
     half *output_ptr,
     cudaStream_t stream);
 
+template void
+    Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj<float>(
+        IncMultiHeadSelfAttentionMeta const *m,
+        BatchConfig const *bc,
+        cudaStream_t stream);
+
+template void
+    Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj<half>(
+        IncMultiHeadSelfAttentionMeta const *m,
+        BatchConfig const *bc,
+        cudaStream_t stream);
+
 template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,

From e94598fa77a2e42e99e8f9cc4d731b859344d66d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 30 Aug 2024 22:10:48 -0700
Subject: [PATCH 424/667] fix: params should add (de)serialization method

---
 inference/incr_decoding/incr_decoding.cc | 12 ++++++------
 inference/spec_infer/spec_infer.cc       | 12 ++++++------
 src/runtime/graph.cc                     | 10 ++++++++--
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 09663a22b..83f2ba632 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -49,7 +49,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_batch,
                       int &max_sequence_length,
                       int &sampling_seed,
-                      bool &enable_streaming_cache) {
+                      bool &streaming_cache) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -112,7 +112,7 @@ void parse_input_args(char **argv,
       continue;
     }
     if (!strcmp(argv[i], "--enable-streaming-cache")) {
-      enable_streaming_cache = true;
+      streaming_cache = true;
       continue;
     }
   }
@@ -149,7 +149,7 @@ void FlexFlow::top_level_task(Task const *task,
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
-  bool enable_streaming_cache = false;
+  bool streaming_cache = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -167,7 +167,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_batch,
                    max_sequence_length,
                    sampling_seed,
-                   enable_streaming_cache);
+                   streaming_cache);
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -233,7 +233,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tree_depth(8);
   rm->set_max_tree_width(16);
   rm->set_verbose(verbose);
-  rm->set_streaming_cache(enable_streaming_cache);
+  rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_id, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
@@ -245,7 +245,7 @@ void FlexFlow::top_level_task(Task const *task,
                               weights_filepath,
                               INC_DECODING_MODE,
                               generationConfig,
-                              enable_streaming_cache,
+                              streaming_cache,
                               use_full_precision);
   } else if (model_type == ModelType::OPT) {
     OPT::create_opt_model(model,
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 5bdc0cd12..cc48d9c86 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -70,7 +70,7 @@ void parse_input_args(char **argv,
                       bool &spec_sampling,
                       bool &do_sample,
                       int &sampling_seed,
-                      bool &enable_streaming_cache) {
+                      bool &streaming_cache) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -155,7 +155,7 @@ void parse_input_args(char **argv,
       continue;
     }
     if (!strcmp(argv[i], "--enable-streaming-cache")) {
-      enable_streaming_cache = true;
+      streaming_cache = true;
       continue;
     }
   }
@@ -322,7 +322,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool spec_sampling = false;
   bool do_sample = false;
   int sampling_seed = 0;
-  bool enable_streaming_cache = false;
+  bool streaming_cache = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -343,7 +343,7 @@ void FlexFlow::top_level_task(Task const *task,
                    spec_sampling,
                    do_sample,
                    sampling_seed,
-                   enable_streaming_cache);
+                   streaming_cache);
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -363,7 +363,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
-  rm->set_streaming_cache(enable_streaming_cache);
+  rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
                          model_metadata.eos_token_id,
@@ -427,7 +427,7 @@ void FlexFlow::top_level_task(Task const *task,
                                 model_metadata.ssm_model_weights_paths[ssm_id],
                                 TREE_SEARCH_MODE,
                                 generationConfig,
-                                enable_streaming_cache,
+                                streaming_cache,
                                 use_full_precision);
     } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) {
       OPT::create_opt_model(beam_model,
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 8cae8e059..ab6421d58 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2342,6 +2342,7 @@ GraphOptimalViewSerialized
         sez.serialize(attn->position_bias);
         sez.serialize(attn->quantization_type);
         sez.serialize(attn->offload);
+        sez.serialize(attn->streaming_cache);
         sez.serialize(attn->num_kv_heads);
         sez.serialize(attn->tensor_parallelism_degree);
         sez.serialize(strlen(attn->name));
@@ -2367,6 +2368,7 @@ GraphOptimalViewSerialized
         sez.serialize(attn->scaling_factor);
         sez.serialize(attn->qk_prod_scaling);
         sez.serialize(attn->position_bias);
+        sez.serialize(attn->streaming_cache);
         sez.serialize(attn->num_kv_heads);
         sez.serialize(strlen(attn->name));
         sez.serialize(attn->name, strlen(attn->name));
@@ -2807,7 +2809,7 @@ void FFModel::deserialize_graph_optimal_view(
             tensor_parallelism_degree;
         float dropout, scaling_factor;
         bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-            scaling_query, qk_prod_scaling, offload, position_bias;
+            scaling_query, qk_prod_scaling, offload, streaming_cache, position_bias;
         DataType quantization_type;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);
@@ -2829,6 +2831,7 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(position_bias);
         dez.deserialize(quantization_type);
         dez.deserialize(offload);
+        dez.deserialize(streaming_cache);
         dez.deserialize(num_kv_heads);
         dez.deserialize(tensor_parallelism_degree);
         size_t name_len;
@@ -2853,6 +2856,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.position_bias = position_bias;
         params.quantization_type = quantization_type;
         params.offload = offload;
+        params.streaming_cache = streaming_cache;
         params.num_kv_heads = num_kv_heads;
         params.tensor_parallelism_degree = tensor_parallelism_degree;
         strcpy(params.name, name);
@@ -2864,7 +2868,7 @@ void FFModel::deserialize_graph_optimal_view(
         int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads;
         float dropout, scaling_factor;
         bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-            scaling_query, qk_prod_scaling, position_bias;
+            scaling_query, qk_prod_scaling, position_bias, streaming_cache;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);
         dez.deserialize(transformer_layer_id);
@@ -2883,6 +2887,7 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(scaling_factor);
         dez.deserialize(qk_prod_scaling);
         dez.deserialize(position_bias);
+        dez.deserialize(streaming_cache);
         dez.deserialize(num_kv_heads);
         size_t name_len;
         char name[MAX_OPNAME] = {0};
@@ -2904,6 +2909,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.scaling_factor = scaling_factor;
         params.qk_prod_scaling = qk_prod_scaling;
         params.position_bias = position_bias;
+        params.streaming_cache = streaming_cache;
         params.num_kv_heads = num_kv_heads;
         strcpy(params.name, name);
         node = get_or_create_node<SpecIncMultiHeadSelfAttention>(inputs[0],

From f0d56ece05e72c9848388a9b4dd002864892ff8a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 31 Aug 2024 16:26:31 -0700
Subject: [PATCH 425/667] chore: reduce kv cache size

---
 src/ops/inc_multihead_self_attention.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 119f588c8..6d7528dc3 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -520,10 +520,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         query_tmp_size =
             num_q_heads * qk_dim * BatchConfig::max_tokens_per_batch();
         // a K-ary tree max node is (k^n - 1) / 2
-        key_cache_size = num_q_heads * qk_dim *
+        key_cache_size = num_kv_heads * qk_dim *
                          BatchConfig::max_requests_per_batch() * max_num_pages *
                          kPagesize;
-        value_cache_size = num_q_heads * v_dim *
+        value_cache_size = num_kv_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
         if (streaming_cache) {

From 419e0f84b9c12e6e015318af99044936f9bab09b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 31 Aug 2024 21:58:10 -0700
Subject: [PATCH 426/667] chore: minor

---
 src/runtime/request_manager.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a576ec43a..b131c2765 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -735,7 +735,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
   prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
-    prefill_request->streaming_cache_info.update_cache(
+    prefill_request->streaming_cache_info.commit_cache(
         prefill_request->num_tokens_in_batch);
   }
 
@@ -783,7 +783,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     assert(request.status == Request::RUNNING);
     request.llm_cache_size++;
     if (streaming_cache) {
-      request.streaming_cache_info.update_cache(1);
+      request.streaming_cache_info.commit_cache(1);
     }
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
@@ -818,7 +818,7 @@ void RequestManager::update_ssm_prefill_results(
   // There's no results to update, but we should update ssm_cache_size.
   prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
   if (streaming_cache) {
-    prefill_request->streaming_cache_info.update_cache(
+    prefill_request->streaming_cache_info.commit_cache(
         prefill_request->num_tokens_in_batch);
   }
 
@@ -1470,7 +1470,7 @@ bool RequestManager::update_ssm_inference_results(
     if (current_ssm_step == 1) {
       request.ssm_cache_size = request.tokens.size();
       if (streaming_cache) {
-        request.streaming_cache_info.update_cache(request.num_tokens_in_batch);
+        request.streaming_cache_info.commit_cache(request.num_tokens_in_batch);
       }
     }
 

From fb312612eaf7b9f7e1eb09014d1364193717df84 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 31 Aug 2024 22:00:03 -0700
Subject: [PATCH 427/667] fix: output misalignment

---
 include/flexflow/config.h                    |  7 +-
 src/ops/inc_multihead_self_attention.cu      |  3 +-
 src/ops/tree_inc_multihead_self_attention.cu |  4 +-
 src/runtime/request_manager.cu               | 89 ++++++++------------
 4 files changed, 43 insertions(+), 60 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index be23d8d5e..86a1bc486 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -85,10 +85,9 @@ struct FFHandler {
 
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-      sizeof(BatchConfig::request_available) +
-      sizeof(BatchConfig::streamingCacheInfo) +
-      sizeof(BatchConfig::causalMask) + sizeof(BatchConfig::committed_tokens) +
-      sizeof(int);
+      sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
+      sizeof(BatchConfig::committed_tokens) +
+      sizeof(BatchConfig::streamingCacheInfo) + sizeof(int);
 
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 6d7528dc3..81f377772 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -625,7 +625,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     streaming_cache_infos = reinterpret_cast<StreamingCacheInfo *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available));
+        sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask)) +
+        sizeof(BatchConfig::committed_tokens);
 
     if (offload) {
       // token_infos =
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index b1a420b2c..c9b74240f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -638,11 +638,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
             sizeof(BatchConfig::request_available) +
-            sizeof(BatchConfig::streamingCacheInfo) +
             sizeof(BatchConfig::causalMask));
     num_tokens_to_commit = reinterpret_cast<int *>(
         reinterpret_cast<char *>(committed_token_infos) +
-        sizeof(BatchConfig::committed_tokens));
+        sizeof(BatchConfig::committed_tokens) +
+        sizeof(BatchConfig::streamingCacheInfo));
   }
 
   cudaStreamSynchronize(stream);
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index c98d54727..f3d894278 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -231,6 +231,32 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::request_available);
 
+  for (int request_idx = 0;
+        request_idx < BatchConfig::max_requests_per_batch();
+        request_idx++) {
+    if (batch_config->request_available[request_idx]) {
+      checkCUDA(cudaMemcpyAsync(
+          static_cast<char *>(handle.batch_config_metadata) +
+              total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+          &(batch_config->causalMask[request_idx]),
+          sizeof(BatchConfig::BitMask),
+          cudaMemcpyHostToDevice,
+          stream));
+    }
+  }
+  total_copy_size += sizeof(BatchConfig::causalMask);
+
+  if (batch_config->num_tokens_to_commit > 0) {
+    checkCUDA(cudaMemcpyAsync(
+        static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+        &(batch_config->committed_tokens),
+        batch_config->num_tokens_to_commit *
+            sizeof(BatchConfig::CommittedTokensInfo),
+        cudaMemcpyHostToDevice,
+        stream));
+  }
+  total_copy_size += sizeof(BatchConfig::committed_tokens);
+
   checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
                                 total_copy_size,
                             &(batch_config->streamingCacheInfo),
@@ -239,6 +265,14 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::streamingCacheInfo);
 
+  checkCUDA(cudaMemcpyAsync(
+      static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+      &(batch_config->num_tokens_to_commit),
+      sizeof(int),
+      cudaMemcpyHostToDevice,
+      stream));
+  total_copy_size += sizeof(int);
+
   // load attention metadata
   if (batch_config->get_mode() == INC_DECODING_MODE) {
     if (handle.incr_attention_metadata->enabled()) {
@@ -366,21 +400,6 @@ void RequestManager::load_batch_config_task(
     }
   } else if (batch_config->get_mode() == TREE_SEARCH_MODE) {
     if (handle.tree_search_attention_metadata->enabled()) {
-      for (int request_idx = 0;
-           request_idx < BatchConfig::max_requests_per_batch();
-           request_idx++) {
-        if (batch_config->request_available[request_idx]) {
-          checkCUDA(cudaMemcpyAsync(
-              static_cast<char *>(handle.batch_config_metadata) +
-                  total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
-              &(batch_config->causalMask[request_idx]),
-              sizeof(BatchConfig::BitMask),
-              cudaMemcpyHostToDevice,
-              stream));
-        }
-      }
-      total_copy_size += sizeof(BatchConfig::causalMask);
-
       // calculate the attention meta data
       {
         BatchConfig::PerRequestInfo *request_infos =
@@ -396,8 +415,7 @@ void RequestManager::load_batch_config_task(
                 static_cast<char *>(handle.batch_config_metadata) +
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
-                sizeof(BatchConfig::request_available)) +
-                sizeof(BatchConfig::streamingCacheInfo);
+                sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +
@@ -513,40 +531,6 @@ void RequestManager::load_batch_config_task(
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     if (handle.tree_verify_attention_metadata->enabled()) {
-      for (int request_idx = 0;
-           request_idx < BatchConfig::max_requests_per_batch();
-           request_idx++) {
-        if (batch_config->request_available[request_idx]) {
-          checkCUDA(cudaMemcpyAsync(
-              static_cast<char *>(handle.batch_config_metadata) +
-                  total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
-              &(batch_config->causalMask[request_idx]),
-              sizeof(BatchConfig::BitMask),
-              cudaMemcpyHostToDevice,
-              stream));
-        }
-      }
-      total_copy_size += sizeof(BatchConfig::causalMask);
-
-      if (batch_config->num_tokens_to_commit > 0) {
-        checkCUDA(cudaMemcpyAsync(
-            static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-            &(batch_config->committed_tokens),
-            batch_config->num_tokens_to_commit *
-                sizeof(BatchConfig::CommittedTokensInfo),
-            cudaMemcpyHostToDevice,
-            stream));
-      }
-      total_copy_size += sizeof(BatchConfig::committed_tokens);
-
-      checkCUDA(cudaMemcpyAsync(
-          static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-          &(batch_config->num_tokens_to_commit),
-          sizeof(int),
-          cudaMemcpyHostToDevice,
-          stream));
-      total_copy_size += sizeof(int);
-
       // calculate the attention meta data
       {
         BatchConfig::PerRequestInfo *request_infos =
@@ -562,8 +546,7 @@ void RequestManager::load_batch_config_task(
                 static_cast<char *>(handle.batch_config_metadata) +
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
-                sizeof(BatchConfig::request_available)) +
-                sizeof(BatchConfig::streamingCacheInfo);
+                sizeof(BatchConfig::request_available));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +

From fdf7b86598dc0c528b371fe50c1d5eed44c8e09d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 31 Aug 2024 23:30:47 -0700
Subject: [PATCH 428/667] chore: minor

---
 include/flexflow/config.h                    |  4 ++--
 src/ops/inc_multihead_self_attention.cu      |  3 +--
 src/ops/tree_inc_multihead_self_attention.cu |  6 +++---
 src/runtime/request_manager.cu               | 16 ++++++++--------
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 86a1bc486..48b0450b6 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -86,8 +86,8 @@ struct FFHandler {
   size_t batch_config_metadata_size =
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
       sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
-      sizeof(BatchConfig::committed_tokens) +
-      sizeof(BatchConfig::streamingCacheInfo) + sizeof(int);
+      sizeof(BatchConfig::streamingCacheInfo) +
+      sizeof(BatchConfig::committed_tokens) + sizeof(int);
 
   void *offload_reserve_space;
   size_t offload_reserve_space_size;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 81f377772..4cb860ddb 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -625,8 +625,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     streaming_cache_infos = reinterpret_cast<StreamingCacheInfo *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask)) +
-        sizeof(BatchConfig::committed_tokens);
+        sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask));
 
     if (offload) {
       // token_infos =
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index c9b74240f..5cf58b543 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -638,11 +638,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
             sizeof(BatchConfig::tokensInfo) +
             sizeof(BatchConfig::requestsInfo) +
             sizeof(BatchConfig::request_available) +
-            sizeof(BatchConfig::causalMask));
+            sizeof(BatchConfig::causalMask) +
+            sizeof(BatchConfig::streamingCacheInfo));
     num_tokens_to_commit = reinterpret_cast<int *>(
         reinterpret_cast<char *>(committed_token_infos) +
-        sizeof(BatchConfig::committed_tokens) +
-        sizeof(BatchConfig::streamingCacheInfo));
+        sizeof(BatchConfig::committed_tokens));
   }
 
   cudaStreamSynchronize(stream);
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index f3d894278..cd66d5bfb 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -246,6 +246,14 @@ void RequestManager::load_batch_config_task(
   }
   total_copy_size += sizeof(BatchConfig::causalMask);
 
+  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
+                                total_copy_size,
+                            &(batch_config->streamingCacheInfo),
+                            sizeof(BatchConfig::streamingCacheInfo),
+                            cudaMemcpyHostToDevice,
+                            stream));
+  total_copy_size += sizeof(BatchConfig::streamingCacheInfo);
+
   if (batch_config->num_tokens_to_commit > 0) {
     checkCUDA(cudaMemcpyAsync(
         static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
@@ -257,14 +265,6 @@ void RequestManager::load_batch_config_task(
   }
   total_copy_size += sizeof(BatchConfig::committed_tokens);
 
-  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
-                                total_copy_size,
-                            &(batch_config->streamingCacheInfo),
-                            sizeof(BatchConfig::streamingCacheInfo),
-                            cudaMemcpyHostToDevice,
-                            stream));
-  total_copy_size += sizeof(BatchConfig::streamingCacheInfo);
-
   checkCUDA(cudaMemcpyAsync(
       static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
       &(batch_config->num_tokens_to_commit),

From 77aa1afae014f7d61bc1dc5c85609963da2f4558 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 1 Sep 2024 13:03:34 -0700
Subject: [PATCH 429/667] fix: speculative decoding update_custom_mask only
 consider mask within current batch

---
 src/runtime/request_manager.cu | 40 ++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index cd66d5bfb..d016b53ec 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -130,6 +130,11 @@ __global__ void
 #define test_bit_orig(bit_mask, idx, pos)                                      \
   (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
+// Passing the CPU-side causalMask, then output the bit-packed custom_mask for
+// attention forward.
+// Layout of causalMask: [num_requests][tree_size][tree_size]
+// Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed)
+// Note that for spec-decoding, q_length == last_layer_length != tree_size
 __global__ void
     update_custom_mask_kernel(uint8_t *custom_mask,
                               int32_t const *qk_indptr,
@@ -160,21 +165,24 @@ __global__ void
     }
   }
 
+  BatchConfig::BitMask &causal_mask = causalMask[requext_idx_in_batch];
+
   int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch,
             q_start = request_infos[requext_idx_in_batch]
-                          .first_token_index_in_request;
+                          .first_token_index_in_request - causal_mask.non_tree_cache_size,
+            non_tree_cache_size = causal_mask.non_tree_cache_size;
 
   uint8_t packed_bits = 0;
   for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
     int const bit_offset = byte_idx * 8 + bit_idx,
-              q_idx = bit_offset / (q_start + q_length),
-              kv_idx = bit_offset % (q_start + q_length);
-    if (kv_idx < q_start || q_idx >= q_length) {
+              q_idx = bit_offset / (non_tree_cache_size + q_start + q_length),
+              kv_idx = bit_offset % (non_tree_cache_size + q_start + q_length);
+    if (kv_idx < non_tree_cache_size || q_idx >= q_length) {
       packed_bits |= 1 << bit_idx;
     } else {
-      if (test_bit_orig(causalMask[requext_idx_in_batch].bit_mask,
-                        q_idx,
-                        kv_idx - q_start)) {
+      if (test_bit_orig(causal_mask.bit_mask,
+                        q_start + q_idx,
+                        kv_idx - non_tree_cache_size)) {
         packed_bits |= 1 << bit_idx;
       }
     }
@@ -232,12 +240,12 @@ void RequestManager::load_batch_config_task(
   total_copy_size += sizeof(BatchConfig::request_available);
 
   for (int request_idx = 0;
-        request_idx < BatchConfig::max_requests_per_batch();
-        request_idx++) {
+   request_idx < BatchConfig::max_requests_per_batch();
+       request_idx++) {
     if (batch_config->request_available[request_idx]) {
       checkCUDA(cudaMemcpyAsync(
-          static_cast<char *>(handle.batch_config_metadata) +
-              total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+          static_cast<char *>(handle.batch_config_metadata) + 
+          total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
           &(batch_config->causalMask[request_idx]),
           sizeof(BatchConfig::BitMask),
           cudaMemcpyHostToDevice,
@@ -266,11 +274,11 @@ void RequestManager::load_batch_config_task(
   total_copy_size += sizeof(BatchConfig::committed_tokens);
 
   checkCUDA(cudaMemcpyAsync(
-      static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
-      &(batch_config->num_tokens_to_commit),
-      sizeof(int),
-      cudaMemcpyHostToDevice,
-      stream));
+    static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+                            &(batch_config->num_tokens_to_commit),
+                            sizeof(int),
+                            cudaMemcpyHostToDevice,
+                            stream));
   total_copy_size += sizeof(int);
 
   // load attention metadata

From 425d770aa9a12b9325da7a4a6505e6c4a23669a9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 2 Sep 2024 11:14:01 -0700
Subject: [PATCH 430/667] fix: barrier_flag initial value

---
 src/parallel_ops/kernels/allreduce_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 02fb760fd..60a1afaef 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -166,7 +166,7 @@ void inference_kernel_wrapper(AllReduceMeta *m,
                                 ncclComm,
                                 const_cast<void *>(input.ptr),
                                 stream);
-  params.barrier_flag = (*comm_buffer->barrier_flag)++;
+  params.barrier_flag = ++(*comm_buffer->barrier_flag);
   for (int i = 0; i < num_devices; ++i) {
     params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
   }

From 049dfcbd00404c99cdc2c50d6257cf5908b6ab5b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 2 Sep 2024 11:15:48 -0700
Subject: [PATCH 431/667] fix: barrier_flag initial value

---
 src/parallel_ops/kernels/allreduce_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 02fb760fd..60a1afaef 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -166,7 +166,7 @@ void inference_kernel_wrapper(AllReduceMeta *m,
                                 ncclComm,
                                 const_cast<void *>(input.ptr),
                                 stream);
-  params.barrier_flag = (*comm_buffer->barrier_flag)++;
+  params.barrier_flag = ++(*comm_buffer->barrier_flag);
   for (int i = 0; i < num_devices; ++i) {
     params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
   }

From 263d9d6f8d00e1434333fc2932d6ffa3c9b864d7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 2 Sep 2024 12:29:20 -0700
Subject: [PATCH 432/667] doc: attention meta info

---
 src/runtime/request_manager.cu | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index d016b53ec..48d79ea5c 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -78,7 +78,17 @@ void RequestManager::load_tokens_task(
   }
 }
 
-// NOTE: qk_indptr is accumulative `ceil(qk_len / 8)`
+// q_indptr: the start offset of q in the batch for each request,
+//           the length is `num_requests + 1`: [0, num_q_0, num_q_0 + num_q_1,
+//           ..., num_q_0 + num_q_1 + ... + num_q_{num_requests - 1}]
+// kv_indptr: the start offset of kv page_indices for each request,
+//            the length is `num_requests + 1`.
+// kv_indices: the page indices for kv, the length is `num_kv_pages`.
+// kv_last_page_len: the cache length in the last page for each request,
+//                   the length is `num_requests`.
+// qk_indptr: the start offset of custom_mask in the flattened mask for each
+//            request, the length is `num_requests + 1`. It can be calculated as
+//            accumulative `ceil(qk_len / 8)`.
 __global__ void
     prepare_inference_params_kernel(int const num_requests,
                                     BatchConfig::PerRequestInfo *request_infos,
@@ -169,7 +179,8 @@ __global__ void
 
   int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch,
             q_start = request_infos[requext_idx_in_batch]
-                          .first_token_index_in_request - causal_mask.non_tree_cache_size,
+                          .first_token_index_in_request -
+                      causal_mask.non_tree_cache_size,
             non_tree_cache_size = causal_mask.non_tree_cache_size;
 
   uint8_t packed_bits = 0;
@@ -239,13 +250,12 @@ void RequestManager::load_batch_config_task(
                             stream));
   total_copy_size += sizeof(BatchConfig::request_available);
 
-  for (int request_idx = 0;
-   request_idx < BatchConfig::max_requests_per_batch();
+  for (int request_idx = 0; request_idx < BatchConfig::max_requests_per_batch();
        request_idx++) {
     if (batch_config->request_available[request_idx]) {
       checkCUDA(cudaMemcpyAsync(
-          static_cast<char *>(handle.batch_config_metadata) + 
-          total_copy_size + request_idx * sizeof(BatchConfig::BitMask),
+          static_cast<char *>(handle.batch_config_metadata) + total_copy_size +
+              request_idx * sizeof(BatchConfig::BitMask),
           &(batch_config->causalMask[request_idx]),
           sizeof(BatchConfig::BitMask),
           cudaMemcpyHostToDevice,
@@ -273,8 +283,8 @@ void RequestManager::load_batch_config_task(
   }
   total_copy_size += sizeof(BatchConfig::committed_tokens);
 
-  checkCUDA(cudaMemcpyAsync(
-    static_cast<char *>(handle.batch_config_metadata) + total_copy_size,
+  checkCUDA(cudaMemcpyAsync(static_cast<char *>(handle.batch_config_metadata) +
+                                total_copy_size,
                             &(batch_config->num_tokens_to_commit),
                             sizeof(int),
                             cudaMemcpyHostToDevice,

From 689dbd6578d07b2f4352dd58a2791b2433f5c2b2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 2 Sep 2024 13:46:18 -0700
Subject: [PATCH 433/667] docs: minor

---
 .../inc_multihead_self_attention_kernels.h    | 22 +++++++++++++++++++
 .../inc_multihead_self_attention_kernels.cu   | 22 -------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 4ea246bd6..969c5ad6e 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -54,28 +54,50 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                  DT const *bias_ptr,
                  ffStream_t stream);
 
+// [For the tokens in batch]
+// Apply position embedding for qk.
+// Note that this is only used for tokens in the current batch.
+// For other Key tokens like in streaming cache, we nned other kernel to apply
+// the position embedding.
 template <typename DT>
 void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         DT *output_ptr,
                         cudaStream_t stream);
 
+// [For the tokens in streaming cache]
+// Apply position embedding for k projection in the streaming cache.
+// Note that before the position encoding, the projection is moved *in order* to
+// the kv memory took by the attention kernel. So our operation is applied where
+// kvCache points to.
 template <typename DT>
 void apply_pos_encoding_to_streaming_proj(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream);
 
+// [For the tokens in batch]
+// Update the kv cache, and compact the q array.
+// Source: qkv projeciton array of tokens in the batch.
+// Destination: q&kv ptr took by the attention kernel.
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream);
 
+// [For the tokens in streaming cache]
+// Convert the out-of-order cache to in-order relative position.
+// Source: pre-pos-encoding kv values in the streaming cache.
+// Destination: kv ptr took by the attention kernel.
 template <typename DT>
 void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m,
                                   BatchConfig const *bc,
                                   cudaStream_t stream);
 
+// [For the tokens in batch]
+// Commit the kv values to the streaming cache.
+// Source: qkv projeciton array of tokens in the batch.
+// Destination: pre-pos-encoding kv values in the streaming cache.
 template <typename DT>
 void commit_kv(IncMultiHeadSelfAttentionMeta const *m,
                BatchConfig const *bc,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 3a8def47c..61b010ba6 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -349,11 +349,6 @@ __global__ void
   }
 }
 
-// [For the tokens in batch]
-// Apply position embedding for qk.
-// Note that this is only used for tokens in the current batch.
-// For other Key tokens like in streaming cache, we nned other kernel to apply
-// the position embedding.
 template <typename DT>
 void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
@@ -426,11 +421,6 @@ __global__ void apply_pos_encoding_to_streaming_proj_kernel(
   kv_cache[complex_part_idx] = cii.y;
 }
 
-// [For the tokens in streaming cache]
-// Apply position embedding for k projection in the streaming cache.
-// Note that before the position encoding, the projection is moved *in order* to
-// the kv memory took by the attention kernel. So our operation is applied where
-// kvCache points to.
 template <typename DT>
 void apply_pos_encoding_to_streaming_proj(
     IncMultiHeadSelfAttentionMeta const *m,
@@ -517,10 +507,6 @@ __global__ void update_qkv_in_batch_kernel(
   }
 }
 
-// [For the tokens in batch]
-// Update the kv cache, and compact the q array.
-// Source: qkv projeciton array of tokens in the batch.
-// Destination: q&kv ptr took by the attention kernel.
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
@@ -612,10 +598,6 @@ __global__ void update_kv_in_streaming_cache_kernel(
   kv_cache[to_v_idx + offset] = pre_pos_enc_buf[from_v_idx + offset];
 }
 
-// [For the tokens in streaming cache]
-// Convert the out-of-order cache to in-order relative position.
-// Source: pre-pos-encoding kv values in the streaming cache.
-// Destination: kv ptr took by the attention kernel.
 template <typename DT>
 void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m,
                                   BatchConfig const *bc,
@@ -716,10 +698,6 @@ __global__ void
       qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + offset]);
 }
 
-// [For the tokens in batch]
-// Commit the kv values to the streaming cache.
-// Source: qkv projeciton array of tokens in the batch.
-// Destination: pre-pos-encoding kv values in the streaming cache.
 template <typename DT>
 void commit_kv(IncMultiHeadSelfAttentionMeta const *m,
                BatchConfig const *bc,

From fe5a8ad721b113852184cdfc6df692775365da9d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 2 Sep 2024 18:25:16 -0700
Subject: [PATCH 434/667] Added indexing support for streaming cache.

---
 include/flexflow/batch_config.h    |  15 ++--
 include/flexflow/request_manager.h |   2 +
 src/runtime/batch_config.cc        |   4 ++
 src/runtime/request_manager.cc     | 109 +++++++++++++++++++----------
 4 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 9be933bd3..d56f4e245 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -29,13 +29,13 @@ using InferenceResultFuture = Legion::Future;
 
 /*
  * StreamingCacheInfo is a class that manages the streaming kv cache for
- * attention operator (https://arxiv.org/abs/2309.17453), and we use it in draft
- * model. It matains a fixed-content *sink* cache and a fixed-size *window*
- * cache. The *sink* cache is the foremost part of the original kv cache, while
- * the *window* cache is the backmost part of the original kv cache and is
- * rolling updated. The information is per-request.
- * Note that the position encoding of the q&k alters each iteration (relative
- * position), so we store the *pre-pos-encoding* kv value in the cache.
+ * attention operator (https://arxiv.org/abs/2309.17453), and we use it in the
+ * draft model. It maintains a fixed-content *sink* cache and a fixed-size
+ * *window* cache. The *sink* cache is the foremost part of the original kv
+ * cache, while the *window* cache is the backmost part of the original kv cache
+ * and is rolling updated. The information is per-request. Note that the
+ * position encoding of the q&k alters each iteration (relative position), so we
+ * store the *pre-pos-encoding* kv value in the cache.
  */
 class StreamingCacheInfo {
 public:
@@ -47,6 +47,7 @@ class StreamingCacheInfo {
 
   void commit_cache(int len);
   void reset_cache();
+  int global_2_cache_index(int global_index);
 
 public:
   int sink_cache_size, window_cache_size;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 472b43b52..ac951e626 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -68,6 +68,8 @@ struct Request {
   int batch_index = -1;
   int ssm_cache_size = 0;
   int llm_cache_size = 0;
+  int ssm_prefill_len = 0;
+  int llm_prefill_len = 0;
 
   int first_token_offset_in_batch = 0;
   int num_tokens_in_batch = 0;
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 0dabefce4..a3ad2894a 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -295,4 +295,8 @@ void StreamingCacheInfo::reset_cache() {
   commit_len = 0;
 }
 
+int StreamingCacheInfo::global_2_cache_index(int global_index) {
+  return (global_index - sink_cache_size) % window_cache_size + sink_cache_size;
+}
+
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b131c2765..2f6356792 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -733,11 +733,15 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   bool prefill_completed = false;
-  prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
     prefill_request->streaming_cache_info.commit_cache(
         prefill_request->num_tokens_in_batch);
+    prefill_request->llm_cache_size =
+        prefill_request->streaming_cache_info.commit_len;
+  } else {
+    prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
   }
+  prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
 
   if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
@@ -784,6 +788,9 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     request.llm_cache_size++;
     if (streaming_cache) {
       request.streaming_cache_info.commit_cache(1);
+      request.llm_cache_size = request.streaming_cache_info.commit_len;
+    } else {
+      request.llm_cache_size++;
     }
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
@@ -816,11 +823,15 @@ void RequestManager::update_ssm_prefill_results(
   // This function is called by update_inference_results when the
   // request_manager_status is PREFILLING and the prefill_model is SSM.
   // There's no results to update, but we should update ssm_cache_size.
-  prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
   if (streaming_cache) {
     prefill_request->streaming_cache_info.commit_cache(
         prefill_request->num_tokens_in_batch);
+    prefill_request->ssm_cache_size =
+        prefill_request->streaming_cache_info.commit_len;
+  } else {
+    prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
   }
+  prefill_request->ssm_prefill_len += prefill_request->num_tokens_in_batch;
 
   profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
 }
@@ -898,28 +909,27 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   bc.requestsInfo[request_index].first_token_index_in_request =
       prefill_request->llm_cache_size;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      get_max_tokens_per_batch(),
-      (int)prefill_request->tokens.size() - prefill_request->llm_cache_size);
+  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
+                                     (int)prefill_request->tokens.size() -
+                                         prefill_request->llm_prefill_len);
+  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
 
   // Copy the streaming cache info
   bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
 
   prefill_request->first_token_offset_in_batch = 0;
-  prefill_request->num_tokens_in_batch =
-      bc.requestsInfo[request_index].num_tokens_in_batch;
+  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
 
   // Token Info
-  for (int token_idx = 0;
-       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
-       token_idx++) {
+  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
     int abs_idx = prefill_request->llm_cache_size + token_idx;
     assert(abs_idx < prefill_request->tokens.size());
 
     bc.tokensInfo[token_idx].request_index = request_index;
     bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
     bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+    bc.tokensInfo[token_idx].token_id =
+        prefill_request->tokens[prefill_request->llm_prefill_len + token_idx];
 
     bc.num_tokens++;
   }
@@ -955,28 +965,27 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   bc.requestsInfo[request_index].first_token_index_in_request =
       prefill_request->ssm_cache_size;
-  bc.requestsInfo[request_index].num_tokens_in_batch = std::min(
-      get_max_tokens_per_batch(),
-      (int)prefill_request->tokens.size() - prefill_request->ssm_cache_size);
+  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
+                                     (int)prefill_request->tokens.size() -
+                                         prefill_request->ssm_prefill_len);
+  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
 
   // Copy the streaming cache info
   bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
 
   prefill_request->first_token_offset_in_batch = 0;
-  prefill_request->num_tokens_in_batch =
-      bc.requestsInfo[request_index].num_tokens_in_batch;
+  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
 
   // Token Info
-  for (int token_idx = 0;
-       token_idx < bc.requestsInfo[request_index].num_tokens_in_batch;
-       token_idx++) {
+  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
     int abs_idx = prefill_request->ssm_cache_size + token_idx;
     assert(abs_idx < prefill_request->tokens.size());
 
     bc.tokensInfo[token_idx].request_index = request_index;
     bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
     bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-    bc.tokensInfo[token_idx].token_id = prefill_request->tokens[abs_idx];
+    bc.tokensInfo[token_idx].token_id =
+        prefill_request->tokens[prefill_request->ssm_prefill_len + token_idx];
 
     bc.num_tokens++;
   }
@@ -1094,13 +1103,22 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     if (num_committed_tokens == 1) {
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 1;
       // The case where the prefilling is just finished. Although the last
-      // token's kv cache is already there, the we need to decode the last token
-      // because it's the root of the token tree.
+      // token's kv cache is already there, the we need to decode the last
+      // token because it's the root of the token tree.
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-      new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-          committed_tokens[0].to_index;
-      new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-          committed_tokens[0].to_index;
+      if (streaming_cache) {
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            request.streaming_cache_info.global_2_cache_index(
+                committed_tokens[0].to_index);
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            request.streaming_cache_info.global_2_cache_index(
+                committed_tokens[0].to_index);
+      } else {
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            committed_tokens[0].to_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            committed_tokens[0].to_index;
+      }
       new_bc.tokensInfo[new_bc.num_tokens].token_id =
           committed_tokens[0].token_id;
       new_bc.num_tokens++;
@@ -1109,10 +1127,19 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
            committed_token_index < committed_tokens.size();
            committed_token_index++) {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-            committed_tokens[committed_token_index].to_index;
-        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-            committed_tokens[committed_token_index].to_index;
+        if (streaming_cache) {
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+              request.streaming_cache_info.global_2_cache_index(
+                  committed_tokens[committed_token_index].to_index);
+          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+              request.streaming_cache_info.global_2_cache_index(
+                  committed_tokens[committed_token_index].to_index);
+        } else {
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+              committed_tokens[committed_token_index].to_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+              committed_tokens[committed_token_index].to_index;
+        }
         new_bc.tokensInfo[new_bc.num_tokens].token_id =
             committed_tokens[committed_token_index].token_id;
         new_bc.num_tokens++;
@@ -1129,6 +1156,10 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     // Copy the causal mask, it should already been updated in
     // update_llm_verify_results
     new_bc.causalMask[request_index] = request.causal_mask;
+    if (streaming_cache) {
+      new_bc.causalMask[request_index].non_tree_cache_size =
+          request.ssm_cache_size - 1;
+    }
 
     // Copy the streaming cache info
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
@@ -1182,9 +1213,9 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
       // This request has no token to decode in this and the following small
       // model inference steps
       new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+      // non_tree_cache_size = ssm_cache_size - 1
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.causal_mask.non_tree_cache_size +
-          request.causal_mask.tree_or_prompt_size -
+          request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size -
           request.causal_mask.current_layer_size;
       request.num_tokens_in_batch = 0;
       request.first_token_offset_in_batch = new_bc.num_tokens;
@@ -1194,9 +1225,9 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
           token_tree.tree_layers.back();
       // Exclude the current layer from the token tree, because we want the
       // start index
+      // non_tree_cache_size = ssm_cache_size - 1
       new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.causal_mask.non_tree_cache_size +
-          request.causal_mask.tree_or_prompt_size -
+          request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size -
           request.causal_mask.current_layer_size;
       new_bc.requestsInfo[request_index].num_tokens_in_batch =
           request.causal_mask.current_layer_size;
@@ -1212,7 +1243,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
             new_bc.requestsInfo[request_index].first_token_index_in_request +
             child_index;
         new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-            request.tokens.size() - 1 + current_ssm_step;
+            request.ssm_cache_size - 1 + current_ssm_step;
         new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
 
         new_bc.num_tokens++;
@@ -1223,6 +1254,10 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
     // Copy the causal mask, it should already been updated by
     // update_ssm_inference_results
     new_bc.causalMask[request_index] = request.causal_mask;
+    if (streaming_cache) {
+      new_bc.causalMask[request_index].non_tree_cache_size =
+          request.ssm_cache_size - 1;
+    }
 
     // Copy the streaming cache info
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
@@ -1468,9 +1503,11 @@ bool RequestManager::update_ssm_inference_results(
     assert(request.status == Request::RUNNING);
 
     if (current_ssm_step == 1) {
-      request.ssm_cache_size = request.tokens.size();
       if (streaming_cache) {
         request.streaming_cache_info.commit_cache(request.num_tokens_in_batch);
+        request.ssm_cache_size = request.streaming_cache_info.commit_len;
+      } else {
+        request.ssm_cache_size = request.tokens.size();
       }
     }
 

From 4a32e47eb33cf4ffe0c5746c5903bf8e2242fc30 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 07:15:53 -0700
Subject: [PATCH 435/667] Fix bugs.

---
 include/flexflow/request_manager.h |   37 +-
 src/runtime/request_manager.cc     | 4523 ++++++++++++++--------------
 2 files changed, 2313 insertions(+), 2247 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b63eaf6b8..2331ef220 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -202,21 +202,20 @@ struct Request {
   // 1. Prefilling phase
   // 2. Committing phase after the target model verification
   StreamingCacheInfo streaming_cache_info;
-};
 
-std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                    std::vector<std::shared_ptr<TokenTreeNode>>,
-                    SharedTokenTreeNodePtrLess>
-    token_tree_nodes_pq;
+  std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                      std::vector<std::shared_ptr<TokenTreeNode>>,
+                      SharedTokenTreeNodePtrLess>
+      token_tree_nodes_pq;
 
-double get_length_weight();
-void set_slo_ratio(double slo_ratio_);
-double get_slo_ratio();
+  double get_length_weight();
+  void set_slo_ratio(double slo_ratio_);
+  double get_slo_ratio();
 };
 
 // A comparator for std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
 // This is used to sort the token tree nodes in ascending order
-struct SharedTokenTreeNodePtrRequestGreater {
+struct SharedTokenTreeNodePtrRequestWeightedGreater {
   bool operator()(
       std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &lhs,
       std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &rhs) const {
@@ -230,6 +229,18 @@ struct SharedTokenTreeNodePtrRequestGreater {
   }
 };
 
+struct SharedTokenTreeNodePtrRequestGreater {
+  bool operator()(
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &lhs,
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &rhs) const {
+    if (lhs.first->gumbel) {
+      assert(rhs.first->gumbel);
+      return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
+    }
+    return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
+  }
+};
+
 class RequestManager {
 public:
   enum State {
@@ -288,6 +299,8 @@ class RequestManager {
   void set_correction_factor(double correction_factor);
   double get_correction_factor();
   void set_streaming_cache(bool streaming_cache);
+  bool get_memory_occupancy();
+  void set_memory_occupancy(bool memory_occupancy);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -370,6 +383,7 @@ class RequestManager {
   bool speculative_sampling = false;
   // specify if enable streaming cache for incremental decoding or draft model
   bool streaming_cache = false;
+  bool memory_occupancy = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -485,6 +499,8 @@ class RequestManager {
   void prune_token_tree();
   void add_tokens_toward_slo(RequestGuid guid, int &budget);
   void add_tokens_toward_memory_occupancy(int budget);
+  void add_tokens_toward_goodput(int budget);
+
   /* ---------- Spec Decoding Helper Functions ---------- */
   void renormalize(std::vector<std::pair<TokenId, float>> &D,
                    std::unordered_map<TokenId, float> &R,
@@ -499,5 +515,4 @@ class RequestManager {
   // Profiling related functions
   void reset_profiling_statistics();
 };
-}
-; // namespace FlexFlow
+}; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 864cb8d10..2f9dc3250 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -248,1906 +248,1949 @@ void RequestManager::set_correction_factor(double correction_factor_) {
 
 double RequestManager::get_correction_factor() {
   return correction_factor;
-  void RequestManager::set_streaming_cache(bool streaming_cache_) {
-    streaming_cache = streaming_cache_;
-  }
-
-  void RequestManager::register_tokenizer(ModelType type,
-                                          int bos_token_id,
-                                          int eos_token_id,
-                                          std::string const &path) {
-    this->model_type = type;
-    this->bos_token_id = bos_token_id;
-    this->eos_token_id = eos_token_id;
-    std::string tokenizer_folder =
-        (!path.empty() && path.back() != '/') ? path + '/' : path;
-    if (model_type == ModelType::LLAMA) {
-      bool path_to_file = !path.empty() &&
-                          (path.size() >= strlen("tokenizer.model")) &&
-                          path.find("tokenizer.model") ==
-                              (path.size() - strlen("tokenizer.model"));
-      std::string tokenizer_filepath =
-          path_to_file ? path : tokenizer_folder + "tokenizer.model";
-      this->tokenizer_ = Tokenizer::FromBlobSentencePiece(
-          LoadBytesFromFile(tokenizer_filepath));
-    } else if (model_type == ModelType::OPT) {
-      std::string vocab_file = tokenizer_folder + "vocab.json";
-      std::string merges_file = tokenizer_folder + "merges.txt";
-      std::string added_tokens_file =
-          tokenizer_folder + "special_tokens_map.json";
-      std::filesystem::path path1(vocab_file);
-      std::filesystem::path path2(merges_file);
-      std::filesystem::path path3(added_tokens_file);
-      assert(std::filesystem::exists(path1) &&
-             "Vocab file vocab.json does not exist at the specified path");
-      assert(std::filesystem::exists(path2) &&
-             "Merge file merges.txt does not exist at the specified path");
-      // opt_tokenizer = new OptTokenizer(vocab_file, merges_file);
-      std::string vocab = LoadBytesFromFile(path1.string());
-      std::string merges = LoadBytesFromFile(path2.string());
-      std::string added_tokens = LoadBytesFromFile(path3.string());
-
-      this->tokenizer_ =
-          Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens);
-    } else if (model_type == ModelType::FALCON ||
-               model_type == ModelType::STARCODER ||
-               model_type == ModelType::MPT) {
-      std::string falcon_tokenizer_path = join_path({path, "tokenizer.json"});
-      this->tokenizer_ =
-          Tokenizer::FromBlobJSON(LoadBytesFromFile(falcon_tokenizer_path));
-    }
-  }
-
-  void RequestManager::register_output_filepath(
-      std::string const &_output_filepath) {
-    this->output_filepath = _output_filepath;
-  }
-
-  int RequestManager::register_ssm_model(FFModel * model) {
-    int model_id = ssm_models.size();
-    ssm_models.push_back(model);
-    std::cout << "Register new ssm model with id: " << model_id << std::endl;
-    return model_id;
-  }
-
-  FFModel *RequestManager::get_ssm_model(int model_id) {
-    assert(model_id >= 0 && model_id < ssm_models.size());
-    return ssm_models[model_id];
-  }
-
-  size_t RequestManager::get_num_ssms() {
-    return ssm_models.size();
-  }
-
-  RequestManager::RequestGuid RequestManager::register_new_request(
-      std::vector<TokenId> const &prompt) {
-    std::lock_guard<std::mutex> const lock(request_queue_mutex);
+}
 
-    // Add a new request
-    Request request;
-    request.status = Request::PENDING;
-    request.guid = next_available_guid++;
+void RequestManager::set_streaming_cache(bool streaming_cache_) {
+  streaming_cache = streaming_cache_;
+}
 
-    if (prompt.size() >= get_max_sequence_length()) {
-      std::cout << "Warning: too many tokens in prompt, only load up to "
-                << get_max_sequence_length() << " tokens, but got "
-                << prompt.size() << ".\n";
+bool RequestManager::get_memory_occupancy() {
+  return memory_occupancy;
+}
 
-      printf("tokens size: %zu\n", request.tokens.size());
-      return INVALID_GUID;
-    } else {
-      request.tokens = prompt;
-    }
+void RequestManager::set_memory_occupancy(bool memory_occupancy_) {
+  memory_occupancy = memory_occupancy_;
+}
 
-    if (get_num_ssms() == 0) {
-      std::cout << "No small speculative model registered, using incremental "
-                   "decoding."
-                << std::endl;
-    } else {
-      std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
-      assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
-      init_token_tree(request.guid);
-      request.streaming_cache_info = StreamingCacheInfo(
-          BatchConfig::SINK_SIZE,
-          BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
-    }
+void RequestManager::register_tokenizer(ModelType type,
+                                        int bos_token_id,
+                                        int eos_token_id,
+                                        std::string const &path) {
+  this->model_type = type;
+  this->bos_token_id = bos_token_id;
+  this->eos_token_id = eos_token_id;
+  std::string tokenizer_folder =
+      (!path.empty() && path.back() != '/') ? path + '/' : path;
+  if (model_type == ModelType::LLAMA) {
+    bool path_to_file = !path.empty() &&
+                        (path.size() >= strlen("tokenizer.model")) &&
+                        path.find("tokenizer.model") ==
+                            (path.size() - strlen("tokenizer.model"));
+    std::string tokenizer_filepath =
+        path_to_file ? path : tokenizer_folder + "tokenizer.model";
+    this->tokenizer_ =
+        Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath));
+  } else if (model_type == ModelType::OPT) {
+    std::string vocab_file = tokenizer_folder + "vocab.json";
+    std::string merges_file = tokenizer_folder + "merges.txt";
+    std::string added_tokens_file =
+        tokenizer_folder + "special_tokens_map.json";
+    std::filesystem::path path1(vocab_file);
+    std::filesystem::path path2(merges_file);
+    std::filesystem::path path3(added_tokens_file);
+    assert(std::filesystem::exists(path1) &&
+           "Vocab file vocab.json does not exist at the specified path");
+    assert(std::filesystem::exists(path2) &&
+           "Merge file merges.txt does not exist at the specified path");
+    // opt_tokenizer = new OptTokenizer(vocab_file, merges_file);
+    std::string vocab = LoadBytesFromFile(path1.string());
+    std::string merges = LoadBytesFromFile(path2.string());
+    std::string added_tokens = LoadBytesFromFile(path3.string());
+
+    this->tokenizer_ =
+        Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens);
+  } else if (model_type == ModelType::FALCON ||
+             model_type == ModelType::STARCODER ||
+             model_type == ModelType::MPT) {
+    std::string falcon_tokenizer_path = join_path({path, "tokenizer.json"});
+    this->tokenizer_ =
+        Tokenizer::FromBlobJSON(LoadBytesFromFile(falcon_tokenizer_path));
+  }
+}
 
-    pending_request_queue.push(request);
-    all_requests[request.guid] = request;
-    {
-      std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
-      request_to_promise[request.guid] = new std::promise<void>();
-    }
+void RequestManager::register_output_filepath(
+    std::string const &_output_filepath) {
+  this->output_filepath = _output_filepath;
+}
 
-    if (verbose) {
-      std::cout << "new req: " << request.tokens.size() << std::endl;
-      for (int i = 0; i < request.tokens.size(); i++) {
-        std::cout << i << " : " << request.tokens[i] << std::endl;
-      }
-    }
+int RequestManager::register_ssm_model(FFModel *model) {
+  int model_id = ssm_models.size();
+  ssm_models.push_back(model);
+  std::cout << "Register new ssm model with id: " << model_id << std::endl;
+  return model_id;
+}
+
+FFModel *RequestManager::get_ssm_model(int model_id) {
+  assert(model_id >= 0 && model_id < ssm_models.size());
+  return ssm_models[model_id];
+}
 
-    GenerationResult gr;
-    gr.guid = request.guid;
-    gr.input_text = "";
-    gr.input_tokens = prompt;
-    gr.output_text = "";
-    gr.output_tokens = prompt;
-    request_generation_results[request.guid] = gr;
+size_t RequestManager::get_num_ssms() {
+  return ssm_models.size();
+}
 
-    return request.guid;
+RequestManager::RequestGuid
+    RequestManager::register_new_request(std::vector<TokenId> const &prompt) {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+
+  // Add a new request
+  Request request;
+  request.status = Request::PENDING;
+  request.guid = next_available_guid++;
+
+  if (prompt.size() >= get_max_sequence_length()) {
+    std::cout << "Warning: too many tokens in prompt, only load up to "
+              << get_max_sequence_length() << " tokens, but got "
+              << prompt.size() << ".\n";
+
+    printf("tokens size: %zu\n", request.tokens.size());
+    return INVALID_GUID;
+  } else {
+    request.tokens = prompt;
+  }
+
+  if (get_num_ssms() == 0) {
+    std::cout << "No small speculative model registered, using incremental "
+                 "decoding."
+              << std::endl;
+  } else {
+    std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
+    assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
+    init_token_tree(request.guid);
+    request.streaming_cache_info = StreamingCacheInfo(
+        BatchConfig::SINK_SIZE,
+        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+  }
+
+  pending_request_queue.push(request);
+  all_requests[request.guid] = request;
+  {
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
+    request_to_promise[request.guid] = new std::promise<void>();
   }
 
-  RequestManager::RequestGuid RequestManager::register_new_request(
-      std::string const &prompt) {
-    std::lock_guard<std::mutex> const lock(request_queue_mutex);
-    // Add a new request
-    Request request;
-    request.status = Request::PENDING;
-    request.guid = next_available_guid++;
-    if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
-      request.tokens.push_back(bos_token_id);
-    }
-    std::vector<int32_t> tokens = this->tokenizer_->Encode(prompt);
-    if (tokens.size() >= get_max_sequence_length()) {
-      std::cout << "Warning: too many tokens in prompt, only load up to "
-                << get_max_sequence_length() << " tokens, but got "
-                << tokens.size() << ".\n";
-
-      printf("tokens size: %zu\n", tokens.size());
-      return INVALID_GUID;
-    }
-    for (int i = 0; i < tokens.size(); i++) {
-      std::cout << "[" << i << "]" << tokens.at(i) << "\n";
-    }
-    request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
-
-    if (get_num_ssms() == 0) {
-      std::cout << "No small speculative model registered, using incremental "
-                   "decoding."
-                << std::endl;
-    } else {
-      std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
-      assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
-      init_token_tree(request.guid);
-      request.streaming_cache_info = StreamingCacheInfo(
-          BatchConfig::SINK_SIZE,
-          BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
-    }
-
-    pending_request_queue.push(request);
-    all_requests[request.guid] = request;
-    {
-      std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
-      request_to_promise[request.guid] = new std::promise<void>();
-    }
-
-    {
-      std::string output = "New request tokens:";
-      output = "[" + std::to_string(request.guid) + "] " + output;
-      for (int i = 0; i < request.tokens.size(); i++) {
-        output = output + " " + std::to_string(request.tokens[i]);
-      }
-      log_req_mgr.print("%s", output.c_str());
-      write_to_output_file("", output);
+  if (verbose) {
+    std::cout << "new req: " << request.tokens.size() << std::endl;
+    for (int i = 0; i < request.tokens.size(); i++) {
+      std::cout << i << " : " << request.tokens[i] << std::endl;
     }
-
-    GenerationResult gr;
-    gr.guid = request.guid;
-    gr.input_text = prompt;
-    gr.input_tokens = request.tokens;
-    gr.output_text = prompt;
-    gr.output_tokens = request.tokens;
-    request_generation_results[request.guid] = gr;
-    return request.guid;
   }
 
-  bool RequestManager::is_request_completed(RequestGuid const &guid) {
-    std::lock_guard<std::mutex> const lock(request_queue_mutex);
-    assert(all_requests.find(guid) != all_requests.end());
-    Request const &request = all_requests[guid];
-    // return request.tokens.size() >= request.max_sequence_length;
-    return request.status == Request::COMPLETED;
-  }
+  GenerationResult gr;
+  gr.guid = request.guid;
+  gr.input_text = "";
+  gr.input_tokens = prompt;
+  gr.output_text = "";
+  gr.output_tokens = prompt;
+  request_generation_results[request.guid] = gr;
 
-  GenerationResult RequestManager::get_generation_result(
-      RequestGuid const &guid) {
-    // First get the future of the request
-    std::future<void> future;
-    {
-      std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
-      assert(request_to_promise.find(guid) != request_to_promise.end());
-      future = request_to_promise[guid]->get_future();
-    }
-    // Wait until the result is completed
-    future.get();
-    // Get the generation result
-    {
-      std::lock_guard<std::mutex> const lock(request_queue_mutex);
-      assert(request_generation_results.find(guid) !=
-             request_generation_results.end());
-      return request_generation_results[guid];
-    }
-  }
+  return request.guid;
+}
 
-  size_t RequestManager::get_num_processed_requests() {
-    return num_processed_requests;
+RequestManager::RequestGuid
+    RequestManager::register_new_request(std::string const &prompt) {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  // Add a new request
+  Request request;
+  request.status = Request::PENDING;
+  request.guid = next_available_guid++;
+  if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
+    request.tokens.push_back(bos_token_id);
+  }
+  std::vector<int32_t> tokens = this->tokenizer_->Encode(prompt);
+  if (tokens.size() >= get_max_sequence_length()) {
+    std::cout << "Warning: too many tokens in prompt, only load up to "
+              << get_max_sequence_length() << " tokens, but got "
+              << tokens.size() << ".\n";
+
+    printf("tokens size: %zu\n", tokens.size());
+    return INVALID_GUID;
+  }
+  for (int i = 0; i < tokens.size(); i++) {
+    std::cout << "[" << i << "]" << tokens.at(i) << "\n";
+  }
+  request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
+
+  if (get_num_ssms() == 0) {
+    std::cout << "No small speculative model registered, using incremental "
+                 "decoding."
+              << std::endl;
+  } else {
+    std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
+    assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
+    init_token_tree(request.guid);
+    request.streaming_cache_info = StreamingCacheInfo(
+        BatchConfig::SINK_SIZE,
+        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+  }
+
+  pending_request_queue.push(request);
+  all_requests[request.guid] = request;
+  {
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
+    request_to_promise[request.guid] = new std::promise<void>();
   }
 
-  int RequestManager::get_num_active_requests() {
-    int count = 0;
-    for (int i = 0; i < get_max_requests_per_batch(); i++) {
-      if (guid_of_requests[i] != INVALID_GUID) {
-        count++;
-      }
+  {
+    std::string output = "New request tokens:";
+    output = "[" + std::to_string(request.guid) + "] " + output;
+    for (int i = 0; i < request.tokens.size(); i++) {
+      output = output + " " + std::to_string(request.tokens[i]);
     }
-    return count;
+    log_req_mgr.print("%s", output.c_str());
+    write_to_output_file("", output);
   }
 
-  int RequestManager::get_empty_request_index() {
-    for (int i = 0; i < get_max_requests_per_batch(); i++) {
-      if (guid_of_requests[i] == INVALID_GUID) {
-        return i;
-      }
-    }
-    return -1;
-  }
+  GenerationResult gr;
+  gr.guid = request.guid;
+  gr.input_text = prompt;
+  gr.input_tokens = request.tokens;
+  gr.output_text = prompt;
+  gr.output_tokens = request.tokens;
+  request_generation_results[request.guid] = gr;
+  return request.guid;
+}
+
+bool RequestManager::is_request_completed(RequestGuid const &guid) {
+  std::lock_guard<std::mutex> const lock(request_queue_mutex);
+  assert(all_requests.find(guid) != all_requests.end());
+  Request const &request = all_requests[guid];
+  // return request.tokens.size() >= request.max_sequence_length;
+  return request.status == Request::COMPLETED;
+}
 
-  BatchConfigFuture RequestManager::get_next_batch_config(
-      InferenceResultFuture const &result, Context ctx, Runtime *runtime) {
-    RequestManager *rm = this;
-    TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
-                          TaskArgument(&rm, sizeof(RequestManager *)));
-    launcher.add_future(result);
-    return runtime->execute_task(ctx, launcher);
+GenerationResult
+    RequestManager::get_generation_result(RequestGuid const &guid) {
+  // First get the future of the request
+  std::future<void> future;
+  {
+    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
+    assert(request_to_promise.find(guid) != request_to_promise.end());
+    future = request_to_promise[guid]->get_future();
+  }
+  // Wait until the result is completed
+  future.get();
+  // Get the generation result
+  {
+    std::lock_guard<std::mutex> const lock(request_queue_mutex);
+    assert(request_generation_results.find(guid) !=
+           request_generation_results.end());
+    return request_generation_results[guid];
   }
+}
 
-  BatchConfig RequestManager::get_next_batch_config_task(
-      Task const *task,
-      std::vector<PhysicalRegion> const &regions,
-      Context ctx,
-      Runtime *runtime) {
-    RequestManager *rm = *((RequestManager **)task->args);
-    if (rm->request_manager_status == PREFILLING and
-        rm->prefill_model == SSM and rm->current_ssm_step != 0) {
-      // Return an empty batch config
-      return rm->get_next_batch_config(InferenceResult());
-    } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) {
-      return rm->get_next_batch_config(InferenceResult());
-    }
+size_t RequestManager::get_num_processed_requests() {
+  return num_processed_requests;
+}
 
-    InferenceResult const &result =
-        Future(task->futures[0]).get_result<InferenceResult>();
-    return rm->get_next_batch_config(result);
+int RequestManager::get_num_active_requests() {
+  int count = 0;
+  for (int i = 0; i < get_max_requests_per_batch(); i++) {
+    if (guid_of_requests[i] != INVALID_GUID) {
+      count++;
+    }
   }
+  return count;
+}
 
-  BatchConfig RequestManager::get_next_batch_config(
-      InferenceResult const &result) {
-    update_inference_results(result);
-    return prepare_next_batch();
+int RequestManager::get_empty_request_index() {
+  for (int i = 0; i < get_max_requests_per_batch(); i++) {
+    if (guid_of_requests[i] == INVALID_GUID) {
+      return i;
+    }
   }
+  return -1;
+}
 
-  void RequestManager::load_pending_request_to_batch() {
-    assert(!pending_request_queue.empty() && "No pending request to process.");
-    RequestGuid guid = pending_request_queue.front().guid;
-    pending_request_queue.pop();
+BatchConfigFuture RequestManager::get_next_batch_config(
+    InferenceResultFuture const &result, Context ctx, Runtime *runtime) {
+  RequestManager *rm = this;
+  TaskLauncher launcher(RM_GET_NEXT_BATCH_CONFIG_TASK_ID,
+                        TaskArgument(&rm, sizeof(RequestManager *)));
+  launcher.add_future(result);
+  return runtime->execute_task(ctx, launcher);
+}
 
-    prefill_request = &all_requests[guid];
-    prefill_request->status = Request::RUNNING;
+BatchConfig RequestManager::get_next_batch_config_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  RequestManager *rm = *((RequestManager **)task->args);
+  if (rm->request_manager_status == PREFILLING and rm->prefill_model == SSM and
+      rm->current_ssm_step != 0) {
+    // Return an empty batch config
+    return rm->get_next_batch_config(InferenceResult());
+  } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) {
+    return rm->get_next_batch_config(InferenceResult());
+  }
+
+  InferenceResult const &result =
+      Future(task->futures[0]).get_result<InferenceResult>();
+  return rm->get_next_batch_config(result);
+}
 
-    // Find an empty slot
-    int request_index = get_empty_request_index();
-    assert(request_index != -1 && "No empty request slot to load the request.");
-    // Load request into batch
-    prefill_request->batch_index = request_index;
-    guid_of_requests[request_index] = guid;
-    request_available[request_index] = true;
-    num_available_requests++;
-    // Initialize the bitmask for the new request with its prompt length
-    init_bitmask_prompt(guid, prefill_request->tokens.size());
+BatchConfig
+    RequestManager::get_next_batch_config(InferenceResult const &result) {
+  update_inference_results(result);
+  return prepare_next_batch();
+}
 
-    profiling_requests[guid] = RequestProfileInfo();
-    profiling_requests[guid].start_time =
-        Realm::Clock::current_time_in_microseconds();
-  }
+void RequestManager::load_pending_request_to_batch() {
+  assert(!pending_request_queue.empty() && "No pending request to process.");
+  RequestGuid guid = pending_request_queue.front().guid;
+  pending_request_queue.pop();
+
+  prefill_request = &all_requests[guid];
+  prefill_request->status = Request::RUNNING;
+
+  // Find an empty slot
+  int request_index = get_empty_request_index();
+  assert(request_index != -1 && "No empty request slot to load the request.");
+  // Load request into batch
+  prefill_request->batch_index = request_index;
+  guid_of_requests[request_index] = guid;
+  request_available[request_index] = true;
+  num_available_requests++;
+  // Initialize the bitmask for the new request with its prompt length
+  init_bitmask_prompt(guid, prefill_request->tokens.size());
+
+  profiling_requests[guid] = RequestProfileInfo();
+  profiling_requests[guid].start_time =
+      Realm::Clock::current_time_in_microseconds();
+}
 
-  void RequestManager::request_complete_clean_up(int batch_index) {
-    RequestGuid guid = guid_of_requests[batch_index];
-    profiling_requests[guid].finish_time =
-        Realm::Clock::current_time_in_microseconds();
-    Request &request = all_requests[guid];
-    guid_of_requests[batch_index] = INVALID_GUID;
-    request_available[batch_index] = false;
-    num_available_requests--;
-    request.status = Request::COMPLETED;
-
-    // Find the sos and eos in the sequence
-    auto bos_it = std::find(
-        request.tokens.begin(), request.tokens.end(), this->bos_token_id);
-    auto eos_rit = std::find(
-        request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id);
-    std::vector<int>::iterator eos_it;
-    if (eos_rit != request.tokens.rend()) {
-      eos_it = eos_rit.base();
-    } else {
-      eos_it = request.tokens.end();
-    }
-    std::string output =
-        this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
-
-    std::cout << "Request " << guid << " completed: " << std::endl << std::endl;
-    std::cout << "<bos>" << output;
-    if (eos_rit != request.tokens.rend()) {
-      std::cout << "<eos>";
-    }
-    std::cout << std::endl << std::endl;
-    {
-      RequestProfileInfo profile_info = profiling_requests[guid];
-
-      std::ostream *os = &std::cout;
-      std::ofstream output_file;
-      if (!output_filepath.empty()) {
-        output_file.open(output_filepath, std::ios::app);
-        if (output_file.is_open()) {
-          os = &output_file;
-        } else {
-          std::cout << "Unable to open the output file: " << output_filepath
-                    << std::endl;
-          assert(false);
-        }
-      }
-      *os << "Request " << guid << " profiling: " << std::endl;
-      if (profile_info.start_decoding_time != 0) {
-        *os << "Decoding time: "
-            << (profile_info.finish_time - profile_info.start_decoding_time) *
-                   1e-3
-            << " ms" << std::endl;
+void RequestManager::request_complete_clean_up(int batch_index) {
+  RequestGuid guid = guid_of_requests[batch_index];
+  profiling_requests[guid].finish_time =
+      Realm::Clock::current_time_in_microseconds();
+  Request &request = all_requests[guid];
+  guid_of_requests[batch_index] = INVALID_GUID;
+  request_available[batch_index] = false;
+  num_available_requests--;
+  request.status = Request::COMPLETED;
+
+  // Find the sos and eos in the sequence
+  auto bos_it = std::find(
+      request.tokens.begin(), request.tokens.end(), this->bos_token_id);
+  auto eos_rit = std::find(
+      request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id);
+  std::vector<int>::iterator eos_it;
+  if (eos_rit != request.tokens.rend()) {
+    eos_it = eos_rit.base();
+  } else {
+    eos_it = request.tokens.end();
+  }
+  std::string output =
+      this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
+
+  std::cout << "Request " << guid << " completed: " << std::endl << std::endl;
+  std::cout << "<bos>" << output;
+  if (eos_rit != request.tokens.rend()) {
+    std::cout << "<eos>";
+  }
+  std::cout << std::endl << std::endl;
+  {
+    RequestProfileInfo profile_info = profiling_requests[guid];
+
+    std::ostream *os = &std::cout;
+    std::ofstream output_file;
+    if (!output_filepath.empty()) {
+      output_file.open(output_filepath, std::ios::app);
+      if (output_file.is_open()) {
+        os = &output_file;
       } else {
-        *os << "Decoding time: 0 ms" << std::endl;
+        std::cout << "Unable to open the output file: " << output_filepath
+                  << std::endl;
+        assert(false);
       }
-      *os << "Total time: "
-          << (profile_info.finish_time - profile_info.start_time) * 1e-3
+    }
+    *os << "Request " << guid << " profiling: " << std::endl;
+    if (profile_info.start_decoding_time != 0) {
+      *os << "Decoding time: "
+          << (profile_info.finish_time - profile_info.start_decoding_time) *
+                 1e-3
           << " ms" << std::endl;
-      *os << "LLM decoding steps: " << profile_info.llm_decoding_steps
+    } else {
+      *os << "Decoding time: 0 ms" << std::endl;
+    }
+    *os << "Total time: "
+        << (profile_info.finish_time - profile_info.start_time) * 1e-3 << " ms"
+        << std::endl;
+    *os << "LLM decoding steps: " << profile_info.llm_decoding_steps
+        << std::endl;
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
           << std::endl;
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
-            << std::endl;
-      }
-      *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+    }
+    *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+
+    if (!output_filepath.empty()) {
+      output_file.close();
+    }
+  }
+  // RequestProfileInfo profile_info = profiling_requests[guid];
+  // std::string str =
+  //     "[" + std::to_string(guid) +
+  //     "] Request completed:" + " decoding_time_ms(" +
+  //     std::to_string(
+  //         (profile_info.finish_time - profile_info.start_decoding_time) *
+  //         1e-3) +
+  //     ")" + " total_time_ms(" +
+  //     std::to_string((profile_info.finish_time - profile_info.start_time) *
+  //                    1e-3) +
+  //     ")" + " LLM_decoding_steps(" +
+  //     std::to_string(profile_info.llm_decoding_steps) + ")";
+  // if (decoding_mode == SPECULATIVE_DECODING) {
+  //   str = str + " SSM_decoding_steps(" +
+  //         std::to_string(profile_info.ssm_decoding_steps) + ")";
+  // }
+  // write_to_output_file("", str);
+
+  trigger_request_completion_future(guid);
+}
 
-      if (!output_filepath.empty()) {
-        output_file.close();
+void RequestManager::update_inference_results(InferenceResult const &result) {
+  // Update the inference results
+  std::lock_guard<std::mutex> const rm_state_lock(rm_state_mutex);
+  std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
+
+  if (num_available_requests == 0) {
+    // Update nothing
+    if (!pending_request_queue.empty()) {
+      // Load the pending request to the batch
+      load_pending_request_to_batch();
+      request_manager_status = PREFILLING;
+      if (decoding_mode == SPECULATIVE_DECODING) {
+        prefill_model = SSM;
+        current_ssm_step = 0;
       }
     }
-    // RequestProfileInfo profile_info = profiling_requests[guid];
-    // std::string str =
-    //     "[" + std::to_string(guid) +
-    //     "] Request completed:" + " decoding_time_ms(" +
-    //     std::to_string(
-    //         (profile_info.finish_time - profile_info.start_decoding_time) *
-    //         1e-3) +
-    //     ")" + " total_time_ms(" +
-    //     std::to_string((profile_info.finish_time - profile_info.start_time) *
-    //                    1e-3) +
-    //     ")" + " LLM_decoding_steps(" +
-    //     std::to_string(profile_info.llm_decoding_steps) + ")";
-    // if (decoding_mode == SPECULATIVE_DECODING) {
-    //   str = str + " SSM_decoding_steps(" +
-    //         std::to_string(profile_info.ssm_decoding_steps) + ")";
-    // }
-    // write_to_output_file("", str);
-
-    trigger_request_completion_future(guid);
-  }
-
-  void RequestManager::update_inference_results(InferenceResult const &result) {
-    // Update the inference results
-    std::lock_guard<std::mutex> const rm_state_lock(rm_state_mutex);
-    std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
-
-    if (num_available_requests == 0) {
-      // Update nothing
-      if (!pending_request_queue.empty()) {
-        // Load the pending request to the batch
-        load_pending_request_to_batch();
-        request_manager_status = PREFILLING;
-        if (decoding_mode == SPECULATIVE_DECODING) {
-          prefill_model = SSM;
-          current_ssm_step = 0;
+    return;
+  }
+
+  switch (request_manager_status) {
+    case PREFILLING:
+      if (decoding_mode == INCREMENTAL_DECODING) {
+        if (update_llm_prefill_results(result)) {
+          // This indicates that the prefilling of the current request
+          // finishes Reset the prefill_request
+          prefill_request = nullptr;
+
+          // Check if there are more empty slots
+          if (num_available_requests < get_max_requests_per_batch() &&
+              !pending_request_queue.empty()) {
+            // Load the pending request to the batch
+            load_pending_request_to_batch();
+            request_manager_status = PREFILLING;
+          } else {
+            // No more empty slots, start the decoding
+            request_manager_status = DECODING;
+          }
         }
-      }
-      return;
-    }
+        // Not completed, continue prefilling
+      } else if (decoding_mode == SPECULATIVE_DECODING) {
+        if (prefill_model == SSM) {
+          // A single iteration contains max_tree_depth SSM steps and a single
+          // LLM step. To align with this structure, we have to create
+          // max_tree_depth - 1 empty SSM steps during the prefilling phase.
+          if (current_ssm_step == 0) {
+            update_ssm_prefill_results(result);
+          }
+          // Except for the first step, we do nothing.
+          current_ssm_step++;
 
-    switch (request_manager_status) {
-      case PREFILLING:
-        if (decoding_mode == INCREMENTAL_DECODING) {
+          if (current_ssm_step == get_max_tree_depth()) {
+            prefill_model = LLM;
+          }
+        } else if (prefill_model == LLM) {
           if (update_llm_prefill_results(result)) {
-            // This indicates that the prefilling of the current request
-            // finishes Reset the prefill_request
+            // This indicates that the prefilling phase finishes
             prefill_request = nullptr;
-
             // Check if there are more empty slots
             if (num_available_requests < get_max_requests_per_batch() &&
                 !pending_request_queue.empty()) {
               // Load the pending request to the batch
               load_pending_request_to_batch();
-              request_manager_status = PREFILLING;
-            } else {
-              // No more empty slots, start the decoding
-              request_manager_status = DECODING;
-            }
-          }
-          // Not completed, continue prefilling
-        } else if (decoding_mode == SPECULATIVE_DECODING) {
-          if (prefill_model == SSM) {
-            // A single iteration contains max_tree_depth SSM steps and a single
-            // LLM step. To align with this structure, we have to create
-            // max_tree_depth - 1 empty SSM steps during the prefilling phase.
-            if (current_ssm_step == 0) {
-              update_ssm_prefill_results(result);
-            }
-            // Except for the first step, we do nothing.
-            current_ssm_step++;
-
-            if (current_ssm_step == get_max_tree_depth()) {
-              prefill_model = LLM;
-            }
-          } else if (prefill_model == LLM) {
-            if (update_llm_prefill_results(result)) {
-              // This indicates that the prefilling phase finishes
-              prefill_request = nullptr;
-              // Check if there are more empty slots
-              if (num_available_requests < get_max_requests_per_batch() &&
-                  !pending_request_queue.empty()) {
-                // Load the pending request to the batch
-                load_pending_request_to_batch();
-                prefill_model = SSM;
-                current_ssm_step = 0;
-              } else {
-                // No more empty slots, start the speculation
-                request_manager_status = SSM_SPEC;
-                // Reset the prefill_request
-                current_ssm_step = 0;
-                ssm_completed = false;
-              }
-            } else {
-              // Not completed, start the next iteration of prefilling
               prefill_model = SSM;
               current_ssm_step = 0;
+            } else {
+              // No more empty slots, start the speculation
+              request_manager_status = SSM_SPEC;
+              // Reset the prefill_request
+              current_ssm_step = 0;
+              ssm_completed = false;
             }
           } else {
-            assert(false && "Invalid prefill model.");
-          }
-        } else {
-          assert(false && "Invalid inference mode.");
-        }
-        break;
-      case DECODING:
-        if (update_llm_decode_results(result)) {
-          // A request completed after the decode
-          if (pending_request_queue.empty()) {
-            // No pending request to process, continue the speculation
-            request_manager_status = DECODING;
-          } else {
-            request_manager_status = PREFILLING;
-            load_pending_request_to_batch();
-          }
-        }
-        break;
-      case LLM_VERIFY:
-        if (update_llm_verify_results(result)) {
-          // A request completed after the verification
-          if (pending_request_queue.empty()) {
-            // No pending request to process, continue the speculation
-            request_manager_status = SSM_SPEC;
-            current_ssm_step = 0;
-            ssm_completed = false;
-          } else {
-            request_manager_status = PREFILLING;
-            load_pending_request_to_batch();
+            // Not completed, start the next iteration of prefilling
             prefill_model = SSM;
             current_ssm_step = 0;
           }
         } else {
+          assert(false && "Invalid prefill model.");
+        }
+      } else {
+        assert(false && "Invalid inference mode.");
+      }
+      break;
+    case DECODING:
+      if (update_llm_decode_results(result)) {
+        // A request completed after the decode
+        if (pending_request_queue.empty()) {
+          // No pending request to process, continue the speculation
+          request_manager_status = DECODING;
+        } else {
+          request_manager_status = PREFILLING;
+          load_pending_request_to_batch();
+        }
+      }
+      break;
+    case LLM_VERIFY:
+      if (update_llm_verify_results(result)) {
+        // A request completed after the verification
+        if (pending_request_queue.empty()) {
+          // No pending request to process, continue the speculation
           request_manager_status = SSM_SPEC;
           current_ssm_step = 0;
           ssm_completed = false;
+        } else {
+          request_manager_status = PREFILLING;
+          load_pending_request_to_batch();
+          prefill_model = SSM;
+          current_ssm_step = 0;
         }
-        break;
-      case SSM_SPEC:
-        // Update current_ssm_step first because when we first call
-        // update_ssm_inference_results, there's already a step of small model
-        // inference
-        current_ssm_step++;
-        if (!ssm_completed) {
-          ssm_completed = update_ssm_inference_results(result);
-        }
-
-        if (current_ssm_step == get_max_tree_depth()) {
-          request_manager_status = LLM_VERIFY;
-        }
-        break;
-      default:
-        assert(false && "Invalid request manager status.");
-    }
-  }
-
-  bool RequestManager::update_llm_prefill_results(
-      InferenceResult const &result) {
-    bool prefill_completed = false;
-    if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
-      prefill_request->streaming_cache_info.commit_cache(
-          prefill_request->num_tokens_in_batch);
-      prefill_request->llm_cache_size =
-          prefill_request->streaming_cache_info.commit_len;
-    } else {
-      prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-    }
-    prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
-
-    if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
-      // Indicates that the LLM prefilling phase finishes
-      prefill_request->tokens.push_back(
-          result.token_ids[prefill_request->num_tokens_in_batch - 1]);
-      std::cout << std::endl;
-      std::cout << std::endl;
-      std::cout << std::endl;
-      std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
-                << std::endl;
-      std::cout << std::endl;
-      std::cout << std::endl;
-      std::cout << std::endl;
-      prefill_completed = true;
-
-      if (prefill_request->tokens.back() == eos_token_id) {
-        request_complete_clean_up(prefill_request->batch_index);
-      }
-
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        // Add the last token to the token tree
-        assert(prefill_request->committed_tokens.empty() &&
-               "The committed tokens should be empty.");
-        prefill_request->committed_tokens.push_back(
-            Request::CommittedToken{-1,
-                                    (int)prefill_request->tokens.size() - 1,
-                                    prefill_request->tokens.back()});
-        init_token_tree(prefill_request->guid);
-        add_root_to_spec_token_tree(prefill_request->guid,
-                                    prefill_request->tokens.back());
-        update_bitmask_prompt(prefill_request->guid, 1);
-      }
-    }
-
-    profiling_requests[prefill_request->guid].llm_prefilling_steps++;
-
-    return prefill_completed;
-  }
-
-  bool RequestManager::update_llm_decode_results(
-      InferenceResult const &result) {
-    bool request_completed = false;
-    int nb_requests_decoded = 0;
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        // Request in this slot is unavailable
-        continue;
-      }
-      int guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      request.llm_cache_size++;
-      if (streaming_cache) {
-        request.streaming_cache_info.commit_cache(1);
-        request.llm_cache_size = request.streaming_cache_info.commit_len;
       } else {
-        request.llm_cache_size++;
+        request_manager_status = SSM_SPEC;
+        current_ssm_step = 0;
+        ssm_completed = false;
       }
-      request.tokens.push_back(
-          result.token_ids[request.first_token_offset_in_batch]);
-
-      profiling_requests[guid].llm_decoding_steps++;
-      nb_requests_decoded++;
-      if (request.tokens.back() == eos_token_id or
-          request.tokens.size() >= get_max_sequence_length()) {
-        request_completed = true;
-        request_complete_clean_up(request_index);
+      break;
+    case SSM_SPEC:
+      // Update current_ssm_step first because when we first call
+      // update_ssm_inference_results, there's already a step of small model
+      // inference
+      current_ssm_step++;
+      if (!ssm_completed) {
+        ssm_completed = update_ssm_inference_results(result);
       }
 
-      if (verbose) {
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        std::cout << "Request " << guid << " tokens: " << std::endl
-                  << output << std::endl;
+      if (current_ssm_step == get_max_tree_depth()) {
+        request_manager_status = LLM_VERIFY;
       }
-    }
-    profiling.llm_step_times.push_back(
-        (Realm::Clock::current_time_in_microseconds() -
-         profiling.llm_step_start) *
-        1e-3);
-    profiling.requests_per_step.push_back(nb_requests_decoded);
-    profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
-    return request_completed;
+      break;
+    default:
+      assert(false && "Invalid request manager status.");
   }
+}
 
-  void RequestManager::update_ssm_prefill_results(
-      InferenceResult const &ssm_prefill_result) {
-    // This function is called by update_inference_results when the
-    // request_manager_status is PREFILLING and the prefill_model is SSM.
-    // There's no results to update, but we should update ssm_cache_size.
+bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
+  bool prefill_completed = false;
+  if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
+    prefill_request->streaming_cache_info.commit_cache(
+        prefill_request->num_tokens_in_batch);
+    prefill_request->llm_cache_size =
+        prefill_request->streaming_cache_info.commit_len;
+  } else {
+    prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
+  }
+  prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
+
+  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
+    // Indicates that the LLM prefilling phase finishes
+    prefill_request->tokens.push_back(
+        result.token_ids[prefill_request->num_tokens_in_batch - 1]);
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
+              << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << std::endl;
+    prefill_completed = true;
+
+    if (prefill_request->tokens.back() == eos_token_id) {
+      request_complete_clean_up(prefill_request->batch_index);
+    }
+
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      // Add the last token to the token tree
+      assert(prefill_request->committed_tokens.empty() &&
+             "The committed tokens should be empty.");
+      prefill_request->committed_tokens.push_back(
+          Request::CommittedToken{-1,
+                                  (int)prefill_request->tokens.size() - 1,
+                                  prefill_request->tokens.back()});
+      init_token_tree(prefill_request->guid);
+      add_root_to_spec_token_tree(prefill_request->guid,
+                                  prefill_request->tokens.back());
+      update_bitmask_prompt(prefill_request->guid, 1);
+    }
+  }
+
+  profiling_requests[prefill_request->guid].llm_prefilling_steps++;
+
+  return prefill_completed;
+}
+
+bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
+  bool request_completed = false;
+  int nb_requests_decoded = 0;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    request.llm_cache_size++;
     if (streaming_cache) {
-      prefill_request->streaming_cache_info.commit_cache(
-          prefill_request->num_tokens_in_batch);
-      prefill_request->ssm_cache_size =
-          prefill_request->streaming_cache_info.commit_len;
+      request.streaming_cache_info.commit_cache(1);
+      request.llm_cache_size = request.streaming_cache_info.commit_len;
     } else {
-      prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
+      request.llm_cache_size++;
     }
-    prefill_request->ssm_prefill_len += prefill_request->num_tokens_in_batch;
+    request.tokens.push_back(
+        result.token_ids[request.first_token_offset_in_batch]);
 
-    profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
-  }
+    profiling_requests[guid].llm_decoding_steps++;
+    nb_requests_decoded++;
+    if (request.tokens.back() == eos_token_id or
+        request.tokens.size() >= get_max_sequence_length()) {
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    }
 
-  BatchConfig RequestManager::prepare_next_batch() {
-    switch (request_manager_status) {
-      case PREFILLING:
-        if (decoding_mode == INCREMENTAL_DECODING) {
-          return prepare_llm_prefilling_batch();
-        } else if (decoding_mode == SPECULATIVE_DECODING) {
-          if (prefill_model == SSM) {
-            if (current_ssm_step == 0) {
-              return prepare_ssm_prefilling_batch();
-            } else {
-              // Return an empty batch config
-              return BatchConfig();
-            }
-          } else if (prefill_model == LLM) {
-            return prepare_llm_prefilling_batch();
+    if (verbose) {
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Request " << guid << " tokens: " << std::endl
+                << output << std::endl;
+    }
+  }
+  profiling.llm_step_times.push_back(
+      (Realm::Clock::current_time_in_microseconds() -
+       profiling.llm_step_start) *
+      1e-3);
+  profiling.requests_per_step.push_back(nb_requests_decoded);
+  profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
+  return request_completed;
+}
+
+void RequestManager::update_ssm_prefill_results(
+    InferenceResult const &ssm_prefill_result) {
+  // This function is called by update_inference_results when the
+  // request_manager_status is PREFILLING and the prefill_model is SSM.
+  // There's no results to update, but we should update ssm_cache_size.
+  if (streaming_cache) {
+    prefill_request->streaming_cache_info.commit_cache(
+        prefill_request->num_tokens_in_batch);
+    prefill_request->ssm_cache_size =
+        prefill_request->streaming_cache_info.commit_len;
+  } else {
+    prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
+  }
+  prefill_request->ssm_prefill_len += prefill_request->num_tokens_in_batch;
+
+  profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
+}
+
+BatchConfig RequestManager::prepare_next_batch() {
+  switch (request_manager_status) {
+    case PREFILLING:
+      if (decoding_mode == INCREMENTAL_DECODING) {
+        return prepare_llm_prefilling_batch();
+      } else if (decoding_mode == SPECULATIVE_DECODING) {
+        if (prefill_model == SSM) {
+          if (current_ssm_step == 0) {
+            return prepare_ssm_prefilling_batch();
           } else {
-            assert(false && "Invalid prefill model.");
+            // Return an empty batch config
+            return BatchConfig();
           }
+        } else if (prefill_model == LLM) {
+          return prepare_llm_prefilling_batch();
         } else {
-          assert(false && "Invalid inference mode.");
-        }
-        break;
-      case DECODING:
-        return prepare_decoding_batch();
-      case SSM_SPEC:
-        if (current_ssm_step == 0) {
-          return prepare_first_spec_batch_config();
-        } else if (!ssm_completed) {
-          return prepare_next_spec_batch_config();
-        } else {
-          // Return an empty batch config
-          return BatchConfig();
+          assert(false && "Invalid prefill model.");
         }
-      case LLM_VERIFY:
-        return prepare_verify_batch_config();
-      default:
-        std::cout << "Invalid request manager status: "
-                  << request_manager_status << std::endl;
-        assert(false);
-    }
+      } else {
+        assert(false && "Invalid inference mode.");
+      }
+      break;
+    case DECODING:
+      return prepare_decoding_batch();
+    case SSM_SPEC:
+      if (current_ssm_step == 0) {
+        return prepare_first_spec_batch_config();
+      } else if (!ssm_completed) {
+        return prepare_next_spec_batch_config();
+      } else {
+        // Return an empty batch config
+        return BatchConfig();
+      }
+    case LLM_VERIFY:
+      return prepare_verify_batch_config();
+    default:
+      std::cout << "Invalid request manager status: " << request_manager_status
+                << std::endl;
+      assert(false);
   }
+}
 
-  BatchConfig RequestManager::prepare_llm_prefilling_batch() {
-    // This function is called when the request_manager_status is PREFILLING,
-    // which means that there is a request in the prefilling phase.
-    // This function load its prefilling tokens, constructing a BatchConfig with
-    // only one request.
-    if (verbose) {
-      std::cout << "\n############### prepare_llm_prefilling_batch "
-                   "##############\n";
-    }
-    assert(prefill_request != nullptr &&
-           "No prefilling request to process in the prefilling phase.");
+BatchConfig RequestManager::prepare_llm_prefilling_batch() {
+  // This function is called when the request_manager_status is PREFILLING,
+  // which means that there is a request in the prefilling phase.
+  // This function load its prefilling tokens, constructing a BatchConfig with
+  // only one request.
+  if (verbose) {
+    std::cout << "\n############### prepare_llm_prefilling_batch "
+                 "##############\n";
+  }
+  assert(prefill_request != nullptr &&
+         "No prefilling request to process in the prefilling phase.");
 
-    BatchConfig bc;
-    if (decoding_mode == INCREMENTAL_DECODING) {
-      bc.inference_mode = InferenceMode::INC_DECODING_MODE;
-    } else if (decoding_mode == SPECULATIVE_DECODING) {
-      bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
-    }
-    bc.prompt_phase = true;
-    bc.request_available[prefill_request->batch_index] = true;
-    bc.num_available_requests = 1;
+  BatchConfig bc;
+  if (decoding_mode == INCREMENTAL_DECODING) {
+    bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  } else if (decoding_mode == SPECULATIVE_DECODING) {
+    bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
+  }
+  bc.prompt_phase = true;
+  bc.request_available[prefill_request->batch_index] = true;
+  bc.num_available_requests = 1;
 
-    int request_index = prefill_request->batch_index;
-    RequestGuid guid = guid_of_requests[request_index];
-    Request &request = all_requests[guid];
-    assert(request.status == Request::RUNNING);
+  int request_index = prefill_request->batch_index;
+  RequestGuid guid = guid_of_requests[request_index];
+  Request &request = all_requests[guid];
+  assert(request.status == Request::RUNNING);
 
-    // Request Info
-    bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-    bc.requestsInfo[request_index].first_token_index_in_request =
-        prefill_request->llm_cache_size;
-    int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
-                                       (int)prefill_request->tokens.size() -
-                                           prefill_request->llm_prefill_len);
-    bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
+  // Request Info
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->llm_cache_size;
+  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
+                                     (int)prefill_request->tokens.size() -
+                                         prefill_request->llm_prefill_len);
+  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
 
-    // Copy the streaming cache info
-    bc.streamingCacheInfo[request_index] =
-        prefill_request->streaming_cache_info;
+  // Copy the streaming cache info
+  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
 
-    prefill_request->first_token_offset_in_batch = 0;
-    prefill_request->num_tokens_in_batch = num_tokens_in_batch;
+  prefill_request->first_token_offset_in_batch = 0;
+  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
 
-    // Token Info
-    for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
-      int abs_idx = prefill_request->llm_cache_size + token_idx;
-      assert(abs_idx < prefill_request->tokens.size());
+  // Token Info
+  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
+    int abs_idx = prefill_request->llm_cache_size + token_idx;
+    assert(abs_idx < prefill_request->tokens.size());
 
-      bc.tokensInfo[token_idx].request_index = request_index;
-      bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-      bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-      bc.tokensInfo[token_idx].token_id =
-          prefill_request->tokens[prefill_request->llm_prefill_len + token_idx];
+    bc.tokensInfo[token_idx].request_index = request_index;
+    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+    bc.tokensInfo[token_idx].token_id =
+        prefill_request->tokens[prefill_request->llm_prefill_len + token_idx];
 
-      bc.num_tokens++;
-    }
+    bc.num_tokens++;
+  }
 
-    if (verbose) {
-      std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
-      bc.print();
-    }
-    return bc;
+  if (verbose) {
+    std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
+    bc.print();
   }
+  return bc;
+}
 
-  BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
-    // This function is called when the request_manager_status is PREFILLING,
-    // which means that there is a request in the prefilling phase.
-    // This function load its prefilling tokens, constructing a BatchConfig with
-    // only one request.
-    if (verbose) {
-      std::cout << "\n############### prepare_ssm_prefilling_batch "
-                   "##############\n";
-    }
-    assert(prefill_request != nullptr &&
-           "No prefilling request to process in the prefilling phase.");
-
-    BatchConfig bc;
-    bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
-    bc.prompt_phase = true;
-    // Only set the prefilling request to be available
-    bc.request_available[prefill_request->batch_index] = true;
-    bc.num_available_requests = 1;
-
-    int request_index = prefill_request->batch_index;
-    // Request Info
-    bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
+  // This function is called when the request_manager_status is PREFILLING,
+  // which means that there is a request in the prefilling phase.
+  // This function load its prefilling tokens, constructing a BatchConfig with
+  // only one request.
+  if (verbose) {
+    std::cout << "\n############### prepare_ssm_prefilling_batch "
+                 "##############\n";
+  }
+  assert(prefill_request != nullptr &&
+         "No prefilling request to process in the prefilling phase.");
+
+  BatchConfig bc;
+  bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
+  bc.prompt_phase = true;
+  // Only set the prefilling request to be available
+  bc.request_available[prefill_request->batch_index] = true;
+  bc.num_available_requests = 1;
+
+  int request_index = prefill_request->batch_index;
+  // Request Info
+  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
+  bc.requestsInfo[request_index].first_token_index_in_request =
+      prefill_request->ssm_cache_size;
+  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
+                                     (int)prefill_request->tokens.size() -
+                                         prefill_request->ssm_prefill_len);
+  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
+
+  // Copy the streaming cache info
+  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
+
+  prefill_request->first_token_offset_in_batch = 0;
+  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
+
+  // Token Info
+  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
+    int abs_idx = prefill_request->ssm_cache_size + token_idx;
+    assert(abs_idx < prefill_request->tokens.size());
+
+    bc.tokensInfo[token_idx].request_index = request_index;
+    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+    bc.tokensInfo[token_idx].token_id =
+        prefill_request->tokens[prefill_request->ssm_prefill_len + token_idx];
+
+    bc.num_tokens++;
+  }
+
+  if (verbose) {
+    std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl;
+    bc.print();
+  }
+  return bc;
+}
+
+BatchConfig RequestManager::prepare_decoding_batch() {
+  // This function is called when the request_manager_status is DECODING. It
+  // fills the last token of each request in the current batch to the
+  // BatchConfig for the LLM to decode.
+  if (verbose) {
+    std::cout << "\n############### prepare_decoding_batch "
+                 "##############\n";
+  }
+
+  BatchConfig bc;
+  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  bc.prompt_phase = false;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.num_available_requests = num_available_requests;
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       request_index++) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
+
+    // Per Request Info
     bc.requestsInfo[request_index].first_token_index_in_request =
-        prefill_request->ssm_cache_size;
-    int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
-                                       (int)prefill_request->tokens.size() -
-                                           prefill_request->ssm_prefill_len);
-    bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
+        request.llm_cache_size;
+    bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens;
+    bc.requestsInfo[request_index].num_tokens_in_batch = 1;
 
     // Copy the streaming cache info
-    bc.streamingCacheInfo[request_index] =
-        prefill_request->streaming_cache_info;
-
-    prefill_request->first_token_offset_in_batch = 0;
-    prefill_request->num_tokens_in_batch = num_tokens_in_batch;
+    bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
 
-    // Token Info
-    for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
-      int abs_idx = prefill_request->ssm_cache_size + token_idx;
-      assert(abs_idx < prefill_request->tokens.size());
+    request.first_token_offset_in_batch = bc.num_tokens;
+    request.num_tokens_in_batch = 1;
 
-      bc.tokensInfo[token_idx].request_index = request_index;
-      bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-      bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-      bc.tokensInfo[token_idx].token_id =
-          prefill_request->tokens[prefill_request->ssm_prefill_len + token_idx];
+    // Per Token Info
+    bc.tokensInfo[bc.num_tokens].request_index = request_index;
+    bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
 
-      bc.num_tokens++;
-    }
+    bc.num_tokens++;
 
-    if (verbose) {
-      std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl;
-      bc.print();
+    if (profiling_requests[request.guid].llm_decoding_steps == 0) {
+      profiling_requests[request.guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
     }
-    return bc;
   }
 
-  BatchConfig RequestManager::prepare_decoding_batch() {
-    // This function is called when the request_manager_status is DECODING. It
-    // fills the last token of each request in the current batch to the
-    // BatchConfig for the LLM to decode.
-    if (verbose) {
-      std::cout << "\n############### prepare_decoding_batch "
-                   "##############\n";
-    }
-
-    BatchConfig bc;
-    bc.inference_mode = InferenceMode::INC_DECODING_MODE;
-    bc.prompt_phase = false;
-    std::copy(std::begin(request_available),
-              std::end(request_available),
-              std::begin(bc.request_available));
-    bc.num_available_requests = num_available_requests;
-
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         request_index++) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      Request &request = all_requests[guid_of_requests[request_index]];
-      assert(request.status == Request::RUNNING);
-
-      // Per Request Info
-      bc.requestsInfo[request_index].first_token_index_in_request =
-          request.llm_cache_size;
-      bc.requestsInfo[request_index].first_token_offset_in_batch =
-          bc.num_tokens;
-      bc.requestsInfo[request_index].num_tokens_in_batch = 1;
-
-      // Copy the streaming cache info
-      bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
-
-      request.first_token_offset_in_batch = bc.num_tokens;
-      request.num_tokens_in_batch = 1;
-
-      // Per Token Info
-      bc.tokensInfo[bc.num_tokens].request_index = request_index;
-      bc.tokensInfo[bc.num_tokens].abs_index_in_request =
-          request.llm_cache_size;
-      bc.tokensInfo[bc.num_tokens].abs_depth_in_request =
-          request.llm_cache_size;
-      bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
-
-      bc.num_tokens++;
-
-      if (profiling_requests[request.guid].llm_decoding_steps == 0) {
-        profiling_requests[request.guid].start_decoding_time =
-            Realm::Clock::current_time_in_microseconds();
-      }
-    }
-
-    if (verbose) {
-      std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl;
-      bc.print();
-    }
-    profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
-    return bc;
+  if (verbose) {
+    std::cout << "prepare_decoding_batch NEW batchconfig:" << std::endl;
+    bc.print();
   }
-  /* ----- Speculative Inference Specific functions ----- */
-
-  /***** Request Init Phase *****/
-  BatchConfig RequestManager::prepare_first_spec_batch_config() {
-    if (verbose) {
-      std::cout << "\n############### prepare_first_spec_batch_config "
-                   "##############\n";
-    }
-    // This method does the following:
-    // 1. Commit the verified tokens through BatchConfig. The infomation
-    // of the committed tokens are stored in request.committed_tokens. Put the
-    // information of the committed tokens into BatchConfig.TokensInfo.
-    // 2. Maintain BatchConfig::RequestsInfo and all other fields of
-    // BatchConfig.
-    assert(current_ssm_step == 0);
-
-    BatchConfig new_bc;
-    new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
-    // Assume that only one small model is in use now
-    new_bc.prompt_phase = true;
-    std::copy(std::begin(request_available),
-              std::end(request_available),
-              std::begin(new_bc.request_available));
-    new_bc.num_available_requests = num_available_requests;
-
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-
-      std::vector<Request::CommittedToken> &committed_tokens =
-          request.committed_tokens;
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
+  return bc;
+}
+/* ----- Speculative Inference Specific functions ----- */
+
+/***** Request Init Phase *****/
+BatchConfig RequestManager::prepare_first_spec_batch_config() {
+  if (verbose) {
+    std::cout << "\n############### prepare_first_spec_batch_config "
+                 "##############\n";
+  }
+  // This method does the following:
+  // 1. Commit the verified tokens through BatchConfig. The infomation
+  // of the committed tokens are stored in request.committed_tokens. Put the
+  // information of the committed tokens into BatchConfig.TokensInfo.
+  // 2. Maintain BatchConfig::RequestsInfo and all other fields of
+  // BatchConfig.
+  assert(current_ssm_step == 0);
+
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
+  // Assume that only one small model is in use now
+  new_bc.prompt_phase = true;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-      // Maintain requestsInfo
-      new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-          new_bc.num_tokens;
-      new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.ssm_cache_size;
-
-      // Store committed tokens to tokensInfo
-      int num_committed_tokens = committed_tokens.size();
-      if (num_committed_tokens == 1) {
-        new_bc.requestsInfo[request_index].num_tokens_in_batch = 1;
-        // The case where the prefilling is just finished. Although the last
-        // token's kv cache is already there, the we need to decode the last
-        // token because it's the root of the token tree.
+    std::vector<Request::CommittedToken> &committed_tokens =
+        request.committed_tokens;
+
+    // Maintain requestsInfo
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].first_token_index_in_request =
+        request.ssm_cache_size;
+
+    // Store committed tokens to tokensInfo
+    int num_committed_tokens = committed_tokens.size();
+    if (num_committed_tokens == 1) {
+      new_bc.requestsInfo[request_index].num_tokens_in_batch = 1;
+      // The case where the prefilling is just finished. Although the last
+      // token's kv cache is already there, the we need to decode the last
+      // token because it's the root of the token tree.
+      new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+      if (streaming_cache) {
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            request.streaming_cache_info.global_2_cache_index(
+                committed_tokens[0].to_index);
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            request.streaming_cache_info.global_2_cache_index(
+                committed_tokens[0].to_index);
+      } else {
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            committed_tokens[0].to_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            committed_tokens[0].to_index;
+      }
+      new_bc.tokensInfo[new_bc.num_tokens].token_id =
+          committed_tokens[0].token_id;
+      new_bc.num_tokens++;
+    } else {
+      for (int committed_token_index = 1;
+           committed_token_index < committed_tokens.size();
+           committed_token_index++) {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
         if (streaming_cache) {
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               request.streaming_cache_info.global_2_cache_index(
-                  committed_tokens[0].to_index);
+                  committed_tokens[committed_token_index].to_index);
           new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
               request.streaming_cache_info.global_2_cache_index(
-                  committed_tokens[0].to_index);
+                  committed_tokens[committed_token_index].to_index);
         } else {
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              committed_tokens[0].to_index;
+              committed_tokens[committed_token_index].to_index;
           new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-              committed_tokens[0].to_index;
+              committed_tokens[committed_token_index].to_index;
         }
         new_bc.tokensInfo[new_bc.num_tokens].token_id =
-            committed_tokens[0].token_id;
+            committed_tokens[committed_token_index].token_id;
         new_bc.num_tokens++;
-      } else {
-        for (int committed_token_index = 1;
-             committed_token_index < committed_tokens.size();
-             committed_token_index++) {
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-          if (streaming_cache) {
-            new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-                request.streaming_cache_info.global_2_cache_index(
-                    committed_tokens[committed_token_index].to_index);
-            new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-                request.streaming_cache_info.global_2_cache_index(
-                    committed_tokens[committed_token_index].to_index);
-          } else {
-            new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-                committed_tokens[committed_token_index].to_index;
-            new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-                committed_tokens[committed_token_index].to_index;
-          }
-          new_bc.tokensInfo[new_bc.num_tokens].token_id =
-              committed_tokens[committed_token_index].token_id;
-          new_bc.num_tokens++;
-        }
-        new_bc.requestsInfo[request_index].num_tokens_in_batch =
-            num_committed_tokens - 1;
       }
+      new_bc.requestsInfo[request_index].num_tokens_in_batch =
+          num_committed_tokens - 1;
+    }
 
-      request.first_token_offset_in_batch =
-          new_bc.requestsInfo[request_index].first_token_offset_in_batch;
-      request.num_tokens_in_batch =
-          new_bc.requestsInfo[request_index].num_tokens_in_batch;
+    request.first_token_offset_in_batch =
+        new_bc.requestsInfo[request_index].first_token_offset_in_batch;
+    request.num_tokens_in_batch =
+        new_bc.requestsInfo[request_index].num_tokens_in_batch;
 
-      // Copy the causal mask, it should already been updated in
-      // update_llm_verify_results
-      new_bc.causalMask[request_index] = request.causal_mask;
-      if (streaming_cache) {
-        new_bc.causalMask[request_index].non_tree_cache_size =
-            request.ssm_cache_size - 1;
-      }
+    // Copy the causal mask, it should already been updated in
+    // update_llm_verify_results
+    new_bc.causalMask[request_index] = request.causal_mask;
+    if (streaming_cache) {
+      new_bc.causalMask[request_index].non_tree_cache_size =
+          request.ssm_cache_size - 1;
+    }
 
-      // Copy the streaming cache info
-      new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
 
-      if (profiling_requests[guid].ssm_decoding_steps == 0) {
-        profiling_requests[guid].start_decoding_time =
-            Realm::Clock::current_time_in_microseconds();
-      }
-      profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
-    }
-    if (verbose) {
-      std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
-                << std::endl;
-      new_bc.print();
+    if (profiling_requests[guid].ssm_decoding_steps == 0) {
+      profiling_requests[guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
     }
-    return new_bc;
+    profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
   }
+  if (verbose) {
+    std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
+              << std::endl;
+    new_bc.print();
+  }
+  return new_bc;
+}
 
-  /***** Speculative Decoding Phase *****/
-  BatchConfig RequestManager::prepare_next_spec_batch_config() {
-    if (verbose) {
-      std::cout << "\n############### prepare_next_spec_batch_config "
-                   "###############\n";
-      std::cout << "Current tree depth: " << current_ssm_step + 1 << "\n";
-    }
-
-    // Prepare the next batch for existing requests
-    BatchConfig new_bc;
-    new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
-    // We assume that only one small model is in use now
-    new_bc.model_id = 0;
-    std::copy(std::begin(request_available),
-              std::end(request_available),
-              std::begin(new_bc.request_available));
-    new_bc.num_available_requests = num_available_requests;
-
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      int guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-          new_bc.num_tokens;
-
-      // Fill in the tokens
-      TokenTree &token_tree =
-          request.speculative_token_trees.at(new_bc.model_id);
-      if (token_tree.tree_layers.size() <= current_ssm_step) {
-        // This request has no token to decode in this and the following small
-        // model inference steps
-        new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
-        // non_tree_cache_size = ssm_cache_size - 1
-        new_bc.requestsInfo[request_index].first_token_index_in_request =
-            request.ssm_cache_size - 1 +
-            request.causal_mask.tree_or_prompt_size -
-            request.causal_mask.current_layer_size;
-        request.num_tokens_in_batch = 0;
-        request.first_token_offset_in_batch = new_bc.num_tokens;
-        continue;
-      } else {
-        std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
-            token_tree.tree_layers.back();
-        // Exclude the current layer from the token tree, because we want the
-        // start index
-        // non_tree_cache_size = ssm_cache_size - 1
-        new_bc.requestsInfo[request_index].first_token_index_in_request =
-            request.ssm_cache_size - 1 +
-            request.causal_mask.tree_or_prompt_size -
-            request.causal_mask.current_layer_size;
-        new_bc.requestsInfo[request_index].num_tokens_in_batch =
-            request.causal_mask.current_layer_size;
-
-        request.num_tokens_in_batch =
-            new_bc.requestsInfo[request_index].num_tokens_in_batch;
-        request.first_token_offset_in_batch = new_bc.num_tokens;
-
-        int child_index = 0;
-        for (auto const &node_ptr : current_layer) {
-          new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              new_bc.requestsInfo[request_index].first_token_index_in_request +
-              child_index;
-          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-              request.ssm_cache_size - 1 + current_ssm_step;
-          new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
+/***** Speculative Decoding Phase *****/
+BatchConfig RequestManager::prepare_next_spec_batch_config() {
+  if (verbose) {
+    std::cout << "\n############### prepare_next_spec_batch_config "
+                 "###############\n";
+    std::cout << "Current tree depth: " << current_ssm_step + 1 << "\n";
+  }
+
+  // Prepare the next batch for existing requests
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
+  // We assume that only one small model is in use now
+  new_bc.model_id = 0;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
 
-          new_bc.num_tokens++;
-          child_index++;
-        }
-      }
+    // Fill in the tokens
+    TokenTree &token_tree = request.speculative_token_trees.at(new_bc.model_id);
+    if (token_tree.tree_layers.size() <= current_ssm_step) {
+      // This request has no token to decode in this and the following small
+      // model inference steps
+      new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+      // non_tree_cache_size = ssm_cache_size - 1
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size -
+          request.causal_mask.current_layer_size;
+      request.num_tokens_in_batch = 0;
+      request.first_token_offset_in_batch = new_bc.num_tokens;
+      continue;
+    } else {
+      std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
+          token_tree.tree_layers.back();
+      // Exclude the current layer from the token tree, because we want the
+      // start index
+      // non_tree_cache_size = ssm_cache_size - 1
+      new_bc.requestsInfo[request_index].first_token_index_in_request =
+          request.ssm_cache_size - 1 + request.causal_mask.tree_or_prompt_size -
+          request.causal_mask.current_layer_size;
+      new_bc.requestsInfo[request_index].num_tokens_in_batch =
+          request.causal_mask.current_layer_size;
 
-      // Copy the causal mask, it should already been updated by
-      // update_ssm_inference_results
-      new_bc.causalMask[request_index] = request.causal_mask;
-      if (streaming_cache) {
-        new_bc.causalMask[request_index].non_tree_cache_size =
-            request.ssm_cache_size - 1;
-      }
+      request.num_tokens_in_batch =
+          new_bc.requestsInfo[request_index].num_tokens_in_batch;
+      request.first_token_offset_in_batch = new_bc.num_tokens;
 
-      // Copy the streaming cache info
-      new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+      int child_index = 0;
+      for (auto const &node_ptr : current_layer) {
+        new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+            new_bc.requestsInfo[request_index].first_token_index_in_request +
+            child_index;
+        new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+            request.ssm_cache_size - 1 + current_ssm_step;
+        new_bc.tokensInfo[new_bc.num_tokens].token_id = node_ptr->id;
+
+        new_bc.num_tokens++;
+        child_index++;
+      }
     }
 
-    if (verbose) {
-      std::cout << "prepare_next_spec_batch_config NEW batchconfig:"
-                << std::endl;
-      new_bc.print();
+    // Copy the causal mask, it should already been updated by
+    // update_ssm_inference_results
+    new_bc.causalMask[request_index] = request.causal_mask;
+    if (streaming_cache) {
+      new_bc.causalMask[request_index].non_tree_cache_size =
+          request.ssm_cache_size - 1;
     }
-    return new_bc;
-  }
 
-  /***** Verify Phase *****/
-  BatchConfig RequestManager::prepare_verify_batch_config() {
-    if (verbose) {
-      std::cout
-          << "\n############### prepare_verify_batch_config ###############\n";
-    }
-    // This method does the following:
-    // 1. Commit the verified tokens in the last iteration through the
-    // BatchConfig. We can do this request by request.
-    // The information of the committed tokens is stored in
-    // Request.llm_committed_tokens. Put the information of the committed tokens
-    // into BatchConfig.committed_tokens.
-    // 2. Load the tokens on the token tree that are not yet pruned to
-    // BatchConfig.tokensInfo. Be careful with the abs_depth etc.
-    // (skip the pruned tokens).
-    // 3. Create the causal mask for the large model based on the small model
-    // causal mask (call create_llm_bitmask()).
-    // 4. Maintain BatchConfig::RequestsInfo and all other fields of
-    // BatchConfig.
-    // Please refer to the implementation of prepare_next_spec_batch_config()
-    // for more details.
-    BatchConfig new_bc;
-    new_bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
-    std::copy(std::begin(request_available),
-              std::end(request_available),
-              std::begin(new_bc.request_available));
-    new_bc.num_available_requests = num_available_requests;
-
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      int guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+  }
 
-      // 1. Maintain requestsInfo
-      new_bc.requestsInfo[request_index].first_token_index_in_request =
-          request.tokens.size() - 1; // Exclude the last token
-      new_bc.requestsInfo[request_index].first_token_offset_in_batch =
-          new_bc.num_tokens;
-      new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+  if (verbose) {
+    std::cout << "prepare_next_spec_batch_config NEW batchconfig:" << std::endl;
+    new_bc.print();
+  }
+  return new_bc;
+}
 
-      // Put the information of the committed tokens into
-      // BatchConfig.committed_tokens.
-      // Note here, we shouldn't put the last token in request.committed_tokens
-      // into new_bc. Because the LLM don't have that token's KV cache.
-      std::vector<Request::CommittedToken> &committed_tokens =
-          request.committed_tokens;
-      for (int committed_token_index = 0;
-           committed_token_index < committed_tokens.size() - 1;
-           committed_token_index++) {
-        Request::CommittedToken &committed_token =
-            committed_tokens.at(committed_token_index);
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
-            request_index;
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
-            committed_token.from_index;
-        new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-            committed_token.to_index;
-        new_bc.num_tokens_to_commit++;
-      }
+/***** Verify Phase *****/
+BatchConfig RequestManager::prepare_verify_batch_config() {
+  if (verbose) {
+    std::cout
+        << "\n############### prepare_verify_batch_config ###############\n";
+  }
+  // This method does the following:
+  // 1. Commit the verified tokens in the last iteration through the
+  // BatchConfig. We can do this request by request.
+  // The information of the committed tokens is stored in
+  // Request.llm_committed_tokens. Put the information of the committed tokens
+  // into BatchConfig.committed_tokens.
+  // 2. Load the tokens on the token tree that are not yet pruned to
+  // BatchConfig.tokensInfo. Be careful with the abs_depth etc.
+  // (skip the pruned tokens).
+  // 3. Create the causal mask for the large model based on the small model
+  // causal mask (call create_llm_bitmask()).
+  // 4. Maintain BatchConfig::RequestsInfo and all other fields of
+  // BatchConfig.
+  // Please refer to the implementation of prepare_next_spec_batch_config()
+  // for more details.
+  BatchConfig new_bc;
+  new_bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(new_bc.request_available));
+  new_bc.num_available_requests = num_available_requests;
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-      // Load the tokens on the token tree that are not yet pruned to
-      // BatchConfig.tokensInfo.
-      TokenTree &token_tree = request.speculative_token_trees[0];
-      int token_tree_index = 0;
-      int layer_index = 0;
-      for (auto const &tree_layer : token_tree.tree_layers) {
-        for (auto const &tree_node : tree_layer) {
-          if (tree_node->included == true) {
-            new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
-            new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-                request.tokens.size() - 1 + token_tree_index;
-            new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-                request.tokens.size() - 1 + layer_index;
-            new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
-            new_bc.num_tokens++;
-            token_tree_index++;
-          }
+    // 1. Maintain requestsInfo
+    new_bc.requestsInfo[request_index].first_token_index_in_request =
+        request.tokens.size() - 1; // Exclude the last token
+    new_bc.requestsInfo[request_index].first_token_offset_in_batch =
+        new_bc.num_tokens;
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = 0;
+
+    // Put the information of the committed tokens into
+    // BatchConfig.committed_tokens.
+    // Note here, we shouldn't put the last token in request.committed_tokens
+    // into new_bc. Because the LLM don't have that token's KV cache.
+    std::vector<Request::CommittedToken> &committed_tokens =
+        request.committed_tokens;
+    for (int committed_token_index = 0;
+         committed_token_index < committed_tokens.size() - 1;
+         committed_token_index++) {
+      Request::CommittedToken &committed_token =
+          committed_tokens.at(committed_token_index);
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
+          request_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
+          committed_token.from_index;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+          committed_token.to_index;
+      new_bc.num_tokens_to_commit++;
+    }
+
+    // Load the tokens on the token tree that are not yet pruned to
+    // BatchConfig.tokensInfo.
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    int token_tree_index = 0;
+    int layer_index = 0;
+    for (auto const &tree_layer : token_tree.tree_layers) {
+      for (auto const &tree_node : tree_layer) {
+        if (tree_node->included == true) {
+          new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
+              request.tokens.size() - 1 + token_tree_index;
+          new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
+              request.tokens.size() - 1 + layer_index;
+          new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
+          new_bc.num_tokens++;
+          token_tree_index++;
         }
-        layer_index++;
       }
-      new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
-
-      request.first_token_offset_in_batch =
-          new_bc.num_tokens - token_tree_index;
-      request.num_tokens_in_batch = token_tree_index;
-
-      // Create the causal mask for the large model based on the small model
-      // causal mask.
-      new_bc.causalMask[request_index] = create_llm_bitmask(guid);
-
-      // Copy the streaming cache info
-      new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+      layer_index++;
     }
+    new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
-    if (verbose) {
-      std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl;
-      new_bc.print();
-    }
-    profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
-    return new_bc;
-  }
-
-  bool RequestManager::update_llm_verify_results(
-      InferenceResult const &llm_verify_result) {
-    // We may have two types of InferenceResults, one is the results from
-    // sampling the large model, the other is the top-p / top-k logits of the
-    // large model, we can first implement the former one. For the latter one,
-    // we have to add a CPU based verify function.
-
-    // Compare the results returned from the LLM and compare them with the
-    // SSM's speculative token tree. For the greedy construction of the
-    // speculative token tree, we can simply compare LLM's sample result at each
-    // token, this is implemented in get_verify_results_greedy(). This function
-    // stores the commmitted tokens into the corresponding fields in the
-    // Request. For the sampling construction of the speculative token tree, we
-    // need to implement a CPU based verify function.
-
-    // Update llm_cache_size with the last committed_tokens, and clear
-    // committed_tokens
-    int nb_requests_decoded = 0;
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        // Request in this slot is unavailable
-        continue;
-      }
-      int guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      request.llm_cache_size += request.committed_tokens.size() - 1;
-      request.committed_tokens.clear();
+    request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
+    request.num_tokens_in_batch = token_tree_index;
 
-      profiling_requests[guid].llm_decoding_steps++;
-      nb_requests_decoded++;
-    }
+    // Create the causal mask for the large model based on the small model
+    // causal mask.
+    new_bc.causalMask[request_index] = create_llm_bitmask(guid);
 
-    // Process the LLM results greedily
-    if (speculative_sampling) {
-      get_verify_results_sample(llm_verify_result);
-    } else {
-      get_verify_results_greedy(llm_verify_result);
-    }
+    // Copy the streaming cache info
+    new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+  }
 
-    long long int current_time = Realm::Clock::current_time_in_microseconds();
-    profiling.llm_step_times.push_back(
-        (current_time - profiling.llm_step_start) * 1e-3);
-    profiling.requests_per_step.push_back(nb_requests_decoded);
+  if (verbose) {
+    std::cout << "prepare_verify_batch_config NEW batchconfig:" << std::endl;
+    new_bc.print();
+  }
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
+  return new_bc;
+}
 
-    bool request_completed = false;
+bool RequestManager::update_llm_verify_results(
+    InferenceResult const &llm_verify_result) {
+  // We may have two types of InferenceResults, one is the results from
+  // sampling the large model, the other is the top-p / top-k logits of the
+  // large model, we can first implement the former one. For the latter one,
+  // we have to add a CPU based verify function.
+
+  // Compare the results returned from the LLM and compare them with the
+  // SSM's speculative token tree. For the greedy construction of the
+  // speculative token tree, we can simply compare LLM's sample result at each
+  // token, this is implemented in get_verify_results_greedy(). This function
+  // stores the commmitted tokens into the corresponding fields in the
+  // Request. For the sampling construction of the speculative token tree, we
+  // need to implement a CPU based verify function.
+
+  // Update llm_cache_size with the last committed_tokens, and clear
+  // committed_tokens
+  int nb_requests_decoded = 0;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    request.llm_cache_size += request.committed_tokens.size() - 1;
+    request.committed_tokens.clear();
 
-    // Iterate over the requests
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        // Request in this slot is unavailable
-        continue;
-      }
-      int guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      if (verbose) {
-        std::cout << "Request " << guid << " token tree: " << std::endl;
-        std::cout << request.speculative_token_trees[0];
-      }
+    profiling_requests[guid].llm_decoding_steps++;
+    nb_requests_decoded++;
+  }
 
-      request.decode_latency_ms =
-          (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
+  // Process the LLM results greedily
+  if (speculative_sampling) {
+    get_verify_results_sample(llm_verify_result);
+  } else {
+    get_verify_results_greedy(llm_verify_result);
+  }
 
-      // Initialize the token tree for the request
-      init_token_tree(guid);
-      assert(!request.committed_tokens.empty() &&
-             "The committed tokens should not be empty.");
-      // Add the last committed token as the root of the speculative token tree
-      add_root_to_spec_token_tree(guid,
-                                  request.committed_tokens.back().token_id);
+  long long int current_time = Realm::Clock::current_time_in_microseconds();
+  profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) *
+                                     1e-3);
+  profiling.requests_per_step.push_back(nb_requests_decoded);
 
-      // Check if the request is completed. If its completed, clean up the
-      // metainfo stored in the RequestManager. Otherwise, update its bitmask.
-      bool eos_token_found = false;
-      for (auto const &committed_token : request.committed_tokens) {
-        if (committed_token.token_id == eos_token_id) {
-          eos_token_found = true;
-          break;
-        }
-      }
-      if (eos_token_found or
-          request.tokens.size() >= get_max_sequence_length()) {
-        // Request is completed
-        request_completed = true;
-        request_complete_clean_up(request_index);
-      } else if (request.decode_latency_ms > request.tokens.size() *
-                                                 baseline_latency_ms *
-                                                 request.get_slo_ratio()) {
-        // The request violates the SLO, drop that request
-        request_completed = true;
-        request_complete_clean_up(request_index);
-      } else {
-        update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
+  bool request_completed = false;
+
+  // Iterate over the requests
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    if (verbose) {
+      std::cout << "Request " << guid << " token tree: " << std::endl;
+      std::cout << request.speculative_token_trees[0];
+    }
+
+    request.decode_latency_ms =
+        (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
+
+    // Initialize the token tree for the request
+    init_token_tree(guid);
+    assert(!request.committed_tokens.empty() &&
+           "The committed tokens should not be empty.");
+    // Add the last committed token as the root of the speculative token tree
+    add_root_to_spec_token_tree(guid, request.committed_tokens.back().token_id);
+
+    // Check if the request is completed. If its completed, clean up the
+    // metainfo stored in the RequestManager. Otherwise, update its bitmask.
+    bool eos_token_found = false;
+    for (auto const &committed_token : request.committed_tokens) {
+      if (committed_token.token_id == eos_token_id) {
+        eos_token_found = true;
+        break;
       }
     }
-
-    // Some requests may be completed after appending the verified tokens.
-    // If there is a request completed, return true.
-    return request_completed;
+    if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
+      // Request is completed
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    } else if (request.decode_latency_ms > request.tokens.size() *
+                                               baseline_latency_ms *
+                                               request.get_slo_ratio()) {
+      // The request violates the SLO, drop that request
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    } else {
+      update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
+    }
   }
 
-  bool RequestManager::update_ssm_inference_results(
-      InferenceResult const &ssm_inference_result) {
-    // This function returns true if no tokens are added to the token tree,
-    // which indicates that the ssm inference phase is done.
-    assert(current_ssm_step >= 1 &&
-           "The current speculation step should be no less than 1");
+  // Some requests may be completed after appending the verified tokens.
+  // If there is a request completed, return true.
+  return request_completed;
+}
 
-    // Here we assume that the order of the tokens in the last
-    // BatchConfig and hence the last InferenceResult is equal to
-    // the order of the request in the last BatchConfig
-    add_tokens_to_spec_token_tree(ssm_inference_result);
+bool RequestManager::update_ssm_inference_results(
+    InferenceResult const &ssm_inference_result) {
+  // This function returns true if no tokens are added to the token tree,
+  // which indicates that the ssm inference phase is done.
+  assert(current_ssm_step >= 1 &&
+         "The current speculation step should be no less than 1");
 
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        // Request in this slot is unavailable
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
+  // Here we assume that the order of the tokens in the last
+  // BatchConfig and hence the last InferenceResult is equal to
+  // the order of the request in the last BatchConfig
+  add_tokens_to_spec_token_tree(ssm_inference_result);
 
-      if (current_ssm_step == 1) {
-        if (streaming_cache) {
-          request.streaming_cache_info.commit_cache(
-              request.num_tokens_in_batch);
-          request.ssm_cache_size = request.streaming_cache_info.commit_len;
-        } else {
-          request.ssm_cache_size = request.tokens.size();
-        }
-      }
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
 
-      if (current_ssm_step == 1) {
-        init_bitmask_spec(guid);
+    if (current_ssm_step == 1) {
+      if (streaming_cache) {
+        request.streaming_cache_info.commit_cache(request.num_tokens_in_batch);
+        request.ssm_cache_size = request.streaming_cache_info.commit_len;
+      } else {
+        request.ssm_cache_size = request.tokens.size();
       }
-      append_bitmask(guid);
-
-      profiling_requests[guid].ssm_decoding_steps++;
     }
 
-    // Stop conditions
-    if (current_ssm_step == get_max_tree_depth()) {
-      // Prune the token tree at the last step
-      prune_token_tree();
-      // Update profiling statistics before returning
-      profiling.ssm_step_times.push_back(
-          (Realm::Clock::current_time_in_microseconds() -
-           profiling.ssm_step_start) *
-          1e-3);
-      profiling.ssm_steps.push_back(current_ssm_step);
-      return true;
+    if (current_ssm_step == 1) {
+      init_bitmask_spec(guid);
     }
-    return false;
+    append_bitmask(guid);
+
+    profiling_requests[guid].ssm_decoding_steps++;
   }
 
-  /* --------- Bitmask Related Functions --------- */
+  // Stop conditions
+  if (current_ssm_step == get_max_tree_depth()) {
+    // Prune the token tree at the last step
+    prune_token_tree();
+    // Update profiling statistics before returning
+    profiling.ssm_step_times.push_back(
+        (Realm::Clock::current_time_in_microseconds() -
+         profiling.ssm_step_start) *
+        1e-3);
+    profiling.ssm_steps.push_back(current_ssm_step);
+    return true;
+  }
+  return false;
+}
 
-  void RequestManager::init_bitmask_prompt(RequestGuid guid,
-                                           int prompt_length) {
-    // This method is called by load_pending_request_to_batch when there is a
-    // new request to load into the batch
-    Request &request = all_requests[guid];
-    BatchConfig::BitMask &bitmask = request.causal_mask;
-
-    // Clear because the prompt kernel doesn't use mask
-    bitmask.clear_bitmask();
-    // Set the info for the mask which is used to store the KV cache
-    bitmask.tree_or_prompt_size = prompt_length;
-    bitmask.current_layer_size = prompt_length;
-    bitmask.non_tree_cache_size = 0;
-  }
-
-  void RequestManager::update_bitmask_prompt(RequestGuid guid,
-                                             int num_committed_tokens) {
-    // This method modifies the bitmask in place
-    // This method is called by update_llm_verify_results
-    // 1. Clear the causal mask because the first SSM inference uses the prompt
-    // kernel and it doesn't use mask.
-    // 2. Maintain all other fields.
-    Request &request = all_requests[guid];
-    BatchConfig::BitMask &bitmask = request.causal_mask;
-    // Clear because the prompt kernel doesn't use mask
-    bitmask.clear_bitmask();
-    bitmask.tree_or_prompt_size = num_committed_tokens;
-    bitmask.current_layer_size = num_committed_tokens;
-
-    // If the request just finishes the prefilling phase, we need to set the
-    // non_tree_cache_size to the size of the prompt
-    if (bitmask.non_tree_cache_size == 0) {
-      bitmask.non_tree_cache_size =
-          request.tokens.size() - num_committed_tokens;
-    }
-  }
-
-  void RequestManager::init_bitmask_spec(RequestGuid guid) {
-    // This method modifies the bitmask in place
-    // This method is called by the first call of update_ssm_inference_results
-    // in a speculative iteration CAUTION: You should still call
-    // append_bitmask() after this method
-    // 1. Clear the causal mask and add a root into it, because the tree is
-    // currently empty but we have a root.
-    // 2. Maintain all other fields.
-    assert(current_ssm_step == 1 && "The current speculation step should be 1");
-    Request &request = all_requests[guid];
-    request.causal_mask = BatchConfig::BitMask();
-    // Set the mask for the root
-    request.causal_mask.bit_mask[0].set_bit(0);
-    request.causal_mask.tree_or_prompt_size = 1;
-    request.causal_mask.non_tree_cache_size = request.tokens.size() - 1;
-    request.causal_mask.current_layer_size = 1;
+/* --------- Bitmask Related Functions --------- */
+
+void RequestManager::init_bitmask_prompt(RequestGuid guid, int prompt_length) {
+  // This method is called by load_pending_request_to_batch when there is a
+  // new request to load into the batch
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+
+  // Clear because the prompt kernel doesn't use mask
+  bitmask.clear_bitmask();
+  // Set the info for the mask which is used to store the KV cache
+  bitmask.tree_or_prompt_size = prompt_length;
+  bitmask.current_layer_size = prompt_length;
+  bitmask.non_tree_cache_size = 0;
+}
+
+void RequestManager::update_bitmask_prompt(RequestGuid guid,
+                                           int num_committed_tokens) {
+  // This method modifies the bitmask in place
+  // This method is called by update_llm_verify_results
+  // 1. Clear the causal mask because the first SSM inference uses the prompt
+  // kernel and it doesn't use mask.
+  // 2. Maintain all other fields.
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+  // Clear because the prompt kernel doesn't use mask
+  bitmask.clear_bitmask();
+  bitmask.tree_or_prompt_size = num_committed_tokens;
+  bitmask.current_layer_size = num_committed_tokens;
+
+  // If the request just finishes the prefilling phase, we need to set the
+  // non_tree_cache_size to the size of the prompt
+  if (bitmask.non_tree_cache_size == 0) {
+    bitmask.non_tree_cache_size = request.tokens.size() - num_committed_tokens;
   }
+}
 
-  void RequestManager::append_bitmask(RequestGuid guid) {
-    // This method changes the bitmask in place
-    // This method is called by update_ssm_inference_results(), after the new
-    // tokens are added to the token tree
-    assert(current_ssm_step >= 1 &&
-           "The current speculation step should be no less than 1");
+void RequestManager::init_bitmask_spec(RequestGuid guid) {
+  // This method modifies the bitmask in place
+  // This method is called by the first call of update_ssm_inference_results
+  // in a speculative iteration CAUTION: You should still call
+  // append_bitmask() after this method
+  // 1. Clear the causal mask and add a root into it, because the tree is
+  // currently empty but we have a root.
+  // 2. Maintain all other fields.
+  assert(current_ssm_step == 1 && "The current speculation step should be 1");
+  Request &request = all_requests[guid];
+  request.causal_mask = BatchConfig::BitMask();
+  // Set the mask for the root
+  request.causal_mask.bit_mask[0].set_bit(0);
+  request.causal_mask.tree_or_prompt_size = 1;
+  request.causal_mask.non_tree_cache_size = request.tokens.size() - 1;
+  request.causal_mask.current_layer_size = 1;
+}
 
-    Request &request = all_requests[guid];
-    BatchConfig::BitMask &bitmask = request.causal_mask;
-    TokenTree &token_tree = request.speculative_token_trees[0];
+void RequestManager::append_bitmask(RequestGuid guid) {
+  // This method changes the bitmask in place
+  // This method is called by update_ssm_inference_results(), after the new
+  // tokens are added to the token tree
+  assert(current_ssm_step >= 1 &&
+         "The current speculation step should be no less than 1");
+
+  Request &request = all_requests[guid];
+  BatchConfig::BitMask &bitmask = request.causal_mask;
+  TokenTree &token_tree = request.speculative_token_trees[0];
+
+  if (token_tree.tree_layers.size() <= current_ssm_step) {
+    // This request has no token added in this and the following small model
+    // inference steps, skip it
+    return;
+  }
+  std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
+      request.speculative_token_trees[0].tree_layers.back();
+  int new_layer_size = tree_layer.size();
+  int last_layer_size = bitmask.current_layer_size;
+  int previous_tree_size = bitmask.tree_or_prompt_size;
+  bitmask.current_layer_size = new_layer_size;
+  bitmask.tree_or_prompt_size += new_layer_size;
+
+  assert(bitmask.tree_or_prompt_size <= get_max_spec_tree_token_num());
+
+  int parent_offset = previous_tree_size - last_layer_size;
+  int child_offset = previous_tree_size;
+
+  int child_idx = 0;
+  for (auto const &child_ptr : tree_layer) {
+    // Each child copy its parent's mask
+    bitmask.bit_mask[child_offset + child_idx] =
+        bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
+    // Each child attend to itself
+    bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
+                                                       child_idx);
+    child_idx++;
+  }
+}
 
-    if (token_tree.tree_layers.size() <= current_ssm_step) {
-      // This request has no token added in this and the following small model
-      // inference steps, skip it
-      return;
+BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
+  // This method creates a new bitmask for LLM verification model's bitmask,
+  // it does not modify the small model's bitmask This method is called by
+  // prepare_verify_batch_config().
+
+  Request &request = all_requests[guid];
+  TokenTree &token_tree = request.speculative_token_trees[0];
+  BatchConfig::BitMask llm_bitmask = BatchConfig::BitMask();
+
+  int abs_index_in_tree = 0;
+  std::vector<int> parent_pos_2_abs_index;
+  std::vector<int> current_layer_abs_index;
+  for (auto const &tree_layer : token_tree.tree_layers) {
+    for (auto const &tree_node : tree_layer) {
+      current_layer_abs_index.push_back(abs_index_in_tree);
+      if (tree_node->included == true) {
+        if (abs_index_in_tree == 0) {
+          // The root token, set itself
+          llm_bitmask.bit_mask[0].set_bit(0);
+        } else {
+          // Copy from the parent, and set itself
+          int parent_abs_index = parent_pos_2_abs_index[tree_node->parent_pos];
+          llm_bitmask.bit_mask[abs_index_in_tree] =
+              llm_bitmask.bit_mask[parent_abs_index];
+          llm_bitmask.bit_mask[abs_index_in_tree].set_bit(abs_index_in_tree);
+        }
+        abs_index_in_tree++;
+      }
     }
-    std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
-        request.speculative_token_trees[0].tree_layers.back();
-    int new_layer_size = tree_layer.size();
-    int last_layer_size = bitmask.current_layer_size;
-    int previous_tree_size = bitmask.tree_or_prompt_size;
-    bitmask.current_layer_size = new_layer_size;
-    bitmask.tree_or_prompt_size += new_layer_size;
-
-    assert(bitmask.tree_or_prompt_size <= get_max_spec_tree_token_num());
+    parent_pos_2_abs_index.clear();
+    parent_pos_2_abs_index.swap(current_layer_abs_index);
+  }
 
-    int parent_offset = previous_tree_size - last_layer_size;
-    int child_offset = previous_tree_size;
+  // Maintain other fields of llm_bitmask
+  llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size;
+  // We don't need to set llm_bitmask.current_layer_size and
+  // llm_bitmask.tree_or_prompt_size here because they are not used in LLM
+  // verification.
+  return llm_bitmask;
+}
+/* --------- Bitmask Related Functions --------- */
+void RequestManager::gumbel_conditioned_on_max(
+    float target_max, std::vector<std::pair<float, int>> &logits) {
+  // Assume the logits are sorted in descending order
+  if (logits.size() == 0) {
+    return;
+  }
+  float max_logit = logits[0].first;
+  for (auto &logit_n_idx : logits) {
+    logit_n_idx.first =
+        -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first));
+  }
+}
 
-    int child_idx = 0;
-    for (auto const &child_ptr : tree_layer) {
-      // Each child copy its parent's mask
-      bitmask.bit_mask[child_offset + child_idx] =
-          bitmask.bit_mask[parent_offset + child_ptr->parent_pos];
-      // Each child attend to itself
-      bitmask.bit_mask[child_offset + child_idx].set_bit(child_offset +
-                                                         child_idx);
-      child_idx++;
+void RequestManager::renormalize(std::vector<std::pair<TokenId, float>> &D,
+                                 std::unordered_map<TokenId, float> &R,
+                                 TokenId token_id) {
+  float token_prob;
+  for (auto &kv : D) {
+    TokenId d_token_id = kv.first;
+    float d_prob = kv.second;
+    if (R.find(d_token_id) != R.end()) {
+      float r_prob = R[d_token_id];
+      R[d_token_id] = max(0.0f, r_prob - d_prob);
     }
+    if (d_token_id == token_id) {
+      token_prob = d_prob;
+      kv.second = 0.0f;
+    }
+  }
+  // Normalize R
+  float sum_r = 0.0f;
+  for (auto &kv : R) {
+    sum_r += kv.second;
+  }
+  for (auto &kv : R) {
+    kv.second /= (sum_r + 1e-6);
   }
+  // Normalize D
+  for (auto &kv : D) {
+    kv.second /= (1.0f - token_prob - 1e-6);
+  }
+}
 
-  BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
-    // This method creates a new bitmask for LLM verification model's bitmask,
-    // it does not modify the small model's bitmask This method is called by
-    // prepare_verify_batch_config().
+std::tuple<int, BatchConfig::TokenId, bool>
+    RequestManager::reject_sampling(std::vector<std::pair<TokenId, float>> &D,
+                                    std::unordered_map<TokenId, float> &R,
+                                    int k) {
+  assert(D.size() == k);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  double r;
+  for (int i = 0; i < k; ++i) {
+    // Generate a random number in the range [0, 1)
+    r = dis(gen);
+    double d_prob = (double)D[i].second;
+    if (R.find(D[i].first) != R.end()) {
+      double r_prob = (double)R[D[i].first];
+      if (r < d_prob / d_prob + 1e-6) {
+        return {i, D[i].first, true};
+      }
+    }
+    // else, r_prob = 0.0, reject the token
+    renormalize(D, R, D[i].first);
+  }
+  std::vector<double> r_probs;
+  std::vector<BatchConfig::TokenId> r_tokens;
+  for (auto &kv : R) {
+    r_probs.push_back(kv.second);
+    r_tokens.push_back(kv.first);
+  }
+  std::discrete_distribution<> r_dist(r_probs.begin(), r_probs.end());
+  int sampled_index = r_dist(gen);
+  return {-1, r_tokens[sampled_index], false};
+}
 
+void RequestManager::get_verify_results_sample(
+    InferenceResult const &llm_verify_result) {
+  // This function maintain the generated token list of the request and the
+  // committed tokens.
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
-    TokenTree &token_tree = request.speculative_token_trees[0];
-    BatchConfig::BitMask llm_bitmask = BatchConfig::BitMask();
+    assert(request.status == Request::RUNNING);
 
-    int abs_index_in_tree = 0;
-    std::vector<int> parent_pos_2_abs_index;
-    std::vector<int> current_layer_abs_index;
-    for (auto const &tree_layer : token_tree.tree_layers) {
-      for (auto const &tree_node : tree_layer) {
-        current_layer_abs_index.push_back(abs_index_in_tree);
-        if (tree_node->included == true) {
-          if (abs_index_in_tree == 0) {
-            // The root token, set itself
-            llm_bitmask.bit_mask[0].set_bit(0);
-          } else {
-            // Copy from the parent, and set itself
-            int parent_abs_index =
-                parent_pos_2_abs_index[tree_node->parent_pos];
-            llm_bitmask.bit_mask[abs_index_in_tree] =
-                llm_bitmask.bit_mask[parent_abs_index];
-            llm_bitmask.bit_mask[abs_index_in_tree].set_bit(abs_index_in_tree);
-          }
-          abs_index_in_tree++;
-        }
-      }
-      parent_pos_2_abs_index.clear();
-      parent_pos_2_abs_index.swap(current_layer_abs_index);
-    }
-
-    // Maintain other fields of llm_bitmask
-    llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size;
-    // We don't need to set llm_bitmask.current_layer_size and
-    // llm_bitmask.tree_or_prompt_size here because they are not used in LLM
-    // verification.
-    return llm_bitmask;
-  }
-  /* --------- Bitmask Related Functions --------- */
-  void RequestManager::gumbel_conditioned_on_max(
-      float target_max, std::vector<std::pair<float, int>> &logits) {
-    // Assume the logits are sorted in descending order
-    if (logits.size() == 0) {
-      return;
-    }
-    float max_logit = logits[0].first;
-    for (auto &logit_n_idx : logits) {
-      logit_n_idx.first =
-          -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first));
-    }
-  }
-
-  void RequestManager::renormalize(std::vector<std::pair<TokenId, float>> & D,
-                                   std::unordered_map<TokenId, float> & R,
-                                   TokenId token_id) {
-    float token_prob;
-    for (auto &kv : D) {
-      TokenId d_token_id = kv.first;
-      float d_prob = kv.second;
-      if (R.find(d_token_id) != R.end()) {
-        float r_prob = R[d_token_id];
-        R[d_token_id] = max(0.0f, r_prob - d_prob);
-      }
-      if (d_token_id == token_id) {
-        token_prob = d_prob;
-        kv.second = 0.0f;
-      }
-    }
-    // Normalize R
-    float sum_r = 0.0f;
-    for (auto &kv : R) {
-      sum_r += kv.second;
-    }
-    for (auto &kv : R) {
-      kv.second /= (sum_r + 1e-6);
-    }
-    // Normalize D
-    for (auto &kv : D) {
-      kv.second /= (1.0f - token_prob - 1e-6);
-    }
-  }
-
-  std::tuple<int, BatchConfig::TokenId, bool> RequestManager::reject_sampling(
-      std::vector<std::pair<TokenId, float>> & D,
-      std::unordered_map<TokenId, float> & R,
-      int k) {
-    assert(D.size() == k);
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<> dis(0.0, 1.0);
-    double r;
-    for (int i = 0; i < k; ++i) {
-      // Generate a random number in the range [0, 1)
-      r = dis(gen);
-      double d_prob = (double)D[i].second;
-      if (R.find(D[i].first) != R.end()) {
-        double r_prob = (double)R[D[i].first];
-        if (r < d_prob / d_prob + 1e-6) {
-          return {i, D[i].first, true};
-        }
-      }
-      // else, r_prob = 0.0, reject the token
-      renormalize(D, R, D[i].first);
-    }
-    std::vector<double> r_probs;
-    std::vector<BatchConfig::TokenId> r_tokens;
-    for (auto &kv : R) {
-      r_probs.push_back(kv.second);
-      r_tokens.push_back(kv.first);
-    }
-    std::discrete_distribution<> r_dist(r_probs.begin(), r_probs.end());
-    int sampled_index = r_dist(gen);
-    return {-1, r_tokens[sampled_index], false};
-  }
-
-  void RequestManager::get_verify_results_sample(
-      InferenceResult const &llm_verify_result) {
-    // This function maintain the generated token list of the request and the
-    // committed tokens.
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-
-      int llm_result_offset =
-          request.first_token_offset_in_batch * BatchConfig::MAX_K_LOGITS;
-      int llm_input_offset = request.first_token_offset_in_batch;
-      int committed_token_index = request.tokens.size() - 1;
-
-      TokenTree &token_tree = request.speculative_token_trees[0];
-      // First add the root to the committed tokens
-      request.committed_tokens.push_back(Request::CommittedToken(
-          llm_input_offset, committed_token_index, request.tokens.back()));
-      committed_token_index++;
-      // Don't add it to request.tokens because it has already been added.
-
-      // The position of the last accepted token in its tree layer (includeing
-      // the pruned tokens)
-      int last_accepted_token_index_in_layer = 0;
-      // The index of the last accepted token in the entire tree (excluding the
-      // pruned tokens)
-      int last_accepted_token_index = 0;
-      float last_accepted_token_accumulated_log_prob = 0.0f;
-      int current_token_index = 1; // Because we skip the root
-      bool rejected = false;
-
-      auto layer_it = token_tree.tree_layers.begin();
-      ++layer_it;
-      for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
-        // We skip the first layer
-        std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
-        std::vector<std::pair<TokenId, float>> D;
-        std::unordered_map<TokenId, float> R;
-        // Data format: <current_token_index, current_token_index_in_layer,
-        // acc_log_prob>
-        std::unordered_map<TokenId, std::tuple<int, int, float>> d_token_info;
-
-        int current_token_index_in_layer = 0;
-
-        // Iterate through the tokens in the current layer to find the candidate
-        // tokens whose parent is the last accepted token
-        for (auto const &node_ptr : tree_layer) {
-          if (!node_ptr->included) {
-            // Don't increase current_token_index here
-            current_token_index_in_layer++;
-            continue;
-          }
-          if (node_ptr->parent_pos != last_accepted_token_index_in_layer) {
-            // The token's parent is not accepted
-            current_token_index++;
-            current_token_index_in_layer++;
-            continue;
-          } else {
-            // The token's parent is accepted
-            float prob = std::exp(node_ptr->log_accumulated_prob -
-                                  last_accepted_token_accumulated_log_prob);
-            D.push_back({node_ptr->id, prob});
-            d_token_info[node_ptr->id] = {current_token_index,
-                                          current_token_index_in_layer,
-                                          node_ptr->log_accumulated_prob};
-            current_token_index++;
-            current_token_index_in_layer++;
-          }
-        }
+    int llm_result_offset =
+        request.first_token_offset_in_batch * BatchConfig::MAX_K_LOGITS;
+    int llm_input_offset = request.first_token_offset_in_batch;
+    int committed_token_index = request.tokens.size() - 1;
 
-        int result_offset = llm_result_offset + last_accepted_token_index *
-                                                    BatchConfig::MAX_K_LOGITS;
-        for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
-          TokenId token_id = llm_verify_result.token_ids[result_offset + i];
-          R[token_id] = llm_verify_result.probs[result_offset + i];
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    // First add the root to the committed tokens
+    request.committed_tokens.push_back(Request::CommittedToken(
+        llm_input_offset, committed_token_index, request.tokens.back()));
+    committed_token_index++;
+    // Don't add it to request.tokens because it has already been added.
+
+    // The position of the last accepted token in its tree layer (includeing
+    // the pruned tokens)
+    int last_accepted_token_index_in_layer = 0;
+    // The index of the last accepted token in the entire tree (excluding the
+    // pruned tokens)
+    int last_accepted_token_index = 0;
+    float last_accepted_token_accumulated_log_prob = 0.0f;
+    int current_token_index = 1; // Because we skip the root
+    bool rejected = false;
+
+    auto layer_it = token_tree.tree_layers.begin();
+    ++layer_it;
+    for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
+      // We skip the first layer
+      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
+      std::vector<std::pair<TokenId, float>> D;
+      std::unordered_map<TokenId, float> R;
+      // Data format: <current_token_index, current_token_index_in_layer,
+      // acc_log_prob>
+      std::unordered_map<TokenId, std::tuple<int, int, float>> d_token_info;
+
+      int current_token_index_in_layer = 0;
+
+      // Iterate through the tokens in the current layer to find the candidate
+      // tokens whose parent is the last accepted token
+      for (auto const &node_ptr : tree_layer) {
+        if (!node_ptr->included) {
+          // Don't increase current_token_index here
+          current_token_index_in_layer++;
+          continue;
         }
-
-        auto [sampled_index, token_id, accepted] =
-            reject_sampling(D, R, D.size());
-        if (accepted) {
-          // The token's parent is accepted, and this token's id equals the
-          // llm's sample at its parent's position. We accept this token.
-          // from_index: the index of the token in the tree (excluding the
-          // pruned tokens)
-          // to_index: the committed token index in the request
-          request.committed_tokens.push_back(Request::CommittedToken(
-              llm_input_offset + std::get<0>(d_token_info[token_id]),
-              committed_token_index,
-              token_id));
-          request.tokens.push_back(token_id);
-
-          last_accepted_token_index = std::get<0>(d_token_info[token_id]);
-          last_accepted_token_index_in_layer =
-              std::get<1>(d_token_info[token_id]);
-          last_accepted_token_accumulated_log_prob =
-              std::get<2>(d_token_info[token_id]);
-          committed_token_index++;
+        if (node_ptr->parent_pos != last_accepted_token_index_in_layer) {
+          // The token's parent is not accepted
+          current_token_index++;
+          current_token_index_in_layer++;
+          continue;
         } else {
-          request.committed_tokens.push_back(
-              Request::CommittedToken(-1, committed_token_index, token_id));
-          rejected = true;
-          break;
+          // The token's parent is accepted
+          float prob = std::exp(node_ptr->log_accumulated_prob -
+                                last_accepted_token_accumulated_log_prob);
+          D.push_back({node_ptr->id, prob});
+          d_token_info[node_ptr->id] = {current_token_index,
+                                        current_token_index_in_layer,
+                                        node_ptr->log_accumulated_prob};
+          current_token_index++;
+          current_token_index_in_layer++;
         }
       }
 
-      // Add the last token (that is not in the cache of the LLM) if the
-      // sampling procedure succeed in the last layer from_index: since this
-      // token is not in the token tree, the llm doesn't have its KV cache, so
-      // the from_index should be a place holder, which is -1
-      if (!rejected) {
-        std::unordered_map<TokenId, float> R;
-        std::vector<std::pair<TokenId, float>> D;
-        int result_offset = llm_result_offset + last_accepted_token_index *
-                                                    BatchConfig::MAX_K_LOGITS;
-        for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
-          TokenId token_id = llm_verify_result.token_ids[result_offset + i];
-          R[token_id] = llm_verify_result.probs[result_offset + i];
-        }
-        auto [sampled_index, token_id, accepted] =
-            reject_sampling(D, R, D.size());
+      int result_offset = llm_result_offset +
+                          last_accepted_token_index * BatchConfig::MAX_K_LOGITS;
+      for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
+        TokenId token_id = llm_verify_result.token_ids[result_offset + i];
+        R[token_id] = llm_verify_result.probs[result_offset + i];
+      }
+
+      auto [sampled_index, token_id, accepted] =
+          reject_sampling(D, R, D.size());
+      if (accepted) {
+        // The token's parent is accepted, and this token's id equals the
+        // llm's sample at its parent's position. We accept this token.
+        // from_index: the index of the token in the tree (excluding the
+        // pruned tokens)
+        // to_index: the committed token index in the request
+        request.committed_tokens.push_back(Request::CommittedToken(
+            llm_input_offset + std::get<0>(d_token_info[token_id]),
+            committed_token_index,
+            token_id));
+        request.tokens.push_back(token_id);
+
+        last_accepted_token_index = std::get<0>(d_token_info[token_id]);
+        last_accepted_token_index_in_layer =
+            std::get<1>(d_token_info[token_id]);
+        last_accepted_token_accumulated_log_prob =
+            std::get<2>(d_token_info[token_id]);
+        committed_token_index++;
+      } else {
         request.committed_tokens.push_back(
             Request::CommittedToken(-1, committed_token_index, token_id));
-        request.tokens.push_back(token_id);
+        rejected = true;
+        break;
       }
+    }
 
-      if (verbose) {
-        std::cout << "Request " << request.guid << " committed tokens: ";
-        for (auto const &committed_token : request.committed_tokens) {
-          std::cout << committed_token.token_id << " ("
-                    << tokenizer_->Decode({committed_token.token_id}) << ") ";
-        }
-        std::cout << std::endl;
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        std::cout << "Output sequence: " << output << std::endl;
+    // Add the last token (that is not in the cache of the LLM) if the
+    // sampling procedure succeed in the last layer from_index: since this
+    // token is not in the token tree, the llm doesn't have its KV cache, so
+    // the from_index should be a place holder, which is -1
+    if (!rejected) {
+      std::unordered_map<TokenId, float> R;
+      std::vector<std::pair<TokenId, float>> D;
+      int result_offset = llm_result_offset +
+                          last_accepted_token_index * BatchConfig::MAX_K_LOGITS;
+      for (int i = 0; i < BatchConfig::MAX_K_LOGITS; ++i) {
+        TokenId token_id = llm_verify_result.token_ids[result_offset + i];
+        R[token_id] = llm_verify_result.probs[result_offset + i];
       }
+      auto [sampled_index, token_id, accepted] =
+          reject_sampling(D, R, D.size());
+      request.committed_tokens.push_back(
+          Request::CommittedToken(-1, committed_token_index, token_id));
+      request.tokens.push_back(token_id);
     }
-  }
 
-  void RequestManager::get_verify_results_greedy(
-      InferenceResult const &llm_verify_result) {
-    // This function maintain the generated token list of the request and the
-    // committed tokens.
-    int total_nb_generated_tokens = 0;
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
+    if (verbose) {
+      std::cout << "Request " << request.guid << " committed tokens: ";
+      for (auto const &committed_token : request.committed_tokens) {
+        std::cout << committed_token.token_id << " ("
+                  << tokenizer_->Decode({committed_token.token_id}) << ") ";
       }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-
-      int llm_result_offset = request.first_token_offset_in_batch;
-      int llm_cache_size = request.tokens.size() - 1;
-      int committed_token_index = request.tokens.size() - 1;
-
-      TokenTree &token_tree = request.speculative_token_trees[0];
-      // First add the root to the committed tokens
-      request.committed_tokens.push_back(Request::CommittedToken(
-          llm_cache_size, committed_token_index, request.tokens.back()));
-      committed_token_index++;
-      // Don't add it to request.tokens because it has already been added.
-
-      // The position of the last accepted token in its tree layer (includeing
-      // the pruned tokens)
-      int last_accepted_token_index_in_layer = 0;
-      // The index of the last accepted token in the entire tree (excluding the
-      // pruned tokens)
-      int last_accepted_token_index = 0;
-
-      int current_token_index = 1; // Because we skip the root
-      auto layer_it = token_tree.tree_layers.begin();
-      ++layer_it;
-      for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
-        // We skip the first layer
-        std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
-
-        bool token_accepted_this_layer = false;
-        int current_token_index_in_layer = 0;
-
-        for (auto const &node_ptr : tree_layer) {
-          if (!node_ptr->included) {
-            current_token_index_in_layer++;
-            continue;
-          }
-          if ((node_ptr->parent_pos != last_accepted_token_index_in_layer) ||
-              token_accepted_this_layer) {
-            // The token's parent is not accepted, or there is already another
-            // token accepted in this layer
-            current_token_index++;
-            current_token_index_in_layer++;
-            continue;
-          } else {
-            // The token's parent is accepted, and no token has been accepted in
-            // this layer yet
-            if (node_ptr->id ==
-                llm_verify_result
-                    .token_ids[llm_result_offset + last_accepted_token_index]) {
-              // The token's parent is accepted, and this token's id equals the
-              // llm's sample at its parent's position. We accept this token.
-
-              // from_index: the index of the token in the tree (excluding the
-              // pruned tokens)
-              // to_index: the committed token index in the request
-              request.committed_tokens.push_back(
-                  Request::CommittedToken(llm_cache_size + current_token_index,
-                                          committed_token_index,
-                                          node_ptr->id));
-              request.tokens.push_back(node_ptr->id);
-
-              token_accepted_this_layer = true;
-              last_accepted_token_index = current_token_index;
-              last_accepted_token_index_in_layer = current_token_index_in_layer;
-              committed_token_index++;
-            }
-            current_token_index++;
-            current_token_index_in_layer++;
-          }
+      std::cout << std::endl;
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Output sequence: " << output << std::endl;
+    }
+  }
+}
+
+void RequestManager::get_verify_results_greedy(
+    InferenceResult const &llm_verify_result) {
+  // This function maintain the generated token list of the request and the
+  // committed tokens.
+  int total_nb_generated_tokens = 0;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+
+    int llm_result_offset = request.first_token_offset_in_batch;
+    int llm_cache_size = request.tokens.size() - 1;
+    int committed_token_index = request.tokens.size() - 1;
+
+    TokenTree &token_tree = request.speculative_token_trees[0];
+    // First add the root to the committed tokens
+    request.committed_tokens.push_back(Request::CommittedToken(
+        llm_cache_size, committed_token_index, request.tokens.back()));
+    committed_token_index++;
+    // Don't add it to request.tokens because it has already been added.
+
+    // The position of the last accepted token in its tree layer (includeing
+    // the pruned tokens)
+    int last_accepted_token_index_in_layer = 0;
+    // The index of the last accepted token in the entire tree (excluding the
+    // pruned tokens)
+    int last_accepted_token_index = 0;
+
+    int current_token_index = 1; // Because we skip the root
+    auto layer_it = token_tree.tree_layers.begin();
+    ++layer_it;
+    for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
+      // We skip the first layer
+      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
+
+      bool token_accepted_this_layer = false;
+      int current_token_index_in_layer = 0;
+
+      for (auto const &node_ptr : tree_layer) {
+        if (!node_ptr->included) {
+          current_token_index_in_layer++;
+          continue;
         }
-        if (!token_accepted_this_layer) {
-          // No token is accepted in this layer, we should stop the traversal
-          break;
+        if ((node_ptr->parent_pos != last_accepted_token_index_in_layer) ||
+            token_accepted_this_layer) {
+          // The token's parent is not accepted, or there is already another
+          // token accepted in this layer
+          current_token_index++;
+          current_token_index_in_layer++;
+          continue;
+        } else {
+          // The token's parent is accepted, and no token has been accepted in
+          // this layer yet
+          if (node_ptr->id ==
+              llm_verify_result
+                  .token_ids[llm_result_offset + last_accepted_token_index]) {
+            // The token's parent is accepted, and this token's id equals the
+            // llm's sample at its parent's position. We accept this token.
+
+            // from_index: the index of the token in the tree (excluding the
+            // pruned tokens)
+            // to_index: the committed token index in the request
+            request.committed_tokens.push_back(
+                Request::CommittedToken(llm_cache_size + current_token_index,
+                                        committed_token_index,
+                                        node_ptr->id));
+            request.tokens.push_back(node_ptr->id);
+
+            token_accepted_this_layer = true;
+            last_accepted_token_index = current_token_index;
+            last_accepted_token_index_in_layer = current_token_index_in_layer;
+            committed_token_index++;
+          }
+          current_token_index++;
+          current_token_index_in_layer++;
         }
       }
-
-      // Add the last token (that is not verified by the LLM)
-      // from_index: since this token is not in the token tree, the llm
-      // doesn't have its KV cache, so the from_index should be a place
-      // holder, which is -1
-      request.committed_tokens.push_back(Request::CommittedToken(
-          -1,
-          committed_token_index,
-          llm_verify_result
-              .token_ids[llm_result_offset + last_accepted_token_index]));
-      request.tokens.push_back(
-          llm_verify_result
-              .token_ids[llm_result_offset + last_accepted_token_index]);
-
-      total_nb_generated_tokens += request.committed_tokens.size() - 1;
-      if (verbose) {
-        std::cout << "Request " << request.guid << " committed tokens: ";
-        for (auto const &committed_token : request.committed_tokens) {
-          std::cout << committed_token.token_id << " ("
-                    << tokenizer_->Decode({committed_token.token_id}) << ") ";
-        }
-        std::cout << std::endl;
-        std::string output = this->tokenizer_->Decode(request.tokens);
-        std::cout << "Output sequence: " << output << std::endl;
+      if (!token_accepted_this_layer) {
+        // No token is accepted in this layer, we should stop the traversal
+        break;
       }
     }
-    profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens);
-  }
 
-  // TODO: the max_seq_length is not used in the current implementation
-  std::vector<GenerationResult> FFModel::generate(
-      std::vector<std::string> & prompts, int max_seq_length) {
-    RequestManager *rm = RequestManager::get_request_manager();
-    std::vector<RequestManager::RequestGuid> guids;
-    for (int i = 0; i < prompts.size(); i++) {
-      RequestManager::RequestGuid guid =
-          rm->register_new_request(prompts.at(i));
-      if (guid != RequestManager::INVALID_GUID) {
-        guids.push_back(guid);
+    // Add the last token (that is not verified by the LLM)
+    // from_index: since this token is not in the token tree, the llm
+    // doesn't have its KV cache, so the from_index should be a place
+    // holder, which is -1
+    request.committed_tokens.push_back(Request::CommittedToken(
+        -1,
+        committed_token_index,
+        llm_verify_result
+            .token_ids[llm_result_offset + last_accepted_token_index]));
+    request.tokens.push_back(
+        llm_verify_result
+            .token_ids[llm_result_offset + last_accepted_token_index]);
+
+    total_nb_generated_tokens += request.committed_tokens.size() - 1;
+    if (verbose) {
+      std::cout << "Request " << request.guid << " committed tokens: ";
+      for (auto const &committed_token : request.committed_tokens) {
+        std::cout << committed_token.token_id << " ("
+                  << tokenizer_->Decode({committed_token.token_id}) << ") ";
       }
+      std::cout << std::endl;
+      std::string output = this->tokenizer_->Decode(request.tokens);
+      std::cout << "Output sequence: " << output << std::endl;
     }
-    std::vector<GenerationResult> results;
-    for (int i = 0; i < guids.size(); i++) {
-      results.push_back(rm->get_generation_result(guids[i]));
+  }
+  profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens);
+}
+
+// TODO: the max_seq_length is not used in the current implementation
+std::vector<GenerationResult>
+    FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
+  RequestManager *rm = RequestManager::get_request_manager();
+  std::vector<RequestManager::RequestGuid> guids;
+  for (int i = 0; i < prompts.size(); i++) {
+    RequestManager::RequestGuid guid = rm->register_new_request(prompts.at(i));
+    if (guid != RequestManager::INVALID_GUID) {
+      guids.push_back(guid);
     }
-    return results;
   }
+  std::vector<GenerationResult> results;
+  for (int i = 0; i < guids.size(); i++) {
+    results.push_back(rm->get_generation_result(guids[i]));
+  }
+  return results;
+}
 
-  void RequestManager::start_background_server(FFModel * model) {
-    assert(background_server_status == INITIALIZED);
-    background_server_status = SERVING;
-    // Start background task
-    Runtime *runtime = Runtime::get_runtime();
-    Context ctx = Runtime::get_context();
-    TaskLauncher launcher(RM_BACKGROUND_SERVING_TASK_ID,
-                          TaskArgument(&model, sizeof(FFModel *)));
-    background_server_handler = runtime->execute_task(ctx, launcher);
-    // Register callbacks for normal exit
-    {
-      int ret =
-          std::atexit(RequestManager::terminate_background_server_at_exit);
-      assert(ret == 0); // make sure the callback is successfully registered
-    }
-    // Register callbacks for termination
-    {
-      std::set_terminate([]() {
-        RequestManager::terminate_background_server_at_exit();
-        std::abort();
-      });
-    }
-  }
-
-  void RequestManager::background_serving_task(
-      Task const *task,
-      std::vector<PhysicalRegion> const &regions,
-      Context ctx,
-      Runtime *runtime) {
-    RequestManager *rm = RequestManager::get_request_manager();
-    FFModel *llm = *(FFModel **)task->args;
-    {
-      // Update FFModel's lg_hlr and lg_ctx to the current
-      // task's runtime and ctx, since all future legion tasks are
-      // launched in this task
-      llm->config.lg_hlr = runtime;
-      llm->config.lg_ctx = ctx;
-      // Update the lg_hlr and lg_ctx of all SSMs' FFConfig
-      // since all future legion tasks are launched in this task
-      for (size_t i = 0; i < rm->get_num_ssms(); i++) {
-        FFModel *ssm = rm->get_ssm_model(i);
-        ssm->config.lg_hlr = runtime;
-        ssm->config.lg_ctx = ctx;
+void RequestManager::start_background_server(FFModel *model) {
+  assert(background_server_status == INITIALIZED);
+  background_server_status = SERVING;
+  // Start background task
+  Runtime *runtime = Runtime::get_runtime();
+  Context ctx = Runtime::get_context();
+  TaskLauncher launcher(RM_BACKGROUND_SERVING_TASK_ID,
+                        TaskArgument(&model, sizeof(FFModel *)));
+  background_server_handler = runtime->execute_task(ctx, launcher);
+  // Register callbacks for normal exit
+  {
+    int ret = std::atexit(RequestManager::terminate_background_server_at_exit);
+    assert(ret == 0); // make sure the callback is successfully registered
+  }
+  // Register callbacks for termination
+  {
+    std::set_terminate([]() {
+      RequestManager::terminate_background_server_at_exit();
+      std::abort();
+    });
+  }
+}
+
+void RequestManager::background_serving_task(
+    Task const *task,
+    std::vector<PhysicalRegion> const &regions,
+    Context ctx,
+    Runtime *runtime) {
+  RequestManager *rm = RequestManager::get_request_manager();
+  FFModel *llm = *(FFModel **)task->args;
+  {
+    // Update FFModel's lg_hlr and lg_ctx to the current
+    // task's runtime and ctx, since all future legion tasks are
+    // launched in this task
+    llm->config.lg_hlr = runtime;
+    llm->config.lg_ctx = ctx;
+    // Update the lg_hlr and lg_ctx of all SSMs' FFConfig
+    // since all future legion tasks are launched in this task
+    for (size_t i = 0; i < rm->get_num_ssms(); i++) {
+      FFModel *ssm = rm->get_ssm_model(i);
+      ssm->config.lg_hlr = runtime;
+      ssm->config.lg_ctx = ctx;
+    }
+  }
+  if (rm->decoding_mode == INCREMENTAL_DECODING) {
+    // No SSMs: perform incremental decoding
+    rm->serve_decoding(llm);
+  } else {
+    // Registered SSMs: perform speculative inference
+    rm->serve_spec_infer(llm);
+  }
+}
+
+/*static*/
+void RequestManager::serve_decoding(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  // Compile the llm
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  im->compile_model_and_allocate_buffer(llm);
+  assert(im->model_weights_loaders.find(llm) !=
+         im->model_weights_loaders.end());
+  // Load model weights
+  im->model_weights_loaders[llm]->load_weights(llm);
+  // init operators
+  im->init_operators_inference(llm);
+  // Legion futures for inc_decoding and spec_infer
+  InferenceResultFuture last_irf;
+  {
+    // Initialize futures for incr decoding
+    InferenceResult ir;
+    last_irf = Future::from_value<InferenceResult>(ir);
+  }
+
+  std::queue<InferenceResultFuture> batch_pipeline;
+  { batch_pipeline.push(last_irf); }
+
+  reset_profiling_statistics();
+  while (!is_background_server_terminated()) {
+
+    if (batch_pipeline.size() >= 4) {
+      // Block here to avoid launching too many batches
+      auto const &ir = batch_pipeline.front();
+      ir.get_void_result();
+    }
+    // deque finished batches
+    while (batch_pipeline.size() > 1) {
+      auto const &ir = batch_pipeline.front();
+      if (ir.is_ready()) {
+        batch_pipeline.pop();
+      } else {
+        break;
       }
     }
-    if (rm->decoding_mode == INCREMENTAL_DECODING) {
-      // No SSMs: perform incremental decoding
-      rm->serve_decoding(llm);
-    } else {
-      // Registered SSMs: perform speculative inference
-      rm->serve_spec_infer(llm);
-    }
+    runtime->begin_trace(ctx, 12346 /*trace_id*/);
+    InferenceResultFuture next_ir = batch_pipeline.back();
+    BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
+    FutureMap fm = im->inference(llm, 0, bcf);
+    assert(fm.get_future_map_domain().get_volume() == 1);
+    InferenceResultFuture irf = fm.get_future(0);
+    batch_pipeline.push(irf);
+    runtime->end_trace(ctx, 12346 /*trace_id*/);
   }
+}
 
-  /*static*/
-  void RequestManager::serve_decoding(FFModel * llm) {
-    Context ctx = llm->config.lg_ctx;
-    Runtime *runtime = llm->config.lg_hlr;
+/*static*/
+void RequestManager::serve_spec_infer(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  {
     // Compile the llm
-    InferenceManager *im = InferenceManager::get_inference_manager();
     im->compile_model_and_allocate_buffer(llm);
     assert(im->model_weights_loaders.find(llm) !=
            im->model_weights_loaders.end());
@@ -2155,606 +2198,614 @@ double RequestManager::get_correction_factor() {
     im->model_weights_loaders[llm]->load_weights(llm);
     // init operators
     im->init_operators_inference(llm);
-    // Legion futures for inc_decoding and spec_infer
-    InferenceResultFuture last_irf;
-    {
-      // Initialize futures for incr decoding
-      InferenceResult ir;
-      last_irf = Future::from_value<InferenceResult>(ir);
-    }
+  }
+  for (size_t i = 0; i < get_num_ssms(); i++) {
+    // Compile the i-th ssm
+    FFModel *ssm = get_ssm_model(i);
+    im->compile_model_and_allocate_buffer(ssm);
+    assert(im->model_weights_loaders.find(ssm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[ssm]->load_weights(ssm);
+    // init operators
+    im->init_operators_inference(ssm);
+  }
 
-    std::queue<InferenceResultFuture> batch_pipeline;
-    { batch_pipeline.push(last_irf); }
+  InferenceResultFuture irf_0;
+  {
+    // Initialize futures for incr decoding
+    InferenceResult ir_0;
+    irf_0 = Future::from_value<InferenceResult>(ir_0);
+  }
 
-    reset_profiling_statistics();
-    while (!is_background_server_terminated()) {
+  request_manager_status = PREFILLING;
+  prefill_model = SSM;
 
-      if (batch_pipeline.size() >= 4) {
-        // Block here to avoid launching too many batches
-        auto const &ir = batch_pipeline.front();
-        ir.get_void_result();
-      }
-      // deque finished batches
-      while (batch_pipeline.size() > 1) {
-        auto const &ir = batch_pipeline.front();
-        if (ir.is_ready()) {
-          batch_pipeline.pop();
-        } else {
-          break;
-        }
-      }
-      runtime->begin_trace(ctx, 12346 /*trace_id*/);
-      InferenceResultFuture next_ir = batch_pipeline.back();
-      BatchConfigFuture bcf = get_next_batch_config(next_ir, ctx, runtime);
-      FutureMap fm = im->inference(llm, 0, bcf);
-      assert(fm.get_future_map_domain().get_volume() == 1);
-      InferenceResultFuture irf = fm.get_future(0);
-      batch_pipeline.push(irf);
-      runtime->end_trace(ctx, 12346 /*trace_id*/);
-    }
-  }
-
-  /*static*/
-  void RequestManager::serve_spec_infer(FFModel * llm) {
-    Context ctx = llm->config.lg_ctx;
-    Runtime *runtime = llm->config.lg_hlr;
-    InferenceManager *im = InferenceManager::get_inference_manager();
-    {
-      // Compile the llm
-      im->compile_model_and_allocate_buffer(llm);
-      assert(im->model_weights_loaders.find(llm) !=
-             im->model_weights_loaders.end());
-      // Load model weights
-      im->model_weights_loaders[llm]->load_weights(llm);
-      // init operators
-      im->init_operators_inference(llm);
-    }
-    for (size_t i = 0; i < get_num_ssms(); i++) {
-      // Compile the i-th ssm
-      FFModel *ssm = get_ssm_model(i);
-      im->compile_model_and_allocate_buffer(ssm);
-      assert(im->model_weights_loaders.find(ssm) !=
-             im->model_weights_loaders.end());
-      // Load model weights
-      im->model_weights_loaders[ssm]->load_weights(ssm);
-      // init operators
-      im->init_operators_inference(ssm);
-    }
-
-    InferenceResultFuture irf_0;
-    {
-      // Initialize futures for incr decoding
-      InferenceResult ir_0;
-      irf_0 = Future::from_value<InferenceResult>(ir_0);
-    }
-
-    request_manager_status = PREFILLING;
-    prefill_model = SSM;
-
-    std::queue<InferenceResultFuture> infer_result_future_pipeline;
-    infer_result_future_pipeline.push(irf_0);
-
-    reset_profiling_statistics();
-    while (!is_background_server_terminated()) {
-      if (infer_result_future_pipeline.size() >= 4) {
-        // Block here to avoid launching too many batches
-        auto const &ir = infer_result_future_pipeline.front();
-        ir.get_void_result();
-      }
-      // deque finished batches
-      while (infer_result_future_pipeline.size() > 1) {
-        auto const &ir = infer_result_future_pipeline.front();
-        if (ir.is_ready()) {
-          infer_result_future_pipeline.pop();
-        } else {
-          break;
-        }
-      }
+  std::queue<InferenceResultFuture> infer_result_future_pipeline;
+  infer_result_future_pipeline.push(irf_0);
 
-      runtime->begin_trace(ctx, 12345 /*trace_id*/);
-      for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth();
-           ssm_step_i++) {
-        InferenceResultFuture irf = infer_result_future_pipeline.back();
-        BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime);
-        FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-        infer_result_future_pipeline.push(fm.get_future(0));
+  reset_profiling_statistics();
+  while (!is_background_server_terminated()) {
+    if (infer_result_future_pipeline.size() >= 4) {
+      // Block here to avoid launching too many batches
+      auto const &ir = infer_result_future_pipeline.front();
+      ir.get_void_result();
+    }
+    // deque finished batches
+    while (infer_result_future_pipeline.size() > 1) {
+      auto const &ir = infer_result_future_pipeline.front();
+      if (ir.is_ready()) {
+        infer_result_future_pipeline.pop();
+      } else {
+        break;
       }
+    }
+
+    runtime->begin_trace(ctx, 12345 /*trace_id*/);
+    for (int ssm_step_i = 0; ssm_step_i < get_max_tree_depth(); ssm_step_i++) {
       InferenceResultFuture irf = infer_result_future_pipeline.back();
       BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime);
-      FutureMap fm = im->inference(llm, 0, bcf);
+      FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
       infer_result_future_pipeline.push(fm.get_future(0));
-      runtime->end_trace(ctx, 12345 /*trace_id*/);
     }
+    InferenceResultFuture irf = infer_result_future_pipeline.back();
+    BatchConfigFuture bcf = get_next_batch_config(irf, ctx, runtime);
+    FutureMap fm = im->inference(llm, 0, bcf);
+    infer_result_future_pipeline.push(fm.get_future(0));
+    runtime->end_trace(ctx, 12345 /*trace_id*/);
   }
+}
 
-  /*static*/
-  void RequestManager::serve_spec_infer_sync(FFModel * llm) {
-    Context ctx = llm->config.lg_ctx;
-    Runtime *runtime = llm->config.lg_hlr;
-    InferenceManager *im = InferenceManager::get_inference_manager();
-    {
-      // Compile the llm
-      im->compile_model_and_allocate_buffer(llm);
-      assert(im->model_weights_loaders.find(llm) !=
-             im->model_weights_loaders.end());
-      // Load model weights
-      im->model_weights_loaders[llm]->load_weights(llm);
-      // init operators
-      im->init_operators_inference(llm);
-    }
-    for (size_t i = 0; i < get_num_ssms(); i++) {
-      // Compile the i-th ssm
-      FFModel *ssm = get_ssm_model(i);
-      im->compile_model_and_allocate_buffer(ssm);
-      assert(im->model_weights_loaders.find(ssm) !=
-             im->model_weights_loaders.end());
-      // Load model weights
-      im->model_weights_loaders[ssm]->load_weights(ssm);
-      // init operators
-      im->init_operators_inference(ssm);
-    }
-
-    InferenceResultFuture irf_0;
-    {
-      // Initialize futures for incr decoding
-      InferenceResult ir_0;
-      irf_0 = Future::from_value<InferenceResult>(ir_0);
-    }
-
-    request_manager_status = PREFILLING;
-    prefill_model = SSM;
-
-    while (!is_background_server_terminated()) {
-      BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime);
-      bcf.get_void_result();
-      if ((request_manager_status == PREFILLING and prefill_model == LLM) or
-          request_manager_status == LLM_VERIFY) {
-        runtime->begin_trace(ctx, 12345 /*trace_id*/);
-        FutureMap fm = im->inference(llm, 0, bcf);
-        irf_0 = fm.get_future(0);
-        runtime->end_trace(ctx, 12345 /*trace_id*/);
-      } else if ((request_manager_status == PREFILLING and
-                  prefill_model == SSM) or
-                 request_manager_status == SSM_SPEC) {
-        runtime->begin_trace(ctx, 23456 /*trace_id*/);
-        FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
-        irf_0 = fm.get_future(0);
-        runtime->end_trace(ctx, 23456 /*trace_id*/);
-      } else {
-        assert(false && "Invalid request manager status");
-      }
-    }
+/*static*/
+void RequestManager::serve_spec_infer_sync(FFModel *llm) {
+  Context ctx = llm->config.lg_ctx;
+  Runtime *runtime = llm->config.lg_hlr;
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  {
+    // Compile the llm
+    im->compile_model_and_allocate_buffer(llm);
+    assert(im->model_weights_loaders.find(llm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[llm]->load_weights(llm);
+    // init operators
+    im->init_operators_inference(llm);
   }
-
-  void RequestManager::trigger_request_completion_future(
-      RequestGuid const &guid) {
-    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
-    assert(request_to_promise.find(guid) != request_to_promise.end());
-    // Set the completion promise in case other threads are waiting
-    request_to_promise[guid]->set_value();
+  for (size_t i = 0; i < get_num_ssms(); i++) {
+    // Compile the i-th ssm
+    FFModel *ssm = get_ssm_model(i);
+    im->compile_model_and_allocate_buffer(ssm);
+    assert(im->model_weights_loaders.find(ssm) !=
+           im->model_weights_loaders.end());
+    // Load model weights
+    im->model_weights_loaders[ssm]->load_weights(ssm);
+    // init operators
+    im->init_operators_inference(ssm);
   }
 
-  /*static*/
-  void RequestManager::terminate_background_server_at_exit() {
-    RequestManager *rm = RequestManager::get_request_manager();
-    rm->terminate_background_server();
+  InferenceResultFuture irf_0;
+  {
+    // Initialize futures for incr decoding
+    InferenceResult ir_0;
+    irf_0 = Future::from_value<InferenceResult>(ir_0);
   }
 
-  void RequestManager::terminate_background_server() {
-    if (background_server_status == SERVING) {
-      assert(profiling.llm_step_times.size() ==
-             profiling.requests_per_step.size());
-      // Write the last profiling statistics to output file
-      std::string str = "[Profiling Statistics]";
-
-      long long total_time = Realm::Clock::current_time_in_microseconds() -
-                             profiling.server_start_time;
-      int total_requests = profiling_requests.size();
-      int total_tokens = 0;
-      for (int num_tokens : profiling.generated_tokens_per_step) {
-        total_tokens += num_tokens;
-      }
-      str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
-      str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
-      // throughput
-      str += "\n throughput_requests_per_sec(" +
-             std::to_string(total_requests / (total_time / 1e6)) + ")";
-      str += "\n throughput_tokens_per_sec(" +
-             std::to_string(total_tokens / (total_time / 1e6)) + ")";
-
-      double average_latency_per_request = 0;
-      std::string latency_per_request_ms = "\n latency_per_request_ms( ";
-      for (auto const &profiling_info : profiling_requests) {
-        double latency_ms = (profiling_info.second.finish_time -
-                             profiling_info.second.start_time) /
-                            1000.0;
-        // latency_per_request_ms += "[" + std::to_string(profiling_info.first)
-        // +
-        // ","; latency_per_request_ms += std::to_string(latency_ms) + "] ";
-        latency_per_request_ms += std::to_string(latency_ms) + " ";
-        average_latency_per_request += latency_ms;
-      }
-      latency_per_request_ms += ")";
-      str += latency_per_request_ms;
-      average_latency_per_request /= total_requests;
-      str += "\n average_latency_per_request_ms(" +
-             std::to_string(average_latency_per_request) + ")";
-
-      std::string req_per_step = "\n requests_per_step( ";
-      for (int nb : profiling.requests_per_step) {
-        req_per_step += std::to_string(nb) + " ";
-      }
-      req_per_step += ")";
-      str += req_per_step;
-
-      if (profiling.ssm_step_times.size() > 0) {
-        // assert(profiling.ssm_step_times.size() ==
-        //        profiling.llm_step_times.size());
-        std::string ssm_step_times_ms = "\n ssm_step_times_ms( ";
-        for (double time : profiling.ssm_step_times) {
-          ssm_step_times_ms += std::to_string(time) + " ";
-        }
-        ssm_step_times_ms += ")";
-        str += ssm_step_times_ms;
-      }
+  request_manager_status = PREFILLING;
+  prefill_model = SSM;
 
-      if (profiling.ssm_steps.size() > 0) {
-        std::string ssm_steps = "\n ssm_steps( ";
-        for (int nb : profiling.ssm_steps) {
-          ssm_steps += std::to_string(nb) + " ";
-        }
-        ssm_steps += ")";
-        str += ssm_steps;
-      }
+  while (!is_background_server_terminated()) {
+    BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime);
+    bcf.get_void_result();
+    if ((request_manager_status == PREFILLING and prefill_model == LLM) or
+        request_manager_status == LLM_VERIFY) {
+      runtime->begin_trace(ctx, 12345 /*trace_id*/);
+      FutureMap fm = im->inference(llm, 0, bcf);
+      irf_0 = fm.get_future(0);
+      runtime->end_trace(ctx, 12345 /*trace_id*/);
+    } else if ((request_manager_status == PREFILLING and
+                prefill_model == SSM) or
+               request_manager_status == SSM_SPEC) {
+      runtime->begin_trace(ctx, 23456 /*trace_id*/);
+      FutureMap fm = im->inference(get_ssm_model(0), 0, bcf);
+      irf_0 = fm.get_future(0);
+      runtime->end_trace(ctx, 23456 /*trace_id*/);
+    } else {
+      assert(false && "Invalid request manager status");
+    }
+  }
+}
 
-      std::string llm_step_times_ms = "\n llm_step_times_ms( ";
-      for (double time : profiling.llm_step_times) {
-        llm_step_times_ms += std::to_string(time) + " ";
-      }
-      llm_step_times_ms += ")";
-      str += llm_step_times_ms;
+void RequestManager::trigger_request_completion_future(
+    RequestGuid const &guid) {
+  std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
+  assert(request_to_promise.find(guid) != request_to_promise.end());
+  // Set the completion promise in case other threads are waiting
+  request_to_promise[guid]->set_value();
+}
 
-      std::string generated_tokens_per_step = "\n generated_tokens_per_step( ";
-      for (int nb : profiling.generated_tokens_per_step) {
-        generated_tokens_per_step += std::to_string(nb) + " ";
-      }
-      generated_tokens_per_step += ")";
-      str += generated_tokens_per_step;
+/*static*/
+void RequestManager::terminate_background_server_at_exit() {
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->terminate_background_server();
+}
 
-      write_to_output_file("", str);
-      background_server_status = TERMINATED;
-      // Wait for the background server to terminate
-      Runtime *runtime = Runtime::get_runtime();
-      Context ctx = Runtime::get_context();
-      background_server_handler.get_void_result();
-    }
+void RequestManager::terminate_background_server() {
+  if (background_server_status == SERVING) {
+    assert(profiling.llm_step_times.size() ==
+           profiling.requests_per_step.size());
+    // Write the last profiling statistics to output file
+    std::string str = "[Profiling Statistics]";
+
+    long long total_time = Realm::Clock::current_time_in_microseconds() -
+                           profiling.server_start_time;
+    int total_requests = profiling_requests.size();
+    int total_tokens = 0;
+    for (int num_tokens : profiling.generated_tokens_per_step) {
+      total_tokens += num_tokens;
+    }
+    str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
+    str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
+    // throughput
+    str += "\n throughput_requests_per_sec(" +
+           std::to_string(total_requests / (total_time / 1e6)) + ")";
+    str += "\n throughput_tokens_per_sec(" +
+           std::to_string(total_tokens / (total_time / 1e6)) + ")";
+
+    double average_latency_per_request = 0;
+    std::string latency_per_request_ms = "\n latency_per_request_ms( ";
+    for (auto const &profiling_info : profiling_requests) {
+      double latency_ms = (profiling_info.second.finish_time -
+                           profiling_info.second.start_time) /
+                          1000.0;
+      // latency_per_request_ms += "[" + std::to_string(profiling_info.first)
+      // +
+      // ","; latency_per_request_ms += std::to_string(latency_ms) + "] ";
+      latency_per_request_ms += std::to_string(latency_ms) + " ";
+      average_latency_per_request += latency_ms;
+    }
+    latency_per_request_ms += ")";
+    str += latency_per_request_ms;
+    average_latency_per_request /= total_requests;
+    str += "\n average_latency_per_request_ms(" +
+           std::to_string(average_latency_per_request) + ")";
+
+    std::string req_per_step = "\n requests_per_step( ";
+    for (int nb : profiling.requests_per_step) {
+      req_per_step += std::to_string(nb) + " ";
+    }
+    req_per_step += ")";
+    str += req_per_step;
+
+    if (profiling.ssm_step_times.size() > 0) {
+      // assert(profiling.ssm_step_times.size() ==
+      //        profiling.llm_step_times.size());
+      std::string ssm_step_times_ms = "\n ssm_step_times_ms( ";
+      for (double time : profiling.ssm_step_times) {
+        ssm_step_times_ms += std::to_string(time) + " ";
+      }
+      ssm_step_times_ms += ")";
+      str += ssm_step_times_ms;
+    }
+
+    if (profiling.ssm_steps.size() > 0) {
+      std::string ssm_steps = "\n ssm_steps( ";
+      for (int nb : profiling.ssm_steps) {
+        ssm_steps += std::to_string(nb) + " ";
+      }
+      ssm_steps += ")";
+      str += ssm_steps;
+    }
+
+    std::string llm_step_times_ms = "\n llm_step_times_ms( ";
+    for (double time : profiling.llm_step_times) {
+      llm_step_times_ms += std::to_string(time) + " ";
+    }
+    llm_step_times_ms += ")";
+    str += llm_step_times_ms;
+
+    std::string generated_tokens_per_step = "\n generated_tokens_per_step( ";
+    for (int nb : profiling.generated_tokens_per_step) {
+      generated_tokens_per_step += std::to_string(nb) + " ";
+    }
+    generated_tokens_per_step += ")";
+    str += generated_tokens_per_step;
+
+    write_to_output_file("", str);
+    background_server_status = TERMINATED;
+    // Wait for the background server to terminate
+    Runtime *runtime = Runtime::get_runtime();
+    Context ctx = Runtime::get_context();
+    background_server_handler.get_void_result();
   }
+}
 
-  bool RequestManager::is_background_server_terminated() {
-    return background_server_status == TERMINATED;
-  }
+bool RequestManager::is_background_server_terminated() {
+  return background_server_status == TERMINATED;
+}
 
-  RequestManager *request_manager_singleton = nullptr;
+RequestManager *request_manager_singleton = nullptr;
 
-  /*static*/
-  RequestManager *RequestManager::get_request_manager() {
-    if (request_manager_singleton == nullptr) {
-      request_manager_singleton = new RequestManager();
-    }
-    return request_manager_singleton;
+/*static*/
+RequestManager *RequestManager::get_request_manager() {
+  if (request_manager_singleton == nullptr) {
+    request_manager_singleton = new RequestManager();
   }
+  return request_manager_singleton;
+}
 
-  /* --------- Request Token Tree Related Functions --------- */
-  void RequestManager::init_token_tree(RequestGuid guid) {
-    Request &request = all_requests[guid];
-    request.speculative_token_trees.clear();
-    // Assume we only use one small model for speculation
-    request.speculative_token_trees.emplace_back();
-  }
-
-  void RequestManager::add_root_to_spec_token_tree(
-      RequestGuid guid, BatchConfig::TokenId token_id) {
-    // This method is called by update_llm_verify_results()
-    // The last token in the accepted sequence should be the root of the next
-    // speculation tree. The reason is that the KV cache of this token is not
-    // computed yet, and we need the large model to decode the logit of this
-    // token to verify its childs (the tokens in the first layer). This method
-    // should: construct and add the root token to the empty speculative token
-    // tree, with parent_pos being -1 and log_accumulated_prob being 0.0
-    Request &request = all_requests[guid];
-    TokenTree &speculative_token_tree = request.speculative_token_trees[0];
-    speculative_token_tree.add_layer();
-    auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
-    if (speculative_sampling) {
-      node_ptr->gumbel = true;
-    }
-    speculative_token_tree.tree_layers.front().push_back(node_ptr);
-    request.token_tree_nodes_pq.push(node_ptr);
-  }
+/* --------- Request Token Tree Related Functions --------- */
+void RequestManager::init_token_tree(RequestGuid guid) {
+  Request &request = all_requests[guid];
+  request.speculative_token_trees.clear();
+  // Assume we only use one small model for speculation
+  request.speculative_token_trees.emplace_back();
+}
 
-  void RequestManager::add_tokens_to_spec_token_tree(
-      InferenceResult const &ssm_inference_result) {
+void RequestManager::add_root_to_spec_token_tree(
+    RequestGuid guid, BatchConfig::TokenId token_id) {
+  // This method is called by update_llm_verify_results()
+  // The last token in the accepted sequence should be the root of the next
+  // speculation tree. The reason is that the KV cache of this token is not
+  // computed yet, and we need the large model to decode the logit of this
+  // token to verify its childs (the tokens in the first layer). This method
+  // should: construct and add the root token to the empty speculative token
+  // tree, with parent_pos being -1 and log_accumulated_prob being 0.0
+  Request &request = all_requests[guid];
+  TokenTree &speculative_token_tree = request.speculative_token_trees[0];
+  speculative_token_tree.add_layer();
+  auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
+  if (speculative_sampling) {
+    node_ptr->gumbel = true;
+  }
+  speculative_token_tree.tree_layers.front().push_back(node_ptr);
+  request.token_tree_nodes_pq.push(node_ptr);
+}
 
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        // Request in this slot is unavailable
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
+void RequestManager::add_tokens_to_spec_token_tree(
+    InferenceResult const &ssm_inference_result) {
 
-      int parent_num = request.num_tokens_in_batch;
-      if (parent_num == 0) {
-        continue;
-      }
-      int result_offset = request.first_token_offset_in_batch *
-                          BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-      TokenTree &spec_token_tree = request.speculative_token_trees[0];
-      std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
-          spec_token_tree.tree_layers.back();
-      std::set<std::shared_ptr<TokenTreeNode>, SharedTokenTreeNodePtrLess>
-          tokens;
-      int parent_pos = 0;
-      for (auto const &parent_ptr : last_layer) {
-        // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
-        float parent_log_prob = parent_ptr->log_accumulated_prob;
-        int child_start_idx =
-            result_offset +
-            parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-        // TODO: rename child_probs to child_logits after change the output of
-        // argmax from prob to logprob
-        std::vector<std::pair<float, int>> child_probs(
-            BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
-        for (int child_pos = 0;
-             child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-             child_pos++) {
-          int result_idx = child_start_idx + child_pos;
-          if (!speculative_sampling) {
-            // TODO: the argmax will return log prob instead of prob
-            if (log(ssm_inference_result.probs[result_idx]) !=
-                -std::numeric_limits<float>::infinity()) {
-              child_probs[child_pos] = std::make_pair(
-                  log(ssm_inference_result.probs[result_idx]), result_idx);
-            }
-          } else {
-            // Use gumbel perturbed logits here
-            // TODO: handle the case when the child logit is -inf
-            // TODO: this branch is not tested
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+
+    int parent_num = request.num_tokens_in_batch;
+    if (parent_num == 0) {
+      continue;
+    }
+    int result_offset = request.first_token_offset_in_batch *
+                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    TokenTree &spec_token_tree = request.speculative_token_trees[0];
+    std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
+        spec_token_tree.tree_layers.back();
+    std::set<std::shared_ptr<TokenTreeNode>, SharedTokenTreeNodePtrLess> tokens;
+    int parent_pos = 0;
+    for (auto const &parent_ptr : last_layer) {
+      // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
+      float parent_log_prob = parent_ptr->log_accumulated_prob;
+      int child_start_idx =
+          result_offset +
+          parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      // TODO: rename child_probs to child_logits after change the output of
+      // argmax from prob to logprob
+      std::vector<std::pair<float, int>> child_probs(
+          BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+      for (int child_pos = 0;
+           child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+           child_pos++) {
+        int result_idx = child_start_idx + child_pos;
+        if (!speculative_sampling) {
+          // TODO: the argmax will return log prob instead of prob
+          if (log(ssm_inference_result.probs[result_idx]) !=
+              -std::numeric_limits<float>::infinity()) {
             child_probs[child_pos] = std::make_pair(
-                ssm_inference_result.gumbel_logits[result_idx], result_idx);
+                log(ssm_inference_result.probs[result_idx]), result_idx);
           }
+        } else {
+          // Use gumbel perturbed logits here
+          // TODO: handle the case when the child logit is -inf
+          // TODO: this branch is not tested
+          child_probs[child_pos] = std::make_pair(
+              ssm_inference_result.gumbel_logits[result_idx], result_idx);
         }
-        // Sort in descending order
-        std::sort(child_probs.begin(),
-                  child_probs.end(),
-                  std::greater<std::pair<float, int>>());
+      }
+      // Sort in descending order
+      std::sort(child_probs.begin(),
+                child_probs.end(),
+                std::greater<std::pair<float, int>>());
+      if (speculative_sampling) {
+        // TODO: this branch is not tested
+        // Condition the gumbel perturbed logits on the maximum
+        gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
+      }
+
+      for (auto const &child_prob : child_probs) {
+        float logit = child_prob.first;
+        // The value used to compare between tokens
+        float accumulated_log_prob = logit + parent_log_prob;
+        float gumbel_logit = 0.0f;
+        float cmp_value;
         if (speculative_sampling) {
-          // TODO: this branch is not tested
-          // Condition the gumbel perturbed logits on the maximum
-          gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
+          cmp_value = gumbel_logit = logit;
+        } else {
+          cmp_value = accumulated_log_prob;
         }
-
-        for (auto const &child_prob : child_probs) {
-          float logit = child_prob.first;
-          // The value used to compare between tokens
-          float accumulated_log_prob = logit + parent_log_prob;
-          float gumbel_logit = 0.0f;
-          float cmp_value;
+        int result_idx = child_prob.second;
+
+        //   std::cout << "Probability at result index " << result_idx << ":
+        //   "
+        //             << ssm_inference_result.probs[result_idx] << "\t";
+        //   std::cout << "Token id: "
+        //             << ssm_inference_result.token_ids[result_idx] <<
+        //             std::endl;
+        assert(logit != -std::numeric_limits<float>::infinity() &&
+               "Child log probability should not be -inf.");
+
+        if (tokens.size() == max_tree_width and
+            cmp_value <= (speculative_sampling
+                              ? (*tokens.begin())->gumbel_logit
+                              : (*tokens.begin())->log_accumulated_prob)) {
+          // The current layer is full, and the new token has a lower compare
+          // value than the minimum node in tokens, we don't need to add the
+          // new token and the following tokens belong to the same parent to
+          // it, because the tokens are sorted by their compare value
+          break;
+        } else {
+          std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
           if (speculative_sampling) {
-            cmp_value = gumbel_logit = logit;
+            node_ptr = std::make_shared<TokenTreeNode>(
+                ssm_inference_result.token_ids[result_idx],
+                accumulated_log_prob,
+                parent_pos,
+                true,
+                gumbel_logit);
           } else {
-            cmp_value = accumulated_log_prob;
+            node_ptr = std::make_shared<TokenTreeNode>(
+                ssm_inference_result.token_ids[result_idx],
+                accumulated_log_prob,
+                parent_pos);
           }
-          int result_idx = child_prob.second;
-
-          //   std::cout << "Probability at result index " << result_idx << ":
-          //   "
-          //             << ssm_inference_result.probs[result_idx] << "\t";
-          //   std::cout << "Token id: "
-          //             << ssm_inference_result.token_ids[result_idx] <<
-          //             std::endl;
-          assert(logit != -std::numeric_limits<float>::infinity() &&
-                 "Child log probability should not be -inf.");
-
-          if (tokens.size() == max_tree_width and
-              cmp_value <= (speculative_sampling
-                                ? (*tokens.begin())->gumbel_logit
-                                : (*tokens.begin())->log_accumulated_prob)) {
-            // The current layer is full, and the new token has a lower compare
-            // value than the minimum node in tokens, we don't need to add the
-            // new token and the following tokens belong to the same parent to
-            // it, because the tokens are sorted by their compare value
-            break;
-          } else {
-            std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
-            if (speculative_sampling) {
-              node_ptr = std::make_shared<TokenTreeNode>(
-                  ssm_inference_result.token_ids[result_idx],
-                  accumulated_log_prob,
-                  parent_pos,
-                  true,
-                  gumbel_logit);
-            } else {
-              node_ptr = std::make_shared<TokenTreeNode>(
-                  ssm_inference_result.token_ids[result_idx],
-                  accumulated_log_prob,
-                  parent_pos);
-            }
-            if (tokens.size() == max_tree_width) {
-              // The current layer is full, and the new token has a higher
-              // compare value than the minimum node in tokens, we need to
-              // remove the minimum node from tokens and add the new token to it
-              tokens.erase(tokens.begin());
-            }
-            tokens.insert(node_ptr);
+          if (tokens.size() == max_tree_width) {
+            // The current layer is full, and the new token has a higher
+            // compare value than the minimum node in tokens, we need to
+            // remove the minimum node from tokens and add the new token to it
+            tokens.erase(tokens.begin());
           }
+          tokens.insert(node_ptr);
         }
-        parent_pos++;
       }
+      parent_pos++;
+    }
 
-      // Now add all tokens in the set to the token tree
-      spec_token_tree.add_layer();
-      for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
-           token_it++) {
-        spec_token_tree.tree_layers.back().push_back((*token_it));
-        request.token_tree_nodes_pq.push((*token_it));
-      }
+    // Now add all tokens in the set to the token tree
+    spec_token_tree.add_layer();
+    for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
+         token_it++) {
+      spec_token_tree.tree_layers.back().push_back((*token_it));
+      request.token_tree_nodes_pq.push((*token_it));
     }
   }
+}
 
-  void RequestManager::prune_token_tree() {
-    // Each reqeust has at least one token
-    int budget = get_max_tokens_per_batch() - num_available_requests;
-    assert(budget >= 0);
+void RequestManager::prune_token_tree() {
+  // Each reqeust has at least one token
+  int budget = get_max_tokens_per_batch() - num_available_requests;
+  assert(budget >= 0);
 
-    std::vector<std::pair<double, int>> spare_latency_2_request_index;
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      double spare_latency = request.get_slo_ratio() * baseline_latency_ms *
-                                 request.tokens.size() -
-                             request.decode_latency_ms;
-      assert(spare_latency >= 0.0);
-      spare_latency_2_request_index.push_back(
-          std::make_pair(spare_latency, request_index));
+  std::vector<std::pair<double, int>> spare_latency_2_request_index;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    double spare_latency =
+        request.get_slo_ratio() * baseline_latency_ms * request.tokens.size() -
+        request.decode_latency_ms;
+    assert(spare_latency >= 0.0);
+    spare_latency_2_request_index.push_back(
+        std::make_pair(spare_latency, request_index));
+  }
+
+  // Sort the requests by spare latency in ascending order
+  std::sort(spare_latency_2_request_index.begin(),
+            spare_latency_2_request_index.end(),
+            std::less<std::pair<double, int>>());
+
+  for (auto const &spare_latency_request_index_pair :
+       spare_latency_2_request_index) {
+    int request_index = spare_latency_request_index_pair.second;
+    RequestGuid guid = guid_of_requests[request_index];
+    add_tokens_toward_slo(guid, budget);
+  }
+
+  assert(budget >= 0);
+  if (budget > 0) {
+    if (memory_occupancy) {
+      add_tokens_toward_memory_occupancy(budget);
+    } else {
+      add_tokens_toward_goodput(budget);
     }
+  }
+}
 
-    // Sort the requests by spare latency in ascending order
-    std::sort(spare_latency_2_request_index.begin(),
-              spare_latency_2_request_index.end(),
-              std::less<std::pair<double, int>>());
+void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
+  Request &request = all_requests[guid];
+  double num_tokens_to_decode = (ssm_spec_latency_ms + llm_verify_latency_ms) *
+                                correction_factor /
+                                (baseline_latency_ms * request.get_slo_ratio());
 
-    for (auto const &spare_latency_request_index_pair :
-         spare_latency_2_request_index) {
-      int request_index = spare_latency_request_index_pair.second;
-      RequestGuid guid = guid_of_requests[request_index];
-      add_tokens_toward_slo(guid, budget);
+  double current_added = 1.0;
+  // Include the root of every token tree
+  request.token_tree_nodes_pq.top()->included = true;
+  request.token_tree_nodes_pq.pop();
+
+  while (budget > 0 and current_added < num_tokens_to_decode) {
+    if (request.token_tree_nodes_pq.empty()) {
+      break;
     }
+    auto node_ptr = request.token_tree_nodes_pq.top();
+    request.token_tree_nodes_pq.pop();
+    node_ptr->included = true;
+    current_added += node_ptr->log_accumulated_prob;
+    budget--;
+  }
+}
+
+void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
+  // This is a helper data structure to store help the pruning of the token
+  // trees across different requests.
+  std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
+      SharedTokenTreeNodePtrRequestWeightedGreater>
+      global_token_tree_node_pq;
+
+  // Initialie the priority queue with the top element in each request's token
+  // tree
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    if (request.token_tree_nodes_pq.empty()) {
+      continue;
+    }
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
+      request.token_tree_nodes_pq.pop();
+    }
+  }
 
-    assert(budget >= 0);
-    if (budget > 0) {
+  // Perform dequeue and enqueue until the budget is used up
+  while (budget > 0 and !global_token_tree_node_pq.empty()) {
+    auto [node_ptr, request] = global_token_tree_node_pq.top();
+    global_token_tree_node_pq.pop();
+    node_ptr->included = true;
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
+      request.token_tree_nodes_pq.pop();
     }
+    budget--;
   }
 
-  void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
+  // Clear the priority queue in each requests
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
-    double num_tokens_to_decode =
-        (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
-        (baseline_latency_ms * request.get_slo_ratio());
+    assert(request.status == Request::RUNNING);
+    std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                        std::vector<std::shared_ptr<TokenTreeNode>>,
+                        SharedTokenTreeNodePtrLess>()
+        .swap(request.token_tree_nodes_pq);
+  }
+}
 
-    double current_added = 1.0;
-    // Include the root of every token tree
-    request.token_tree_nodes_pq.top()->included = true;
-    request.token_tree_nodes_pq.pop();
+void RequestManager::add_tokens_toward_goodput(int budget) {
+  // This is a helper data structure to store help the pruning of the token
+  // trees across different requests.
+  std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
+      SharedTokenTreeNodePtrRequestGreater>
+      global_token_tree_node_pq;
+
+  // Initialie the priority queue with the top element in each request's token
+  // tree
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    if (request.token_tree_nodes_pq.empty()) {
+      continue;
+    }
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
+      request.token_tree_nodes_pq.pop();
+    }
+  }
 
-    while (budget > 0 and current_added < num_tokens_to_decode) {
-      if (request.token_tree_nodes_pq.empty()) {
-        break;
-      }
-      auto node_ptr = request.token_tree_nodes_pq.top();
+  // Perform dequeue and enqueue until the budget is used up
+  while (budget > 0 and !global_token_tree_node_pq.empty()) {
+    auto [node_ptr, request] = global_token_tree_node_pq.top();
+    global_token_tree_node_pq.pop();
+    node_ptr->included = true;
+    if (!request.token_tree_nodes_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_pq.top(), request});
       request.token_tree_nodes_pq.pop();
-      node_ptr->included = true;
-      current_added += node_ptr->log_accumulated_prob;
-      budget--;
-    }
-  }
-
-  void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
-    // This is a helper data structure to store help the pruning of the token
-    // trees across different requests.
-    std::priority_queue<
-        std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
-        std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
-        SharedTokenTreeNodePtrRequestGreater>
-        global_token_tree_node_pq;
-
-    // Initialie the priority queue with the top element in each request's token
-    // tree
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      if (request.token_tree_nodes_pq.empty()) {
-        continue;
-      }
-      if (!request.token_tree_nodes_pq.empty()) {
-        global_token_tree_node_pq.push(
-            {request.token_tree_nodes_pq.top(), request});
-        request.token_tree_nodes_pq.pop();
-      }
     }
+    budget--;
+  }
 
-    // Perform dequeue and enqueue until the budget is used up
-    while (budget > 0 and !global_token_tree_node_pq.empty()) {
-      auto [node_ptr, request] = global_token_tree_node_pq.top();
-      global_token_tree_node_pq.pop();
-      node_ptr->included = true;
-      if (!request.token_tree_nodes_pq.empty()) {
-        global_token_tree_node_pq.push(
-            {request.token_tree_nodes_pq.top(), request});
-        request.token_tree_nodes_pq.pop();
-      }
-      budget--;
+  // Clear the priority queue in each requests
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
     }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    std::priority_queue<std::shared_ptr<TokenTreeNode>,
+                        std::vector<std::shared_ptr<TokenTreeNode>>,
+                        SharedTokenTreeNodePtrLess>()
+        .swap(request.token_tree_nodes_pq);
+  }
+}
 
-    // Clear the priority queue in each requests
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         ++request_index) {
-      if (!request_available[request_index]) {
-        continue;
+std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
+  os << "Token tree: " << std::endl;
+  int layer_idx = 0;
+  for (auto const &layer : token_tree.tree_layers) {
+    os << "Layer: " << layer_idx << std::endl;
+    int token_pos = 0;
+    for (auto const &node : layer) {
+      if (node->included) {
+        os << "token pos: " << token_pos << "\ttoken id: " << node->id
+           << "\tparent pos: " << node->parent_pos
+           << "\tlog prob: " << node->log_accumulated_prob << std::endl;
       }
-      RequestGuid guid = guid_of_requests[request_index];
-      Request &request = all_requests[guid];
-      assert(request.status == Request::RUNNING);
-      std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                          std::vector<std::shared_ptr<TokenTreeNode>>,
-                          SharedTokenTreeNodePtrLess>()
-          .swap(request.token_tree_nodes_pq);
-    }
-  }
-
-  std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
-    os << "Token tree: " << std::endl;
-    int layer_idx = 0;
-    for (auto const &layer : token_tree.tree_layers) {
-      os << "Layer: " << layer_idx << std::endl;
-      int token_pos = 0;
-      for (auto const &node : layer) {
-        if (node->included) {
-          os << "token pos: " << token_pos << "\ttoken id: " << node->id
-             << "\tparent pos: " << node->parent_pos
-             << "\tlog prob: " << node->log_accumulated_prob << std::endl;
-        }
-        token_pos++;
-      }
-      layer_idx++;
+      token_pos++;
     }
-    return os;
+    layer_idx++;
   }
+  return os;
+}
 
-  /* --------- Request Token Tree Related Functions --------- */
-
-  /* --------- Profiling Related Functions --------- */
-  void RequestManager::reset_profiling_statistics() {
-    profiling.llm_step_times.clear();
-    profiling.requests_per_step.clear();
-    profiling.ssm_step_times.clear();
-    profiling.ssm_steps.clear();
-    profiling.generated_tokens_per_step.clear();
-    profiling.llm_step_start = 0;
-    profiling.ssm_step_start = 0;
-    profiling.server_start_time = Realm::Clock::current_time_in_microseconds();
-  }
-  /* --------- Profiling Related Functions --------- */
+/* --------- Request Token Tree Related Functions --------- */
+
+/* --------- Profiling Related Functions --------- */
+void RequestManager::reset_profiling_statistics() {
+  profiling.llm_step_times.clear();
+  profiling.requests_per_step.clear();
+  profiling.ssm_step_times.clear();
+  profiling.ssm_steps.clear();
+  profiling.generated_tokens_per_step.clear();
+  profiling.llm_step_start = 0;
+  profiling.ssm_step_start = 0;
+  profiling.server_start_time = Realm::Clock::current_time_in_microseconds();
+}
+/* --------- Profiling Related Functions --------- */
 }; // namespace FlexFlow

From fc4c1cd8b41abf5954f8e71a6b0abffa2867c6cf Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 10:51:48 -0700
Subject: [PATCH 436/667] docs: minor

---
 .../ops/kernels/inc_multihead_self_attention_kernels.h         | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 969c5ad6e..7c523919c 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -44,6 +44,8 @@ void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                       DataType data_type,
                       ffStream_t stream);
 
+// [For the tokens in batch]
+// Compute qkv projection for the tokens in the batch.
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                  BatchConfig const *bc,
@@ -80,6 +82,7 @@ void apply_pos_encoding_to_streaming_proj(
 // Update the kv cache, and compact the q array.
 // Source: qkv projeciton array of tokens in the batch.
 // Destination: q&kv ptr took by the attention kernel.
+// Note that the q&k here are the value after applying with position encoding.
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,

From e55cc6ef3e44de82047f51f9659a13d830da0f46 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 11:20:15 -0700
Subject: [PATCH 437/667] chore: minor rename

---
 .../ops/kernels/inc_multihead_self_attention_kernels.h |  2 +-
 src/ops/inc_multihead_self_attention.cu                |  2 +-
 .../kernels/inc_multihead_self_attention_kernels.cu    | 10 +++++-----
 src/ops/spec_inc_multihead_self_attention.cu           |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu           |  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 7c523919c..1b6c49a3d 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -62,7 +62,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 // For other Key tokens like in streaming cache, we nned other kernel to apply
 // the position embedding.
 template <typename DT>
-void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
+void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         DT *output_ptr,
                         cudaStream_t stream);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 4cb860ddb..b5e2ce7bc 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -256,7 +256,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding(m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
+  apply_pos_encoding_to_tokens_in_batch(m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   // phase 2: Update key/val cache
   update_qkv_in_batch<DT>(m, bc, stream);
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 61b010ba6..6b3514657 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -298,7 +298,7 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 __global__ void
-    apply_pos_encoding_kernel(DT *input_ptr,
+    apply_pos_encoding_to_tokens_in_batch_kernel(DT *input_ptr,
                               BatchConfig::PerRequestInfo const *requestInfos,
                               BatchConfig::PerTokenInfo const *tokenInfos,
                               int qk_dim,
@@ -350,7 +350,7 @@ __global__ void
 }
 
 template <typename DT>
-void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
+void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                         BatchConfig const *bc,
                         DT *output_ptr,
                         cudaStream_t stream) {
@@ -361,7 +361,7 @@ void apply_pos_encoding(IncMultiHeadSelfAttentionMeta const *m,
   int num_tokens = bc->num_active_tokens();
   int parallelism = num_tokens * m->local_hidden_size;
   size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
-  apply_pos_encoding_kernel<<<GET_BLOCKS(parallelism),
+  apply_pos_encoding_to_tokens_in_batch_kernel<<<GET_BLOCKS(parallelism),
                               min(CUDA_NUM_THREADS, parallelism),
                               0,
                               stream>>>(output_ptr,
@@ -916,13 +916,13 @@ template void Kernels::IncMultiHeadAttention::compute_qkv<half>(
     half const *bias_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::apply_pos_encoding<float>(
+template void Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     float *output_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::apply_pos_encoding<half>(
+template void Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     half *output_ptr,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index ea4c17a5b..9da3b5f92 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -251,7 +251,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding(m,
+  apply_pos_encoding_to_tokens_in_batch(m,
                      bc,
                      static_cast<DT *>(m->devQKVProjArray),
                      stream);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5cf58b543..5898f558a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -400,7 +400,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding(m,
+  apply_pos_encoding_to_tokens_in_batch(m,
                      bc,
                      static_cast<DT *>(m->devQKVProjArray),
                      stream);

From 8f056af2d1b0f91672c6a0e94bee0b28de1abaa3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 11:52:09 -0700
Subject: [PATCH 438/667] feat: add streaming-llm logic to attention

---
 src/ops/inc_multihead_self_attention.cu       |  69 ++++------
 .../inc_multihead_self_attention_kernels.cu   | 120 +++++++-----------
 src/ops/spec_inc_multihead_self_attention.cu  |  39 +++---
 src/runtime/batch_config.cc                   |   3 +
 4 files changed, 98 insertions(+), 133 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index b5e2ce7bc..da19335fd 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -246,7 +246,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
 
-  // phase 1: Implement kernel to compute KQV for input tokens
+  // phase 1: Compute QKV Projections of the batch
   compute_qkv(m,
               bc,
               shard_id,
@@ -256,54 +256,30 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding_to_tokens_in_batch(m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
-
-  // phase 2: Update key/val cache
-  update_qkv_in_batch<DT>(m, bc, stream);
-
-  // cudaEventRecord(t_end, stream);
-  // checkCUDA(cudaEventSynchronize(t_end));
-  // float elapsed = 0;
-  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // cudaEventDestroy(t_start);
-  // cudaEventDestroy(t_end);
-  // std::cout << "Prepare attn time: " << elapsed << " ms\n";
+  // phase 2: First maintain the streaming cache, because it need
+  // pre-pos-encoding values
+  if (m->streaming_cache) {
+    // Move pre-pos-encoding cache to where took by attention
+    update_kv_in_streaming_cache<DT>(m, bc, stream);
+    // Apply pos-encoding to those k values
+    apply_pos_encoding_to_streaming_proj<DT>(m, bc, stream);
+    // Commit to the streaming cache
+    commit_kv<DT>(m, bc, stream);
+  }
 
-  // cudaEventCreate(&t_start);
-  // cudaEventCreate(&t_end);
-  // cudaEventRecord(t_start, stream);
+  // phase 3: Take care of the batch
+  {
+    // Apply pos-encoding to the batch
+    apply_pos_encoding_to_tokens_in_batch(
+        m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
+    // Move the batch qkv values to where took by attention
+    update_qkv_in_batch<DT>(m, bc, stream);
+  }
 
-  // phase 3: Compute attention score
+  // phase 4: Attention computation
   incr_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
-  // cudaEventRecord(t_end, stream);
-  // checkCUDA(cudaEventSynchronize(t_end));
-  // elapsed = 0;
-  // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
-  // cudaEventDestroy(t_start);
-  // cudaEventDestroy(t_end);
-  // std::cout << "Attn time: " << elapsed << " ms\n";
-
-  // Debug output:
-  //   int size = m->local_hidden_size * BatchConfig::max_tokens_per_batch();
-  //   float *temp_output = new float[size];
-  //   cudaDeviceSynchronize();
-  //   cudaMemcpy(
-  //       temp_output, m->attn_heads, size * sizeof(float),
-  //       cudaMemcpyDeviceToHost);
-  //   printf("Output: ");
-  //   float temp = 0;
-  //   for (int i = 0; i < 1; ++i) {
-  //     for (int j = 0; j < m->local_hidden_size; ++j) {
-  //       temp += temp_output[i * m->local_hidden_size + j];
-  //     }
-  //     printf("%.6f ", temp);
-  //   }
-  //   printf("\n");
-
-  //   delete[] temp_output;
-
-  // compute output production and bias together for all tokens
+  // phase 5: Compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
   compute_o_prod_bias(
       m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream);
@@ -625,7 +601,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     streaming_cache_infos = reinterpret_cast<StreamingCacheInfo *>(
         reinterpret_cast<char *>(handler.batch_config_metadata) +
         sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-        sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask));
+        sizeof(BatchConfig::request_available) +
+        sizeof(BatchConfig::causalMask));
 
     if (offload) {
       // token_infos =
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 6b3514657..0de12cc79 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -297,16 +297,13 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void
-    apply_pos_encoding_to_tokens_in_batch_kernel(DT *input_ptr,
-                              BatchConfig::PerRequestInfo const *requestInfos,
-                              BatchConfig::PerTokenInfo const *tokenInfos,
-                              int qk_dim,
-                              int num_tokens,
-                              size_t q_array_size,
-                              int hidden_size,
-                              bool streaming_cache,
-                              StreamingCacheInfo const *streaming_cache_infos) {
+__global__ void apply_pos_encoding_to_tokens_in_batch_kernel(
+    DT *input_ptr,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    int qk_dim,
+    int num_tokens,
+    size_t q_array_size,
+    int hidden_size) {
   CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) {
     // create complex number
     bool q_tensor = i < (q_array_size / 2);
@@ -333,13 +330,6 @@ __global__ void
 
     size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
-    // relative position should be calculated based on current streaming size
-    if (streaming_cache) {
-      int req_idx = tokenInfos[token_idx].request_index;
-      pos += streaming_cache_infos[req_idx].commit_len -
-             requestInfos[req_idx].first_token_index_in_request;
-    }
-
     float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size));
     cuFloatComplex complex_pos = {cos(freq), sin(freq)};
 
@@ -350,10 +340,11 @@ __global__ void
 }
 
 template <typename DT>
-void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        DT *output_ptr,
-                        cudaStream_t stream) {
+void apply_pos_encoding_to_tokens_in_batch(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    DT *output_ptr,
+    cudaStream_t stream) {
   // apply rotary embedding if needed
   if (!*m->apply_rotary_embedding) {
     return;
@@ -362,17 +353,16 @@ void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *
   int parallelism = num_tokens * m->local_hidden_size;
   size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
   apply_pos_encoding_to_tokens_in_batch_kernel<<<GET_BLOCKS(parallelism),
-                              min(CUDA_NUM_THREADS, parallelism),
-                              0,
-                              stream>>>(output_ptr,
-                                        m->request_infos,
-                                        m->token_infos,
-                                        m->qk_dim,
-                                        num_tokens,
-                                        q_array_size,
-                                        m->local_hidden_size,
-                                        m->streaming_cache,
-                                        m->streaming_cache_infos);
+                                                 min(CUDA_NUM_THREADS,
+                                                     parallelism),
+                                                 0,
+                                                 stream>>>(
+      output_ptr,
+      m->token_infos,
+      m->qk_dim,
+      num_tokens,
+      q_array_size,
+      m->local_hidden_size);
 }
 
 __global__ void apply_pos_encoding_to_streaming_proj_kernel(
@@ -441,7 +431,8 @@ void apply_pos_encoding_to_streaming_proj(
       BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
       BatchConfig::max_spec_tree_token_num());
   apply_pos_encoding_to_streaming_proj_kernel<<<GET_BLOCKS(parallelism),
-                                               min(CUDA_NUM_THREADS, parallelism),
+                                                min(CUDA_NUM_THREADS,
+                                                    parallelism),
                                                 0,
                                                 stream>>>(
       static_cast<half *>(m->kvCache),
@@ -455,19 +446,16 @@ void apply_pos_encoding_to_streaming_proj(
 }
 
 template <typename DT>
-__global__ void update_qkv_in_batch_kernel(
-    DT *qkv_proj_array,
-    half *qTmp_ptr,
-    half *kvCache_ptr,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    BatchConfig::PerRequestInfo const *requestInfos,
-    int const max_num_pages,
-    int num_q_heads,
-    int num_kv_heads,
-    int head_dim,
-    int num_new_tokens,
-    bool streaming_cache,
-    StreamingCacheInfo const *streaming_cache_infos) {
+__global__ void
+    update_qkv_in_batch_kernel(DT *qkv_proj_array,
+                               half *qTmp_ptr,
+                               half *kvCache_ptr,
+                               BatchConfig::PerTokenInfo const *tokenInfos,
+                               int const max_num_pages,
+                               int num_q_heads,
+                               int num_kv_heads,
+                               int head_dim,
+                               int num_new_tokens) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
@@ -481,11 +469,6 @@ __global__ void update_qkv_in_batch_kernel(
   int const req_idx = tokenInfos[token_idx].request_index;
   int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-  if (streaming_cache) {
-    token_abs_idx += streaming_cache_infos[req_idx].commit_len -
-                     requestInfos[req_idx].first_token_index_in_request;
-  }
-
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(qkv_proj_array[from_idx + offset]);
@@ -523,14 +506,11 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                                          static_cast<half *>(m->queryTmp),
                                          static_cast<half *>(m->kvCache),
                                          m->token_infos,
-                                         m->request_infos,
                                          max_num_pages,
                                          m->num_q_heads,
                                          m->num_kv_heads,
                                          m->qk_dim,
-                                         num_new_tokens,
-                                         m->streaming_cache,
-                                         m->streaming_cache_infos);
+                                         num_new_tokens);
 }
 
 __global__ void update_kv_in_streaming_cache_kernel(
@@ -659,9 +639,7 @@ __global__ void
   int const request_idx = tokenInfos[token_idx].request_index;
 
   StreamingCacheInfo const &info = streaming_cache_infos[request_idx];
-  int to_idx = tokenInfos[token_idx].abs_index_in_request -
-               requestInfos[request_idx].first_token_index_in_request +
-               info.commit_len;
+  int to_idx = tokenInfos[token_idx].abs_index_in_request;
   // cases that get over the boundary:
   // 1. commit_len < sink_cache_size: commit to sink, window, window_back is
   // after commit_len.
@@ -672,9 +650,7 @@ __global__ void
   // window is full before this commit, window_back is pointing to the real
   // position.
   if (to_idx >= info.sink_cache_size + info.window_cache_size) {
-    to_idx = tokenInfos[token_idx].abs_index_in_request -
-             requestInfos[request_idx].first_token_index_in_request +
-             info.window_back;
+    to_idx = to_idx - info.commit_len + info.window_back;
     if (info.commit_len < info.sink_cache_size) {
       // For case 1, compensating for sink offset, because window_back is
       // someway back from commit_len.
@@ -916,17 +892,19 @@ template void Kernels::IncMultiHeadAttention::compute_qkv<half>(
     half const *bias_ptr,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<float>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    BatchConfig const *bc,
-    float *output_ptr,
-    cudaStream_t stream);
+template void
+    Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<
+        float>(IncMultiHeadSelfAttentionMeta const *m,
+               BatchConfig const *bc,
+               float *output_ptr,
+               cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<half>(
-    IncMultiHeadSelfAttentionMeta const *m,
-    BatchConfig const *bc,
-    half *output_ptr,
-    cudaStream_t stream);
+template void
+    Kernels::IncMultiHeadAttention::apply_pos_encoding_to_tokens_in_batch<half>(
+        IncMultiHeadSelfAttentionMeta const *m,
+        BatchConfig const *bc,
+        half *output_ptr,
+        cudaStream_t stream);
 
 template void
     Kernels::IncMultiHeadAttention::apply_pos_encoding_to_streaming_proj<float>(
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 9da3b5f92..16dbe7476 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -239,9 +239,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
                       DT *output_ptr,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
-  // phase 1: Implement kernel to compute KQV for input tokens
-
-  // long long time_1 = Realm::Clock::current_time_in_microseconds(), time_2;
+  // phase 1: Compute QKV Projections of the batch
   compute_qkv(m,
               bc,
               shard_id,
@@ -251,16 +249,29 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding_to_tokens_in_batch(m,
-                     bc,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     stream);
+  // phase 2: First maintain the streaming cache, because it need
+  // pre-pos-encoding values
+  if (m->streaming_cache) {
+    // Move pre-pos-encoding cache to where took by attention
+    update_kv_in_streaming_cache<DT>(m, bc, stream);
+    // Apply pos-encoding to those k values
+    apply_pos_encoding_to_streaming_proj<DT>(m, bc, stream);
+    // Commit to the streaming cache
+    if (bc->prompt_phase) {
+      commit_kv<DT>(m, bc, stream);
+    }
+  }
 
-  // phase 2: Update key/val cache
-  update_qkv_in_batch<DT>(m, bc, stream);
+  // phase 3: Take care of the batch
+  {
+    // Apply pos-encoding to the batch
+    apply_pos_encoding_to_tokens_in_batch(
+        m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
+    // Move the batch qkv values to where took by attention
+    update_qkv_in_batch<DT>(m, bc, stream);
+  }
 
-  // phase 3: Compute attention score
-  // 3 kernels for pahse 3: matmul1 - softmax - matmal2
+  // phase 4: Attention computation
   tree_search_attention<DT>(m, bc, static_cast<DT *>(m->attn_heads), stream);
 
   // Debug output:
@@ -283,14 +294,10 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta *m,
 
   //   delete[] temp_output;
 
-  // compute output production and bias together for all tokens
+  // phase 5: Compute output production and bias together for all tokens
   int num_tokens = bc->num_active_tokens();
-
   compute_o_prod_bias(
       m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream);
-  // time_2 = Realm::Clock::current_time_in_microseconds();
-  // std::cout << "SpecIncMultiHeadSelfAttention kernel time: "
-  //           << (time_2 - time_1) << "us" << std::endl;
 }
 
 } // namespace SpecIncMultiHeadSelfAttention
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index a3ad2894a..d7696d064 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -296,6 +296,9 @@ void StreamingCacheInfo::reset_cache() {
 }
 
 int StreamingCacheInfo::global_2_cache_index(int global_index) {
+  if (global_index < sink_cache_size) {
+    return global_index;
+  }
   return (global_index - sink_cache_size) % window_cache_size + sink_cache_size;
 }
 

From 813e43f3a2255a699d9b12407edaed87d365f3f3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 14:20:38 -0700
Subject: [PATCH 439/667] fix: typo

---
 src/runtime/request_manager.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2f6356792..c9ef79f2b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -785,7 +785,6 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     int guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    request.llm_cache_size++;
     if (streaming_cache) {
       request.streaming_cache_info.commit_cache(1);
       request.llm_cache_size = request.streaming_cache_info.commit_len;

From e1477d492453d894eff77d7e74cb1ad6e9a754e4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 16:41:54 -0700
Subject: [PATCH 440/667] fix: minor bugs in streaming llm

---
 .../inc_multihead_self_attention_kernels.cu   | 24 ++++++++++++++++++-
 src/runtime/request_manager.cc                | 14 ++++++-----
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 0de12cc79..7e2efd966 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -265,6 +265,9 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   //   }
 
   int num_tokens = bc->num_active_tokens();
+  if (num_tokens == 0) {
+    return;
+  }
   int parallelism = m->qk_dim * num_tokens * m->num_q_heads;
 
   // Step 2: apply bias for QKV, or scale the query
@@ -350,6 +353,9 @@ void apply_pos_encoding_to_tokens_in_batch(
     return;
   }
   int num_tokens = bc->num_active_tokens();
+  if (num_tokens == 0) {
+    return;
+  }
   int parallelism = num_tokens * m->local_hidden_size;
   size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
   apply_pos_encoding_to_tokens_in_batch_kernel<<<GET_BLOCKS(parallelism),
@@ -426,6 +432,9 @@ void apply_pos_encoding_to_streaming_proj(
     }
     num_tokens += bc->streamingCacheInfo[req_idx].commit_len;
   }
+  if (num_tokens == 0) {
+    return;
+  }
   int parallelism = num_tokens * kv_hidden_size / 2;
   int const max_num_pages = round_up_pages(
       BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
@@ -495,6 +504,9 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream) {
   int num_new_tokens = bc->num_active_tokens();
+  if (num_new_tokens == 0) {
+    return;
+  }
   int parallelism = m->local_hidden_size * num_new_tokens;
   int const max_num_pages =
       round_up_pages(BatchConfig::max_sequence_length() +
@@ -592,6 +604,9 @@ void update_kv_in_streaming_cache(IncMultiHeadSelfAttentionMeta const *m,
     }
     num_tokens += bc->streamingCacheInfo[req_idx].commit_len;
   }
+  if (num_tokens == 0) {
+    return;
+  }
   int parallelism = kv_hidden_size * num_tokens;
   int const max_num_pages_pre_pos_enc_buf = round_up_pages(
       BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
@@ -681,6 +696,9 @@ void commit_kv(IncMultiHeadSelfAttentionMeta const *m,
   assert(m->streaming_cache);
   int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
   int const num_new_tokens = bc->num_active_tokens();
+  if (num_new_tokens == 0) {
+    return;
+  }
   int parallelism = kv_hidden_size * num_new_tokens;
   int const max_num_pages = round_up_pages(BatchConfig::MAX_STREAMING_POS -
                                            BatchConfig::get_max_tree_depth());
@@ -714,7 +732,11 @@ void produce_output(IncMultiHeadSelfAttentionMeta const *m,
                     BatchConfig const *bc,
                     DT *output_ptr,
                     cudaStream_t stream) {
-  int parallelism = m->v_dim * m->num_q_heads * bc->num_active_tokens();
+  int const num_tokens = bc->num_active_tokens();
+  if (num_tokens == 0) {
+    return;
+  }
+  int parallelism = m->v_dim * m->num_q_heads * num_tokens;
   produce_output_kernel<<<GET_BLOCKS(parallelism),
                           min(CUDA_NUM_THREADS, parallelism),
                           0,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c9ef79f2b..80ee9135c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -305,11 +305,12 @@ RequestManager::RequestGuid
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
-    request.streaming_cache_info = StreamingCacheInfo(
-        BatchConfig::SINK_SIZE,
-        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
   }
 
+  request.streaming_cache_info = StreamingCacheInfo(
+      BatchConfig::SINK_SIZE,
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {
@@ -367,11 +368,12 @@ RequestManager::RequestGuid
     std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
-    request.streaming_cache_info = StreamingCacheInfo(
-        BatchConfig::SINK_SIZE,
-        BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
   }
 
+  request.streaming_cache_info = StreamingCacheInfo(
+      BatchConfig::SINK_SIZE,
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
   {

From 2f9ef18536543dd4e9393c08caa7c3b208505e3f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 19:28:24 -0700
Subject: [PATCH 441/667] fix: minor runtime bug

---
 src/runtime/request_manager.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 80ee9135c..eaddb255c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -309,7 +309,8 @@ RequestManager::RequestGuid
 
   request.streaming_cache_info = StreamingCacheInfo(
       BatchConfig::SINK_SIZE,
-      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE -
+          BatchConfig::get_max_tree_depth());
 
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
@@ -372,7 +373,8 @@ RequestManager::RequestGuid
 
   request.streaming_cache_info = StreamingCacheInfo(
       BatchConfig::SINK_SIZE,
-      BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth());
+      BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE -
+          BatchConfig::get_max_tree_depth());
 
   pending_request_queue.push(request);
   all_requests[request.guid] = request;
@@ -745,7 +747,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   }
   prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
 
-  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
+  if (prefill_request->llm_prefill_len == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);

From 30867e0cb673514a3f19251da29e776504af7a8d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 3 Sep 2024 19:53:40 -0700
Subject: [PATCH 442/667] Added statics.

---
 src/runtime/request_manager.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2f9dc3250..9219fc114 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2410,6 +2410,16 @@ void RequestManager::terminate_background_server() {
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
 
+    std::string mean_generated_tokens_per_step =
+        "\n mean_generated_tokens_per_step( ";
+    double mean_generated_tokens = 0;
+    for (int nb : profiling.generated_tokens_per_step) {
+      mean_generated_tokens += nb;
+    }
+    mean_generated_tokens /= profiling.generated_tokens_per_step.size();
+    mean_generated_tokens_per_step += std::to_string(mean_generated_tokens);
+    mean_generated_tokens_per_step += ")";
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate

From 7f7daebb062910ea6d09df9e30a9ca128cc70c80 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Wed, 4 Sep 2024 10:59:49 -0400
Subject: [PATCH 443/667] Fix output.

---
 src/runtime/request_manager.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9219fc114..38d247c87 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2419,6 +2419,7 @@ void RequestManager::terminate_background_server() {
     mean_generated_tokens /= profiling.generated_tokens_per_step.size();
     mean_generated_tokens_per_step += std::to_string(mean_generated_tokens);
     mean_generated_tokens_per_step += ")";
+    str += mean_generated_tokens_per_step;
 
     write_to_output_file("", str);
     background_server_status = TERMINATED;

From 417e70a165a28d15ec56c696cc8c62e8163242a0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 13:10:04 -0700
Subject: [PATCH 444/667] Fix a bug.

---
 src/runtime/request_manager.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 38d247c87..546b6e206 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -795,18 +795,18 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   }
   prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
 
-  if (prefill_request->llm_cache_size == prefill_request->tokens.size()) {
+  if (prefill_request->llm_prefill_len == prefill_request->tokens.size()) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
-    std::cout << std::endl;
-    std::cout << std::endl;
-    std::cout << std::endl;
-    std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
-              << std::endl;
-    std::cout << std::endl;
-    std::cout << std::endl;
-    std::cout << std::endl;
+    // std::cout << std::endl;
+    // std::cout << std::endl;
+    // std::cout << std::endl;
+    // std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
+    //           << std::endl;
+    // std::cout << std::endl;
+    // std::cout << std::endl;
+    // std::cout << std::endl;
     prefill_completed = true;
 
     if (prefill_request->tokens.back() == eos_token_id) {

From b5eeb267b72c452523ea80c9bf7712c72b16a2c9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 16:44:29 -0700
Subject: [PATCH 445/667] chore: minor output

---
 src/runtime/batch_config.cc | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index d7696d064..89c642d48 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -162,20 +162,18 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   }
 
   // Streaming cache info
-  if (bc.inference_mode == TREE_SEARCH_MODE) {
-    os << "Streaming cache info:\n";
-    for (int i = 0; i < bc.max_requests_per_batch(); i++) {
-      if (bc.request_available[i]) {
-        os << "  Request " << i << ":\n";
-        os << "    Sink cache size: "
-           << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
-        os << "    Window cache size: "
-           << bc.streamingCacheInfo[i].window_cache_size << std::endl;
-        os << "    Window back: " << bc.streamingCacheInfo[i].window_back
-           << std::endl;
-        os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
-           << std::endl;
-      }
+  os << "Streaming cache info:\n";
+  for (int i = 0; i < bc.max_requests_per_batch(); i++) {
+    if (bc.request_available[i]) {
+      os << "  Request " << i << ":\n";
+      os << "    Sink cache size: "
+          << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+      os << "    Window cache size: "
+          << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+      os << "    Window back: " << bc.streamingCacheInfo[i].window_back
+          << std::endl;
+      os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
+          << std::endl;
     }
   }
 

From 13850bb65b6a3d7481ca447f55bad4b30dd03a02 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 19:37:57 -0700
Subject: [PATCH 446/667] fix: minor offset transition bug

---
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 7e2efd966..d260dc50a 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -683,10 +683,14 @@ __global__ void
          to_v_idx = get_v_entry_offset(
              request_idx, to_idx, max_num_pages, num_kv_heads, head_dim);
 
+  int const stride = num_q_heads / num_kv_heads;
+  int const kv_offset =
+      offset / head_dim * stride * head_dim + offset % head_dim;
+
   pre_pos_enc_buf[to_k_idx + offset] =
-      static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + offset]);
+      static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
   pre_pos_enc_buf[to_v_idx + offset] = static_cast<half>(
-      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + offset]);
+      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
 }
 
 template <typename DT>

From 30d17a2321f530aba626f5348c5ade497c5b249c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 20:56:50 -0700
Subject: [PATCH 447/667] chore: minor

---
 src/ops/inc_multihead_self_attention.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index da19335fd..81e4ec3f7 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -506,7 +506,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
           size_t max_post_pos_enc_pages =
               round_up_pages(BatchConfig::MAX_STREAMING_POS -
                              BatchConfig::get_max_tree_depth() +
-                             BatchConfig::max_spec_tree_token_num());
+                             max(BatchConfig::max_tokens_per_batch(),
+                                 BatchConfig::max_spec_tree_token_num()));
           key_cache_size = num_kv_heads * qk_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_post_pos_enc_pages * kPagesize;

From 07d57b8af823ca065d8ebd8c88749d7c785db5d9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 21:45:47 -0700
Subject: [PATCH 448/667] Fix bug in counting mean acc rate.

---
 include/flexflow/request_manager.h |  6 +++---
 src/runtime/request_manager.cc     | 14 +++++++++-----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 2331ef220..21501e093 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -371,9 +371,9 @@ class RequestManager {
   int max_tree_width;
   int k;
   // Profile based latency
-  double baseline_latency_ms;
-  double ssm_spec_latency_ms;
-  double llm_verify_latency_ms;
+  double baseline_latency_ms = 1000;
+  double ssm_spec_latency_ms = 50;
+  double llm_verify_latency_ms = 50;
   double correction_factor = 1.05;
 
   State request_manager_status;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f7cf82e03..f9845e868 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2415,11 +2415,15 @@ void RequestManager::terminate_background_server() {
 
     std::string mean_generated_tokens_per_step =
         "\n mean_generated_tokens_per_step( ";
-    double mean_generated_tokens = 0;
-    for (int nb : profiling.generated_tokens_per_step) {
-      mean_generated_tokens += nb;
-    }
-    mean_generated_tokens /= profiling.generated_tokens_per_step.size();
+    double mean_generated_tokens =
+        (double)std::accumulate(profiling.generated_tokens_per_step.begin(),
+                                profiling.generated_tokens_per_step.end(),
+                                0);
+    double total_request_steps =
+        (double)std::accumulate(profiling.requests_per_step.begin(),
+                                profiling.requests_per_step.end(),
+                                0);
+    mean_generated_tokens /= total_request_steps;
     mean_generated_tokens_per_step += std::to_string(mean_generated_tokens);
     mean_generated_tokens_per_step += ")";
     str += mean_generated_tokens_per_step;

From 61177ee2568ff05005ba48ea10b80709c0b92934 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Sep 2024 22:06:14 -0700
Subject: [PATCH 449/667] style: format code

---
 .../inc_multihead_self_attention_kernels.h    |  9 +++----
 inference/models/falcon.cc                    |  2 +-
 inference/models/llama.cc                     | 24 +++++++++----------
 inference/models/starcoder.cc                 |  2 +-
 .../inc_multihead_self_attention_kernels.cu   |  5 ++--
 src/ops/spec_inc_multihead_self_attention.cc  |  2 +-
 src/ops/tree_inc_multihead_self_attention.cu  |  6 ++---
 src/runtime/batch_config.cc                   | 10 ++++----
 src/runtime/graph.cc                          |  3 ++-
 9 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 1b6c49a3d..8f69ad380 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -62,10 +62,11 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
 // For other Key tokens like in streaming cache, we nned other kernel to apply
 // the position embedding.
 template <typename DT>
-void apply_pos_encoding_to_tokens_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                        BatchConfig const *bc,
-                        DT *output_ptr,
-                        cudaStream_t stream);
+void apply_pos_encoding_to_tokens_in_batch(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    DT *output_ptr,
+    cudaStream_t stream);
 
 // [For the tokens in streaming cache]
 // Apply position embedding for k projection in the streaming cache.
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 24c63ea0e..96e85177c 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
             false,   /*position_bias*/
-            false,  /*streaming_cache*/
+            false,   /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 64e54ae6b..16dc2441f 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -151,18 +151,18 @@ void LLAMA::create_llama_model(FFModel &ff,
             llama_config.num_key_value_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
-            0.0f,    /*dropout*/
-            false,   /*qkv_bias*/
-            false,   /*final_bias*/
-            false,   /*add_zero_attn*/
-            DT_NONE, /*data_type*/
-            nullptr, /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
-            streaming_cache,   /*streaming_cache*/
+            0.0f,            /*dropout*/
+            false,           /*qkv_bias*/
+            false,           /*final_bias*/
+            false,           /*add_zero_attn*/
+            DT_NONE,         /*data_type*/
+            nullptr,         /*kernel_initializer*/
+            true,            /*apply_rotary_embedding*/
+            false,           /*scaling query*/
+            1.0f,            /*scaling factor*/
+            true,            /*qk_prod_scaling*/
+            false,           /*position_bias*/
+            streaming_cache, /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 55faec3a7..f531fe988 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -124,7 +124,7 @@ void STARCODER::create_starcoder_model(
             1.0f,                        /*scaling factor*/
             true,                        /*qk_prod_scaling*/
             false,                       /*position_bias*/
-            false,                      /*streaming_cache*/
+            false,                       /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index d260dc50a..e65f2c060 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -689,8 +689,9 @@ __global__ void
 
   pre_pos_enc_buf[to_k_idx + offset] =
       static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
-  pre_pos_enc_buf[to_v_idx + offset] = static_cast<half>(
-      qkv_proj_array[from_idx + q_hidden_size + temp_kv_hidden_size + kv_offset]);
+  pre_pos_enc_buf[to_v_idx + offset] =
+      static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
+                                       temp_kv_hidden_size + kv_offset]);
 }
 
 template <typename DT>
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 5817bd1c4..cfcf783e9 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -296,7 +296,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias) , streaming_cache(_streaming_cache) {
+      position_bias(_position_bias), streaming_cache(_streaming_cache) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5898f558a..8c384c1b0 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -400,10 +400,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
               bias_ptr,
               stream);
 
-  apply_pos_encoding_to_tokens_in_batch(m,
-                     bc,
-                     static_cast<DT *>(m->devQKVProjArray),
-                     stream);
+  apply_pos_encoding_to_tokens_in_batch(
+      m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 89c642d48..308f468f5 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -166,14 +166,14 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   for (int i = 0; i < bc.max_requests_per_batch(); i++) {
     if (bc.request_available[i]) {
       os << "  Request " << i << ":\n";
-      os << "    Sink cache size: "
-          << bc.streamingCacheInfo[i].sink_cache_size << std::endl;
+      os << "    Sink cache size: " << bc.streamingCacheInfo[i].sink_cache_size
+         << std::endl;
       os << "    Window cache size: "
-          << bc.streamingCacheInfo[i].window_cache_size << std::endl;
+         << bc.streamingCacheInfo[i].window_cache_size << std::endl;
       os << "    Window back: " << bc.streamingCacheInfo[i].window_back
-          << std::endl;
+         << std::endl;
       os << "    Commit len: " << bc.streamingCacheInfo[i].commit_len
-          << std::endl;
+         << std::endl;
     }
   }
 
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index ab6421d58..ca8e51d40 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2809,7 +2809,8 @@ void FFModel::deserialize_graph_optimal_view(
             tensor_parallelism_degree;
         float dropout, scaling_factor;
         bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-            scaling_query, qk_prod_scaling, offload, streaming_cache, position_bias;
+            scaling_query, qk_prod_scaling, offload, streaming_cache,
+            position_bias;
         DataType quantization_type;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);

From 81001236a4b890425dee147142609dd95cc36900 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 6 Sep 2024 18:37:40 -0700
Subject: [PATCH 450/667] Removed unused outputs.

---
 src/runtime/request_manager.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f9845e868..e281c0edf 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -803,14 +803,6 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
     // Indicates that the LLM prefilling phase finishes
     prefill_request->tokens.push_back(
         result.token_ids[prefill_request->num_tokens_in_batch - 1]);
-    // std::cout << std::endl;
-    // std::cout << std::endl;
-    // std::cout << std::endl;
-    // std::cout << result.token_ids[prefill_request->num_tokens_in_batch - 1]
-    //           << std::endl;
-    // std::cout << std::endl;
-    // std::cout << std::endl;
-    // std::cout << std::endl;
     prefill_completed = true;
 
     if (prefill_request->tokens.back() == eos_token_id) {

From 3b87329b617407d21f4bac13d1676ae80aad5641 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 6 Sep 2024 19:25:53 -0700
Subject: [PATCH 451/667] Removed unused output.

---
 src/runtime/request_manager.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e281c0edf..d28daa7a9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2547,12 +2547,6 @@ void RequestManager::add_tokens_to_spec_token_tree(
         }
         int result_idx = child_prob.second;
 
-        //   std::cout << "Probability at result index " << result_idx << ":
-        //   "
-        //             << ssm_inference_result.probs[result_idx] << "\t";
-        //   std::cout << "Token id: "
-        //             << ssm_inference_result.token_ids[result_idx] <<
-        //             std::endl;
         assert(logit != -std::numeric_limits<float>::infinity() &&
                "Child log probability should not be -inf.");
 

From f4e46d2622f9735d37965376087cf1197dbb94e8 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 7 Sep 2024 17:48:35 -0400
Subject: [PATCH 452/667] Fix bug.

---
 include/flexflow/request_manager.h | 43 +++++++-----------
 src/runtime/request_manager.cc     | 72 ++++++++++++++++++++++--------
 2 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 21501e093..af196402d 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -213,34 +213,6 @@ struct Request {
   double get_slo_ratio();
 };
 
-// A comparator for std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>
-// This is used to sort the token tree nodes in ascending order
-struct SharedTokenTreeNodePtrRequestWeightedGreater {
-  bool operator()(
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &lhs,
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &rhs) const {
-    if (lhs.first->gumbel) {
-      assert(rhs.first->gumbel);
-      return lhs.first->gumbel_logit * lhs.second.get_length_weight() >
-             rhs.first->gumbel_logit * rhs.second.get_length_weight();
-    }
-    return lhs.first->log_accumulated_prob * lhs.second.get_length_weight() >
-           rhs.first->log_accumulated_prob * rhs.second.get_length_weight();
-  }
-};
-
-struct SharedTokenTreeNodePtrRequestGreater {
-  bool operator()(
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &lhs,
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &> const &rhs) const {
-    if (lhs.first->gumbel) {
-      assert(rhs.first->gumbel);
-      return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
-    }
-    return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
-  }
-};
-
 class RequestManager {
 public:
   enum State {
@@ -301,6 +273,7 @@ class RequestManager {
   void set_streaming_cache(bool streaming_cache);
   bool get_memory_occupancy();
   void set_memory_occupancy(bool memory_occupancy);
+  Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
@@ -361,6 +334,20 @@ class RequestManager {
   int get_num_active_requests();
   int get_empty_request_index();
 
+  // Comparters
+  struct SharedTokenTreeNodePtrRequestGuidWeightedGreater {
+    bool operator()(
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
+        const;
+  };
+  struct SharedTokenTreeNodePtrRequestGuidGreater {
+    bool operator()(
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
+        const;
+  };
+
 private:
   // configuration parameters
   int max_requests_per_batch;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d28daa7a9..4f5bc9e4c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -262,6 +262,44 @@ void RequestManager::set_memory_occupancy(bool memory_occupancy_) {
   memory_occupancy = memory_occupancy_;
 }
 
+Request &RequestManager::get_request_with_guid(RequestGuid guid) {
+  return all_requests[guid];
+}
+
+bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedGreater::
+    operator()(
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
+        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
+        const {
+  if (lhs.first->gumbel) {
+    assert(rhs.first->gumbel);
+    return lhs.first->gumbel_logit * get_request_manager()
+                                         ->get_request_with_guid(lhs.second)
+                                         .get_length_weight() >
+           rhs.first->gumbel_logit * get_request_manager()
+                                         ->get_request_with_guid(rhs.second)
+                                         .get_length_weight();
+  }
+  return lhs.first->log_accumulated_prob *
+             get_request_manager()
+                 ->get_request_with_guid(lhs.second)
+                 .get_length_weight() >
+         rhs.first->log_accumulated_prob *
+             get_request_manager()
+                 ->get_request_with_guid(rhs.second)
+                 .get_length_weight();
+}
+
+bool RequestManager::SharedTokenTreeNodePtrRequestGuidGreater ::operator()(
+    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
+    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs) const {
+  if (lhs.first->gumbel) {
+    assert(rhs.first->gumbel);
+    return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
+  }
+  return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
+}
+
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
                                         int eos_token_id,
@@ -2667,9 +2705,9 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
   std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
-      SharedTokenTreeNodePtrRequestWeightedGreater>
+      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
+      SharedTokenTreeNodePtrRequestGuidWeightedGreater>
       global_token_tree_node_pq;
 
   // Initialie the priority queue with the top element in each request's token
@@ -2686,21 +2724,20 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
       continue;
     }
     if (!request.token_tree_nodes_pq.empty()) {
-      global_token_tree_node_pq.push(
-          {request.token_tree_nodes_pq.top(), request});
+      global_token_tree_node_pq.push({request.token_tree_nodes_pq.top(), guid});
       request.token_tree_nodes_pq.pop();
     }
   }
 
   // Perform dequeue and enqueue until the budget is used up
   while (budget > 0 and !global_token_tree_node_pq.empty()) {
-    auto [node_ptr, request] = global_token_tree_node_pq.top();
+    auto [node_ptr, guid] = global_token_tree_node_pq.top();
     global_token_tree_node_pq.pop();
     node_ptr->included = true;
-    if (!request.token_tree_nodes_pq.empty()) {
+    if (!get_request_with_guid(guid).token_tree_nodes_pq.empty()) {
       global_token_tree_node_pq.push(
-          {request.token_tree_nodes_pq.top(), request});
-      request.token_tree_nodes_pq.pop();
+          {get_request_with_guid(guid).token_tree_nodes_pq.top(), guid});
+      get_request_with_guid(guid).token_tree_nodes_pq.pop();
     }
     budget--;
   }
@@ -2725,9 +2762,9 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
   std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, Request &>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, Request &>>,
-      SharedTokenTreeNodePtrRequestGreater>
+      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
+      SharedTokenTreeNodePtrRequestGuidGreater>
       global_token_tree_node_pq;
 
   // Initialie the priority queue with the top element in each request's token
@@ -2744,21 +2781,20 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
       continue;
     }
     if (!request.token_tree_nodes_pq.empty()) {
-      global_token_tree_node_pq.push(
-          {request.token_tree_nodes_pq.top(), request});
+      global_token_tree_node_pq.push({request.token_tree_nodes_pq.top(), guid});
       request.token_tree_nodes_pq.pop();
     }
   }
 
   // Perform dequeue and enqueue until the budget is used up
   while (budget > 0 and !global_token_tree_node_pq.empty()) {
-    auto [node_ptr, request] = global_token_tree_node_pq.top();
+    auto [node_ptr, guid] = global_token_tree_node_pq.top();
     global_token_tree_node_pq.pop();
     node_ptr->included = true;
-    if (!request.token_tree_nodes_pq.empty()) {
+    if (!get_request_with_guid(guid).token_tree_nodes_pq.empty()) {
       global_token_tree_node_pq.push(
-          {request.token_tree_nodes_pq.top(), request});
-      request.token_tree_nodes_pq.pop();
+          {get_request_with_guid(guid).token_tree_nodes_pq.top(), guid});
+      get_request_with_guid(guid).token_tree_nodes_pq.pop();
     }
     budget--;
   }

From bacc515d43a9e59d720e9bd4a4fd060482468081 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 7 Sep 2024 16:05:14 -0700
Subject: [PATCH 453/667] fix: indeterminate output of customAllReduce

---
 deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
index 27a266fa3..959f52d3d 100644
--- a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
+++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
@@ -379,6 +379,7 @@ void dispatchARKernels(AllReduceStrategyType algo,
     twoShotAllReduceKernel<T, RANKS_PER_NODE>
         <<<blocks_per_grid, threads_per_block, 0, stream>>>(param);
   }
+  multiGpuBarrierKernel<<<1, param.ranks_per_node, 0, stream>>>(param);
 }
 
 template <typename T>

From 3a353871dc37c4ebebf7754f9b0dd22b6cc5b0b6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 8 Sep 2024 15:49:24 -0700
Subject: [PATCH 454/667] fix: request expected latency

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 14 ++++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index af196402d..ec9d9d128 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -273,6 +273,7 @@ class RequestManager {
   void set_streaming_cache(bool streaming_cache);
   bool get_memory_occupancy();
   void set_memory_occupancy(bool memory_occupancy);
+  double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4f5bc9e4c..5c599ac56 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,8 +13,8 @@
  * limitations under the License.
  */
 
-#include "flexflow/request_manager.h"
 #include "flexflow/parallel_ops/parallel_op.h"
+#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -262,6 +262,11 @@ void RequestManager::set_memory_occupancy(bool memory_occupancy_) {
   memory_occupancy = memory_occupancy_;
 }
 
+double RequestManager::get_request_expected_latency(Request &request) {
+  return request.get_slo_ratio() * baseline_latency_ms *
+         (request.tokens.size() - request.llm_prefill_len);
+}
+
 Request &RequestManager::get_request_with_guid(RequestGuid guid) {
   return all_requests[guid];
 }
@@ -1554,9 +1559,7 @@ bool RequestManager::update_llm_verify_results(
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index);
-    } else if (request.decode_latency_ms > request.tokens.size() *
-                                               baseline_latency_ms *
-                                               request.get_slo_ratio()) {
+    } else if (request.decode_latency_ms > get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
       request_completed = true;
       request_complete_clean_up(request_index);
@@ -2649,8 +2652,7 @@ void RequestManager::prune_token_tree() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
     double spare_latency =
-        request.get_slo_ratio() * baseline_latency_ms * request.tokens.size() -
-        request.decode_latency_ms;
+        get_request_expected_latency(request) - request.decode_latency_ms;
     assert(spare_latency >= 0.0);
     spare_latency_2_request_index.push_back(
         std::make_pair(spare_latency, request_index));

From 9b2245be839a4920ec6661e46f036a9e289df578 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 8 Sep 2024 20:18:53 -0700
Subject: [PATCH 455/667] feat: add GenerationRequest

---
 include/flexflow/inference.h             | 11 +++
 include/flexflow/model.h                 |  5 +-
 include/flexflow/request_manager.h       |  3 +-
 inference/incr_decoding/incr_decoding.cc |  2 +-
 inference/spec_infer/spec_infer.cc       |  2 +-
 src/c/flexflow_c.cc                      |  2 +-
 src/runtime/request_manager.cc           | 91 ++++++------------------
 7 files changed, 40 insertions(+), 76 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 82da0d879..bc12d2069 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -40,6 +40,17 @@ struct GenerationConfig {
   }
 };
 
+struct GenerationRequest {
+  std::string prompt;
+  double slo_ratio;
+
+  GenerationRequest(std::string const &prompt_, double slo_ratio_)
+      : prompt(prompt_), slo_ratio(slo_ratio_) {}
+
+  GenerationRequest(std::string const &prompt_)
+      : prompt(prompt_), slo_ratio(1.0) {}
+};
+
 struct GenerationResult {
   using RequestGuid = BatchConfig::RequestGuid;
   using TokenId = BatchConfig::TokenId;
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 6618fdaf8..9e2e0b1dc 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -824,8 +824,9 @@ class FFModel {
   // ========================================
   // Inference APIs
   // ========================================
-  std::vector<GenerationResult> generate(std::vector<std::string> &prompts,
-                                         int max_seq_length);
+  std::vector<GenerationResult> generate(std::vector<std::string> &prompts);
+
+  std::vector<GenerationResult> generate(std::vector<GenerationRequest> &requests);
 
   Tensor create_tensor_legion_ordering(int num_dim,
                                        int const dims[],
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ec9d9d128..9dac1446f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -288,8 +288,7 @@ class RequestManager {
   void serve_spec_infer_sync(FFModel *model);
   void serve_decoding(FFModel *model);
   GenerationResult get_generation_result(RequestGuid const &guid);
-  RequestGuid register_new_request(std::string const &prompt);
-  RequestGuid register_new_request(std::vector<TokenId> const &prompt);
+  RequestGuid register_new_request(GenerationRequest const &req);
   // Methods to start and terminate request manager's background task
   void start_background_server(FFModel *model);
   bool is_background_server_terminated();
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 83f2ba632..716c04ec2 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -296,7 +296,7 @@ void FlexFlow::top_level_task(Task const *task,
       prompts.push_back(text);
     }
     std::vector<GenerationResult> result =
-        model.generate(prompts, 128 /*max_sequence_length*/);
+        model.generate(prompts);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index cc48d9c86..69292e44b 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -477,7 +477,7 @@ void FlexFlow::top_level_task(Task const *task,
       prompts.push_back(text);
       // tree_model.generate(text, 128 /*max_sequence_length*/);
     }
-    tree_model.generate(prompts, 128 /*max_sequence_length*/);
+    tree_model.generate(prompts);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index a398b54ca..dbdebc201 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1607,7 +1607,7 @@ void flexflow_model_generate(flexflow_model_t handle_,
                 max_seq_length);
   }
   std::vector<GenerationResult> results =
-      handle->generate(prompts, max_seq_length);
+      handle->generate(prompts);
   // If the prompt exceeds max seq len, check that we return the prompt with no
   // additional token. Otherwise, check that the output does not exceed the max
   // sequence length.
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5c599ac56..d3cf855b9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
 #include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
@@ -25,6 +26,7 @@
 #include <random>
 #include <stack>
 #include <stdexcept>
+#include <vector>
 
 namespace FlexFlow {
 
@@ -373,67 +375,7 @@ size_t RequestManager::get_num_ssms() {
 }
 
 RequestManager::RequestGuid
-    RequestManager::register_new_request(std::vector<TokenId> const &prompt) {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-
-  // Add a new request
-  Request request;
-  request.status = Request::PENDING;
-  request.guid = next_available_guid++;
-
-  if (prompt.size() >= get_max_sequence_length()) {
-    std::cout << "Warning: too many tokens in prompt, only load up to "
-              << get_max_sequence_length() << " tokens, but got "
-              << prompt.size() << ".\n";
-
-    printf("tokens size: %zu\n", request.tokens.size());
-    return INVALID_GUID;
-  } else {
-    request.tokens = prompt;
-  }
-
-  if (get_num_ssms() == 0) {
-    std::cout << "No small speculative model registered, using incremental "
-                 "decoding."
-              << std::endl;
-  } else {
-    std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
-    assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
-    init_token_tree(request.guid);
-  }
-
-  request.streaming_cache_info = StreamingCacheInfo(
-      BatchConfig::SINK_SIZE,
-      BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE -
-          BatchConfig::get_max_tree_depth());
-
-  pending_request_queue.push(request);
-  all_requests[request.guid] = request;
-  {
-    std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
-    request_to_promise[request.guid] = new std::promise<void>();
-  }
-
-  if (verbose) {
-    std::cout << "new req: " << request.tokens.size() << std::endl;
-    for (int i = 0; i < request.tokens.size(); i++) {
-      std::cout << i << " : " << request.tokens[i] << std::endl;
-    }
-  }
-
-  GenerationResult gr;
-  gr.guid = request.guid;
-  gr.input_text = "";
-  gr.input_tokens = prompt;
-  gr.output_text = "";
-  gr.output_tokens = prompt;
-  request_generation_results[request.guid] = gr;
-
-  return request.guid;
-}
-
-RequestManager::RequestGuid
-    RequestManager::register_new_request(std::string const &prompt) {
+    RequestManager::register_new_request(GenerationRequest const &req) {
   std::lock_guard<std::mutex> const lock(request_queue_mutex);
   // Add a new request
   Request request;
@@ -442,7 +384,7 @@ RequestManager::RequestGuid
   if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
     request.tokens.push_back(bos_token_id);
   }
-  std::vector<int32_t> tokens = this->tokenizer_->Encode(prompt);
+  std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
   if (tokens.size() >= get_max_sequence_length()) {
     std::cout << "Warning: too many tokens in prompt, only load up to "
               << get_max_sequence_length() << " tokens, but got "
@@ -454,7 +396,9 @@ RequestManager::RequestGuid
   for (int i = 0; i < tokens.size(); i++) {
     std::cout << "[" << i << "]" << tokens.at(i) << "\n";
   }
+  std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
+  request.set_slo_ratio(req.slo_ratio);
 
   if (get_num_ssms() == 0) {
     std::cout << "No small speculative model registered, using incremental "
@@ -490,9 +434,9 @@ RequestManager::RequestGuid
 
   GenerationResult gr;
   gr.guid = request.guid;
-  gr.input_text = prompt;
+  gr.input_text = req.prompt;
   gr.input_tokens = request.tokens;
-  gr.output_text = prompt;
+  gr.output_text = req.prompt;
   gr.output_tokens = request.tokens;
   request_generation_results[request.guid] = gr;
   return request.guid;
@@ -1559,7 +1503,8 @@ bool RequestManager::update_llm_verify_results(
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index);
-    } else if (request.decode_latency_ms > get_request_expected_latency(request)) {
+    } else if (request.decode_latency_ms >
+               get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
       request_completed = true;
       request_complete_clean_up(request_index);
@@ -2097,13 +2042,12 @@ void RequestManager::get_verify_results_greedy(
   profiling.generated_tokens_per_step.push_back(total_nb_generated_tokens);
 }
 
-// TODO: the max_seq_length is not used in the current implementation
 std::vector<GenerationResult>
-    FFModel::generate(std::vector<std::string> &prompts, int max_seq_length) {
+    FFModel::generate(std::vector<GenerationRequest> &requests) {
   RequestManager *rm = RequestManager::get_request_manager();
   std::vector<RequestManager::RequestGuid> guids;
-  for (int i = 0; i < prompts.size(); i++) {
-    RequestManager::RequestGuid guid = rm->register_new_request(prompts.at(i));
+  for (GenerationRequest &request : requests) {
+    RequestManager::RequestGuid guid = rm->register_new_request(request);
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
     }
@@ -2115,6 +2059,15 @@ std::vector<GenerationResult>
   return results;
 }
 
+std::vector<GenerationResult>
+    FFModel::generate(std::vector<std::string> &prompts) {
+  std::vector<GenerationRequest> requests;
+  for (std::string &prompt : prompts) {
+    requests.push_back(GenerationRequest(prompt));
+  }
+  return generate(requests);
+}
+
 void RequestManager::start_background_server(FFModel *model) {
   assert(background_server_status == INITIALIZED);
   background_server_status = SERVING;

From 2112b489b05bd1367a15199fe9b7817458b11fbb Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 12:26:25 -0700
Subject: [PATCH 456/667] feat: add EmissionMachine to simulate requests
 arrival

---
 include/flexflow/inference.h             | 42 ++++++++++++++++++++++--
 include/flexflow/model.h                 |  7 ++--
 inference/incr_decoding/incr_decoding.cc |  3 +-
 inference/spec_infer/spec_infer.cc       |  3 +-
 src/runtime/inference_manager.cc         | 29 ++++++++++++++++
 src/runtime/request_manager.cc           | 11 ++++---
 6 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index bc12d2069..1bb89aca9 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -46,9 +46,6 @@ struct GenerationRequest {
 
   GenerationRequest(std::string const &prompt_, double slo_ratio_)
       : prompt(prompt_), slo_ratio(slo_ratio_) {}
-
-  GenerationRequest(std::string const &prompt_)
-      : prompt(prompt_), slo_ratio(1.0) {}
 };
 
 struct GenerationResult {
@@ -61,6 +58,45 @@ struct GenerationResult {
   std::vector<TokenId> output_tokens;
 };
 
+// Contains the configuration for how to emit requests to the server,
+// managing the request arrival rate.
+class EmissionMachine {
+public:
+  enum class EmissionMode { Constant, Poisson, Trace };
+  EmissionMode mode;
+  double last_request_time_ms;
+  double req_per_s;
+
+  EmissionMachine(EmissionMode mode_, double req_per_s_)
+      : mode(mode_), last_request_time_ms(0), req_per_s(req_per_s_) {}
+  void wait_until_next_request();
+
+  // Simulate next request arrival time
+  virtual double get_next_interval_ms() = 0;
+};
+
+class ConstantEmissionMachine : public EmissionMachine {
+public:
+  double interval_ms;
+
+  ConstantEmissionMachine(double req_per_s_)
+      : EmissionMachine(EmissionMode::Constant, req_per_s_),
+        interval_ms(1e3 / req_per_s_) {}
+
+  double get_next_interval_ms() override;
+};
+
+class PoissonEmissionMachine : public EmissionMachine {
+public:
+  double lambda;
+
+  PoissonEmissionMachine(double req_per_s_)
+      : EmissionMachine(EmissionMode::Poisson, req_per_s_), lambda(req_per_s_) {
+  }
+
+  double get_next_interval_ms() override;
+};
+
 #include <string>
 #include <vector>
 
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 9e2e0b1dc..854d27ffc 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -824,9 +824,12 @@ class FFModel {
   // ========================================
   // Inference APIs
   // ========================================
-  std::vector<GenerationResult> generate(std::vector<std::string> &prompts);
+  std::vector<GenerationResult> generate(std::vector<std::string> &prompts,
+                                         EmissionMachine &emission_machine);
 
-  std::vector<GenerationResult> generate(std::vector<GenerationRequest> &requests);
+  std::vector<GenerationResult>
+      generate(std::vector<GenerationRequest> &requests,
+               EmissionMachine &emission_machine);
 
   Tensor create_tensor_legion_ordering(int num_dim,
                                        int const dims[],
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 716c04ec2..972f49cf5 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -295,8 +295,9 @@ void FlexFlow::top_level_task(Task const *task,
       total_num_requests++;
       prompts.push_back(text);
     }
+    ConstantEmissionMachine emission_machine(1.0);
     std::vector<GenerationResult> result =
-        model.generate(prompts);
+        model.generate(prompts, emission_machine);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 69292e44b..10b6ef0be 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -477,7 +477,8 @@ void FlexFlow::top_level_task(Task const *task,
       prompts.push_back(text);
       // tree_model.generate(text, 128 /*max_sequence_length*/);
     }
-    tree_model.generate(prompts);
+    ConstantEmissionMachine emission_machine(1.0);
+    tree_model.generate(prompts, emission_machine);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 2df1d6bfa..b7e3f16d9 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -15,11 +15,13 @@
 
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/graph.h"
+#include "flexflow/inference.h"
 #include "flexflow/model.h"
 #include "flexflow/ops/fused.h"
 #include "flexflow/ops/noop.h"
 #include "flexflow/parallel_ops/parallel_op.h"
 #include "flexflow/request_manager.h"
+#include <random>
 
 namespace FlexFlow {
 
@@ -673,4 +675,31 @@ std::string join_path(std::vector<std::string> const &paths) {
   return joined;
 }
 
+void EmissionMachine::wait_until_next_request() {
+  // use last_request_time to determine the next request time
+  // and sleep until then
+  if (last_request_time_ms == 0) {
+    last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3;
+  }
+  double current_time = Realm::Clock::current_time_in_microseconds() * 1e-3;
+  double time_to_sleep =
+      get_next_interval_ms() - (current_time - last_request_time_ms);
+  if (time_to_sleep > 0) {
+    usleep(static_cast<useconds_t>(time_to_sleep * 1e3));
+  }
+  last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3;
+}
+
+double ConstantEmissionMachine::get_next_interval_ms() {
+  return interval_ms;
+}
+
+double PoissonEmissionMachine::get_next_interval_ms() {
+  // Note that these are static so multiple instances will share the same
+  // generator and distribution.
+  static std::default_random_engine generator(
+      std::chrono::system_clock::now().time_since_epoch().count());
+  static std::exponential_distribution<double> distribution(lambda);
+  return distribution(generator) * 1e3;
+}
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d3cf855b9..b50a2cbff 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2043,7 +2043,8 @@ void RequestManager::get_verify_results_greedy(
 }
 
 std::vector<GenerationResult>
-    FFModel::generate(std::vector<GenerationRequest> &requests) {
+    FFModel::generate(std::vector<GenerationRequest> &requests,
+                      EmissionMachine &emission_machine) {
   RequestManager *rm = RequestManager::get_request_manager();
   std::vector<RequestManager::RequestGuid> guids;
   for (GenerationRequest &request : requests) {
@@ -2051,6 +2052,7 @@ std::vector<GenerationResult>
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
     }
+    emission_machine.wait_until_next_request();
   }
   std::vector<GenerationResult> results;
   for (int i = 0; i < guids.size(); i++) {
@@ -2060,12 +2062,13 @@ std::vector<GenerationResult>
 }
 
 std::vector<GenerationResult>
-    FFModel::generate(std::vector<std::string> &prompts) {
+    FFModel::generate(std::vector<std::string> &prompts,
+                      EmissionMachine &emission_machine) {
   std::vector<GenerationRequest> requests;
   for (std::string &prompt : prompts) {
-    requests.push_back(GenerationRequest(prompt));
+    requests.push_back(GenerationRequest(prompt, 1.0));
   }
-  return generate(requests);
+  return generate(requests, emission_machine);
 }
 
 void RequestManager::start_background_server(FFModel *model) {

From 86e31c37c26753d019fa3c655a6993d33016bd3c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 12:32:16 -0700
Subject: [PATCH 457/667] chore: minor

---
 inference/incr_decoding/incr_decoding.cc | 6 +++---
 inference/spec_infer/spec_infer.cc       | 6 +++---
 src/c/flexflow_c.cc                      | 3 ++-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 972f49cf5..861776ec6 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -288,16 +288,16 @@ void FlexFlow::top_level_task(Task const *task,
                                    /*parser_callback_t */ nullptr,
                                    /*allow_exceptions */ true,
                                    /*ignore_comments */ true);
-    std::vector<std::string> prompts;
+    std::vector<GenerationRequest> requests;
     for (auto &prompt : prompt_json) {
       std::string text = prompt.get<std::string>();
       printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
       total_num_requests++;
-      prompts.push_back(text);
+      requests.push_back(GenerationRequest(text, 1.0));
     }
     ConstantEmissionMachine emission_machine(1.0);
     std::vector<GenerationResult> result =
-        model.generate(prompts, emission_machine);
+        model.generate(requests, emission_machine);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 10b6ef0be..5a855d483 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -469,16 +469,16 @@ void FlexFlow::top_level_task(Task const *task,
                                    /*allow_exceptions */ true,
                                    /*ignore_comments */ true);
 
-    std::vector<std::string> prompts;
+    std::vector<GenerationRequest> requests;
     for (auto &prompt : prompt_json) {
       std::string text = prompt.get<std::string>();
       printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
       total_num_requests++;
-      prompts.push_back(text);
+      requests.push_back(GenerationRequest(text, 1.0));
       // tree_model.generate(text, 128 /*max_sequence_length*/);
     }
     ConstantEmissionMachine emission_machine(1.0);
-    tree_model.generate(prompts, emission_machine);
+    tree_model.generate(requests, emission_machine);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index dbdebc201..5f92b999b 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1606,8 +1606,9 @@ void flexflow_model_generate(flexflow_model_t handle_,
                 text_str.c_str(),
                 max_seq_length);
   }
+  ConstantEmissionMachine emission_machine(1.0);
   std::vector<GenerationResult> results =
-      handle->generate(prompts);
+      handle->generate(prompts, emission_machine);
   // If the prompt exceeds max seq len, check that we return the prompt with no
   // additional token. Otherwise, check that the output does not exceed the max
   // sequence length.

From 0997fad265860b851eaead6a9d709c110c88d8e9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 12:51:37 -0700
Subject: [PATCH 458/667] chore: minor

---
 include/flexflow/inference.h       | 2 +-
 inference/spec_infer/spec_infer.cc | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 1bb89aca9..1871406b9 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -81,7 +81,7 @@ class ConstantEmissionMachine : public EmissionMachine {
 
   ConstantEmissionMachine(double req_per_s_)
       : EmissionMachine(EmissionMode::Constant, req_per_s_),
-        interval_ms(1e3 / req_per_s_) {}
+        interval_ms(req_per_s_ > 0 ? 1e3 / req_per_s_ : 0) {}
 
   double get_next_interval_ms() override;
 };
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 5a855d483..7dc10ab38 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -475,7 +475,6 @@ void FlexFlow::top_level_task(Task const *task,
       printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
       total_num_requests++;
       requests.push_back(GenerationRequest(text, 1.0));
-      // tree_model.generate(text, 128 /*max_sequence_length*/);
     }
     ConstantEmissionMachine emission_machine(1.0);
     tree_model.generate(requests, emission_machine);

From ae0b8e3359840a800620b54927c6e598bf1094c5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 13:13:25 -0700
Subject: [PATCH 459/667] feat: update load_pending_requests logic

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 43 ++++++++++++++++++------------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 9dac1446f..687e8eb5a 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -440,7 +440,7 @@ class RequestManager {
   ProfileInfo profiling;
   std::unordered_map<RequestGuid, RequestProfileInfo> profiling_requests;
   double total_request_run_time;
-  void load_pending_request_to_batch();
+  bool load_pending_request_to_batch();
   void request_complete_clean_up(int batch_index);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b50a2cbff..a4146959a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -26,6 +26,7 @@
 #include <random>
 #include <stack>
 #include <stdexcept>
+#include <thread>
 #include <vector>
 
 namespace FlexFlow {
@@ -527,7 +528,20 @@ BatchConfig
   return prepare_next_batch();
 }
 
-void RequestManager::load_pending_request_to_batch() {
+// Return value: true if load a pending request to the batch
+bool RequestManager::load_pending_request_to_batch() {
+  if (pending_request_queue.empty()) {
+    if (num_available_requests > 0) {
+      // No pending request to process, but there are available requests
+      // in the batch, do nothing
+      return false;
+    }
+    // Wait until there is a pending request
+    while (pending_request_queue.empty()) {
+      printf("Waiting for pending request to process...\n");
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
   assert(!pending_request_queue.empty() && "No pending request to process.");
   RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
@@ -549,6 +563,7 @@ void RequestManager::load_pending_request_to_batch() {
   profiling_requests[guid] = RequestProfileInfo();
   profiling_requests[guid].start_time =
       Realm::Clock::current_time_in_microseconds();
+  return true;
 }
 
 void RequestManager::request_complete_clean_up(int batch_index) {
@@ -648,14 +663,12 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
   if (num_available_requests == 0) {
     // Update nothing
-    if (!pending_request_queue.empty()) {
-      // Load the pending request to the batch
-      load_pending_request_to_batch();
-      request_manager_status = PREFILLING;
-      if (decoding_mode == SPECULATIVE_DECODING) {
-        prefill_model = SSM;
-        current_ssm_step = 0;
-      }
+    // Load the pending request to the batch
+    load_pending_request_to_batch();
+    request_manager_status = PREFILLING;
+    if (decoding_mode == SPECULATIVE_DECODING) {
+      prefill_model = SSM;
+      current_ssm_step = 0;
     }
     return;
   }
@@ -670,9 +683,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 
           // Check if there are more empty slots
           if (num_available_requests < get_max_requests_per_batch() &&
-              !pending_request_queue.empty()) {
+              load_pending_request_to_batch()) {
             // Load the pending request to the batch
-            load_pending_request_to_batch();
             request_manager_status = PREFILLING;
           } else {
             // No more empty slots, start the decoding
@@ -700,9 +712,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             prefill_request = nullptr;
             // Check if there are more empty slots
             if (num_available_requests < get_max_requests_per_batch() &&
-                !pending_request_queue.empty()) {
+                load_pending_request_to_batch()) {
               // Load the pending request to the batch
-              load_pending_request_to_batch();
               prefill_model = SSM;
               current_ssm_step = 0;
             } else {
@@ -727,26 +738,24 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
     case DECODING:
       if (update_llm_decode_results(result)) {
         // A request completed after the decode
-        if (pending_request_queue.empty()) {
+        if (load_pending_request_to_batch() == false) {
           // No pending request to process, continue the speculation
           request_manager_status = DECODING;
         } else {
           request_manager_status = PREFILLING;
-          load_pending_request_to_batch();
         }
       }
       break;
     case LLM_VERIFY:
       if (update_llm_verify_results(result)) {
         // A request completed after the verification
-        if (pending_request_queue.empty()) {
+        if (load_pending_request_to_batch() == false) {
           // No pending request to process, continue the speculation
           request_manager_status = SSM_SPEC;
           current_ssm_step = 0;
           ssm_completed = false;
         } else {
           request_manager_status = PREFILLING;
-          load_pending_request_to_batch();
           prefill_model = SSM;
           current_ssm_step = 0;
         }

From 132f68f8983a16fa2b3314349ddf50ce64e85cf3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 17:41:30 -0700
Subject: [PATCH 460/667] fix: dead lock in request manager; client wait until
 server init

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 28 +++++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 687e8eb5a..1f9a2c024 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -291,6 +291,7 @@ class RequestManager {
   RequestGuid register_new_request(GenerationRequest const &req);
   // Methods to start and terminate request manager's background task
   void start_background_server(FFModel *model);
+  bool is_background_server_serving();
   bool is_background_server_terminated();
   void terminate_background_server();
   static void terminate_background_server_at_exit();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a4146959a..1253a21b0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -19,6 +19,7 @@
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
+#include <cstdio>
 #include <filesystem>
 #include <future>
 #include <iomanip>
@@ -537,11 +538,14 @@ bool RequestManager::load_pending_request_to_batch() {
       return false;
     }
     // Wait until there is a pending request
-    while (pending_request_queue.empty()) {
-      printf("Waiting for pending request to process...\n");
+    while (pending_request_queue.empty() && !is_background_server_terminated()) {
       std::this_thread::sleep_for(std::chrono::milliseconds(1));
     }
+    if (is_background_server_terminated()) {
+      return false;
+    }
   }
+  std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
   assert(!pending_request_queue.empty() && "No pending request to process.");
   RequestGuid guid = pending_request_queue.front().guid;
   pending_request_queue.pop();
@@ -659,7 +663,6 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   std::lock_guard<std::mutex> const rm_state_lock(rm_state_mutex);
-  std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
 
   if (num_available_requests == 0) {
     // Update nothing
@@ -888,6 +891,9 @@ void RequestManager::update_ssm_prefill_results(
 }
 
 BatchConfig RequestManager::prepare_next_batch() {
+  if (is_background_server_terminated()) {
+    return BatchConfig();
+  }
   switch (request_manager_status) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
@@ -2056,6 +2062,12 @@ std::vector<GenerationResult>
                       EmissionMachine &emission_machine) {
   RequestManager *rm = RequestManager::get_request_manager();
   std::vector<RequestManager::RequestGuid> guids;
+
+  // Wait until the request manager is ready
+  while (!rm->is_background_server_serving()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
   for (GenerationRequest &request : requests) {
     RequestManager::RequestGuid guid = rm->register_new_request(request);
     if (guid != RequestManager::INVALID_GUID) {
@@ -2082,7 +2094,6 @@ std::vector<GenerationResult>
 
 void RequestManager::start_background_server(FFModel *model) {
   assert(background_server_status == INITIALIZED);
-  background_server_status = SERVING;
   // Start background task
   Runtime *runtime = Runtime::get_runtime();
   Context ctx = Runtime::get_context();
@@ -2158,6 +2169,7 @@ void RequestManager::serve_decoding(FFModel *llm) {
   { batch_pipeline.push(last_irf); }
 
   reset_profiling_statistics();
+  background_server_status = SERVING;
   while (!is_background_server_terminated()) {
 
     if (batch_pipeline.size() >= 4) {
@@ -2226,6 +2238,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   infer_result_future_pipeline.push(irf_0);
 
   reset_profiling_statistics();
+  background_server_status = SERVING;
   while (!is_background_server_terminated()) {
     if (infer_result_future_pipeline.size() >= 4) {
       // Block here to avoid launching too many batches
@@ -2294,6 +2307,7 @@ void RequestManager::serve_spec_infer_sync(FFModel *llm) {
   request_manager_status = PREFILLING;
   prefill_model = SSM;
 
+  background_server_status = SERVING;
   while (!is_background_server_terminated()) {
     BatchConfigFuture bcf = get_next_batch_config(irf_0, ctx, runtime);
     bcf.get_void_result();
@@ -2331,7 +2345,7 @@ void RequestManager::terminate_background_server_at_exit() {
 }
 
 void RequestManager::terminate_background_server() {
-  if (background_server_status == SERVING) {
+  if (is_background_server_serving()) {
     assert(profiling.llm_step_times.size() ==
            profiling.requests_per_step.size());
     // Write the last profiling statistics to output file
@@ -2435,6 +2449,10 @@ void RequestManager::terminate_background_server() {
   }
 }
 
+bool RequestManager::is_background_server_serving() {
+  return background_server_status == SERVING;
+}
+
 bool RequestManager::is_background_server_terminated() {
   return background_server_status == TERMINATED;
 }

From c57b3eea75528835adf028fb8a83b1d63cda661a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 18:01:34 -0700
Subject: [PATCH 461/667] feat: client support prompt input with slo_ratio

---
 inference/incr_decoding/incr_decoding.cc | 10 +++++++---
 inference/spec_infer/spec_infer.cc       | 10 +++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 861776ec6..8b1f587f2 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -290,10 +290,14 @@ void FlexFlow::top_level_task(Task const *task,
                                    /*ignore_comments */ true);
     std::vector<GenerationRequest> requests;
     for (auto &prompt : prompt_json) {
-      std::string text = prompt.get<std::string>();
-      printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
+      std::string text = prompt["prompt"].get<std::string>();
+      double slo_ratio = prompt["slo_ratio"].get<double>();
+      printf("Prompt[%d] with slo %.3f: %s\n",
+             total_num_requests,
+             slo_ratio,
+             text.c_str());
       total_num_requests++;
-      requests.push_back(GenerationRequest(text, 1.0));
+      requests.push_back(GenerationRequest(text, slo_ratio));
     }
     ConstantEmissionMachine emission_machine(1.0);
     std::vector<GenerationResult> result =
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 7dc10ab38..25e3eb843 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -471,10 +471,14 @@ void FlexFlow::top_level_task(Task const *task,
 
     std::vector<GenerationRequest> requests;
     for (auto &prompt : prompt_json) {
-      std::string text = prompt.get<std::string>();
-      printf("Prompt[%d]: %s\n", total_num_requests, text.c_str());
+      std::string text = prompt["prompt"].get<std::string>();
+      double slo_ratio = prompt["slo_ratio"].get<double>();
+      printf("Prompt[%d] with slo %.3f: %s\n",
+             total_num_requests,
+             slo_ratio,
+             text.c_str());
       total_num_requests++;
-      requests.push_back(GenerationRequest(text, 1.0));
+      requests.push_back(GenerationRequest(text, slo_ratio));
     }
     ConstantEmissionMachine emission_machine(1.0);
     tree_model.generate(requests, emission_machine);

From 2040cf76bce9fba7518b23731157077d32986175 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 18:08:51 -0700
Subject: [PATCH 462/667] feat: add an prompt processing script

---
 inference/utils/process_prompts.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 inference/utils/process_prompts.py

diff --git a/inference/utils/process_prompts.py b/inference/utils/process_prompts.py
new file mode 100644
index 000000000..902662191
--- /dev/null
+++ b/inference/utils/process_prompts.py
@@ -0,0 +1,28 @@
+import json
+import argparse
+
+def read_prompts_from_json(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+        return data
+
+def write_prompts_to_json(file_path, data):
+    with open(file_path, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+
+def process_prompts(input_file, output_file):
+    prompts = read_prompts_from_json(input_file)
+    processed_prompts = [{"prompt": prompt, "slo_ratio": 1.0} for prompt in prompts]
+    write_prompts_to_json(output_file, processed_prompts)
+
+def main():
+    parser = argparse.ArgumentParser(description="Process prompts JSON file and generate slo_ratio for each prompt.")
+    parser.add_argument('input_file', type=str, help="Input JSON file containing prompts.")
+    parser.add_argument('output_file', type=str, help="Output JSON file to save the processed prompts.")
+    
+    args = parser.parse_args()
+
+    process_prompts(args.input_file, args.output_file)
+
+if __name__ == '__main__':
+    main()

From 03ba37e026486092a668994bcbd4e3c81ad9c68a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 18:09:31 -0700
Subject: [PATCH 463/667] style: minor format

---
 src/runtime/request_manager.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1253a21b0..2dbbd3c72 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
+#include "flexflow/request_manager.h"
 #include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
-#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -538,7 +538,8 @@ bool RequestManager::load_pending_request_to_batch() {
       return false;
     }
     // Wait until there is a pending request
-    while (pending_request_queue.empty() && !is_background_server_terminated()) {
+    while (pending_request_queue.empty() &&
+           !is_background_server_terminated()) {
       std::this_thread::sleep_for(std::chrono::milliseconds(1));
     }
     if (is_background_server_terminated()) {

From 36fb00e4d77a8ba05ff0a8e98dd7a7b94dd0a65d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 21:11:12 -0700
Subject: [PATCH 464/667] feat: add slo attainment metric

---
 include/flexflow/request_manager.h |  3 ++-
 src/runtime/request_manager.cc     | 24 +++++++++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 1f9a2c024..30c6569d4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -136,6 +136,7 @@ struct Request {
   double decode_latency_ms = 0.0;
   int ssm_prefill_len = 0;
   int llm_prefill_len = 0;
+  bool attained = false;
 
   int first_token_offset_in_batch = 0;
   int num_tokens_in_batch = 0;
@@ -442,7 +443,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, RequestProfileInfo> profiling_requests;
   double total_request_run_time;
   bool load_pending_request_to_batch();
-  void request_complete_clean_up(int batch_index);
+  void request_complete_clean_up(int batch_index, bool attained);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2dbbd3c72..4aa845e81 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -571,7 +571,7 @@ bool RequestManager::load_pending_request_to_batch() {
   return true;
 }
 
-void RequestManager::request_complete_clean_up(int batch_index) {
+void RequestManager::request_complete_clean_up(int batch_index, bool attained) {
   RequestGuid guid = guid_of_requests[batch_index];
   profiling_requests[guid].finish_time =
       Realm::Clock::current_time_in_microseconds();
@@ -580,6 +580,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   request_available[batch_index] = false;
   num_available_requests--;
   request.status = Request::COMPLETED;
+  request.attained = attained;
 
   // Find the sos and eos in the sequence
   auto bos_it = std::find(
@@ -806,7 +807,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
     prefill_completed = true;
 
     if (prefill_request->tokens.back() == eos_token_id) {
-      request_complete_clean_up(prefill_request->batch_index);
+      request_complete_clean_up(prefill_request->batch_index, true);
     }
 
     if (decoding_mode == SPECULATIVE_DECODING) {
@@ -855,7 +856,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     if (request.tokens.back() == eos_token_id or
         request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
-      request_complete_clean_up(request_index);
+      request_complete_clean_up(request_index, true);
     }
 
     if (verbose) {
@@ -1518,12 +1519,12 @@ bool RequestManager::update_llm_verify_results(
     if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
       request_completed = true;
-      request_complete_clean_up(request_index);
+      request_complete_clean_up(request_index, true);
     } else if (request.decode_latency_ms >
                get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
       request_completed = true;
-      request_complete_clean_up(request_index);
+      request_complete_clean_up(request_index, false);
     } else {
       update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
     }
@@ -2441,6 +2442,19 @@ void RequestManager::terminate_background_server() {
     mean_generated_tokens_per_step += ")";
     str += mean_generated_tokens_per_step;
 
+    std::string slo_attainment = "\n slo_attainment( ";
+    double attainment = 0;
+    for (auto request_pair : all_requests) {
+      Request &request = request_pair.second;
+      if (request.attained) {
+        attainment += 1;
+      }
+    }
+    attainment /= total_requests;
+    slo_attainment += std::to_string(attainment);
+    slo_attainment += ")";
+    str += slo_attainment;
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate

From fd6f610e392c47475d55e8edfbee10ffd0812ef9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Sep 2024 21:29:08 -0700
Subject: [PATCH 465/667] chore: minor

---
 inference/incr_decoding/incr_decoding.cc | 2 +-
 inference/spec_infer/spec_infer.cc       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 8b1f587f2..ceea544ee 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -299,7 +299,7 @@ void FlexFlow::top_level_task(Task const *task,
       total_num_requests++;
       requests.push_back(GenerationRequest(text, slo_ratio));
     }
-    ConstantEmissionMachine emission_machine(1.0);
+    PoissonEmissionMachine emission_machine(1.0);
     std::vector<GenerationResult> result =
         model.generate(requests, emission_machine);
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 25e3eb843..011fc8833 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -480,7 +480,7 @@ void FlexFlow::top_level_task(Task const *task,
       total_num_requests++;
       requests.push_back(GenerationRequest(text, slo_ratio));
     }
-    ConstantEmissionMachine emission_machine(1.0);
+    PoissonEmissionMachine emission_machine(1.0);
     tree_model.generate(requests, emission_machine);
   }
 

From 6f89252279671254c5f0d8c45819246cafa9875d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 10 Sep 2024 16:56:20 -0700
Subject: [PATCH 466/667] feat: separate max_tokens_per_batch for SSM and LLM

---
 include/flexflow/batch_config.h               |  2 +-
 include/flexflow/flexflow_c.h                 |  3 +++
 include/flexflow/request_manager.h            |  6 +++--
 inference/incr_decoding/incr_decoding.cc      | 11 +++++++++
 inference/models/falcon.cc                    |  4 ++--
 inference/models/llama.cc                     |  4 ++--
 inference/models/mpt.cc                       |  4 ++--
 inference/models/opt.cc                       |  4 ++--
 inference/models/starcoder.cc                 |  4 ++--
 inference/spec_infer/spec_infer.cc            | 11 +++++++++
 src/c/flexflow_c.cc                           |  8 +++++++
 src/ops/inc_multihead_self_attention.cpp      |  5 +++-
 src/ops/inc_multihead_self_attention.cu       |  8 ++++---
 src/ops/spec_inc_multihead_self_attention.cpp |  4 ++--
 src/runtime/batch_config.cc                   |  5 ++--
 src/runtime/inference_manager.cc              |  7 ++++--
 src/runtime/request_manager.cc                | 24 ++++++++++++-------
 src/runtime/request_manager.cpp               | 10 +++++++-
 src/runtime/request_manager.cu                | 11 +++++----
 19 files changed, 97 insertions(+), 38 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index d56f4e245..ec8171947 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -67,7 +67,7 @@ class BatchConfig {
   int num_active_tokens() const;
   static int max_requests_per_batch();
   static int max_tokens_per_batch();
-  static int max_verify_tokens_per_batch();
+  static int max_tokens_per_ssm_batch();
   static int max_spec_tree_token_num();
   static int max_sequence_length();
   static int get_max_tree_depth();
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 9bc2c6973..ea94adf39 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -976,6 +976,9 @@ void flexflow_request_manager_set_max_requests_per_batch(
 void flexflow_request_manager_set_max_tokens_per_batch(
     flexflow_request_manager_t handle_, int max_num_tokens);
 
+void flexflow_request_manager_set_max_tokens_per_ssm_batch(
+    flexflow_request_manager_t handle_, int max_num_ssm_tokens);
+
 void flexflow_request_manager_set_max_spec_tree_token_num(
     flexflow_request_manager_t handle_, int max_num_tokens);
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 30c6569d4..54ee050c4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -34,7 +34,7 @@ class InferenceManager {
 public:
   InferenceManager();
   static InferenceManager *get_inference_manager();
-  void compile_model_and_allocate_buffer(FFModel *model);
+  void compile_model_and_allocate_buffer(FFModel *model, bool is_llm = true);
   void init_operators_inference(FFModel *model);
   Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc);
   Legion::FutureMap
@@ -249,9 +249,10 @@ class RequestManager {
   int get_max_requests_per_batch();
   void set_max_tokens_per_batch(int max_num_tokens);
   int get_max_tokens_per_batch();
+  void set_max_tokens_per_ssm_batch(int max_num_ssm_tokens);
+  int get_max_tokens_per_ssm_batch();
   void set_max_spec_tree_token_num(int max_num_tokens);
   int get_max_spec_tree_token_num();
-  int get_max_verify_tokens_per_batch();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
   void set_decoding_mode(DecodingMode mode);
@@ -354,6 +355,7 @@ class RequestManager {
   // configuration parameters
   int max_requests_per_batch;
   int max_tokens_per_batch;
+  int max_tokens_per_ssm_batch;
   int max_spec_tree_token_num;
   int max_sequence_length;
   int max_tree_depth;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index ceea544ee..0541f7502 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -47,6 +47,7 @@ void parse_input_args(char **argv,
                       float &topp,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
+                      int &max_tokens_per_ssm_batch,
                       int &max_sequence_length,
                       int &sampling_seed,
                       bool &streaming_cache) {
@@ -103,6 +104,10 @@ void parse_input_args(char **argv,
       max_tokens_per_batch = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) {
+      max_tokens_per_ssm_batch = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-sequence-length")) {
       max_sequence_length = std::stoi(argv[++i]);
       continue;
@@ -145,6 +150,7 @@ void FlexFlow::top_level_task(Task const *task,
   float topp = 0.6f;
   int max_requests_per_batch = 1;
   int max_tokens_per_batch = 128;
+  int max_tokens_per_ssm_batch = -1;
   int max_sequence_length = 256;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
@@ -165,9 +171,13 @@ void FlexFlow::top_level_task(Task const *task,
                    topp,
                    max_requests_per_batch,
                    max_tokens_per_batch,
+                    max_tokens_per_ssm_batch,
                    max_sequence_length,
                    sampling_seed,
                    streaming_cache);
+  if (max_tokens_per_ssm_batch == -1) {
+    max_tokens_per_ssm_batch = max_tokens_per_batch;
+  }
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -227,6 +237,7 @@ void FlexFlow::top_level_task(Task const *task,
   RequestManager *rm = RequestManager::get_request_manager();
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
   rm->set_max_spec_tree_token_num(64);
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 96e85177c..c8103e517 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -40,8 +40,8 @@ void FALCON::create_falcon_model(FFModel &ff,
   {
     // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
-            ? BatchConfig::max_verify_tokens_per_batch()
+        mode == TREE_SEARCH_MODE 
+            ? BatchConfig::max_tokens_per_ssm_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 16dc2441f..3a02845b9 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -43,8 +43,8 @@ void LLAMA::create_llama_model(FFModel &ff,
   Tensor input;
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
-            ? BatchConfig::max_verify_tokens_per_batch()
+        mode == TREE_SEARCH_MODE 
+            ? BatchConfig::max_tokens_per_ssm_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index 11845da0e..c5aec6968 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -41,8 +41,8 @@ void MPT::create_mpt_model(FFModel &ff,
   Tensor input;
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
-            ? BatchConfig::max_verify_tokens_per_batch()
+        mode == TREE_SEARCH_MODE 
+            ? BatchConfig::max_tokens_per_ssm_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 9c563f9c2..3840432d8 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -43,8 +43,8 @@ void OPT::create_opt_model(FFModel &ff,
   ff.set_position_offset(2);
   {
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
-            ? BatchConfig::max_verify_tokens_per_batch()
+        mode == TREE_SEARCH_MODE 
+            ? BatchConfig::max_tokens_per_ssm_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index f531fe988..20643224d 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -49,8 +49,8 @@ void STARCODER::create_starcoder_model(
   {
     // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
     int const token_dims[] = {
-        (mode == TREE_VERIFY_MODE || mode == TREE_SEARCH_MODE)
-            ? BatchConfig::max_verify_tokens_per_batch()
+        mode == TREE_SEARCH_MODE 
+            ? BatchConfig::max_tokens_per_ssm_batch()
             : BatchConfig::max_tokens_per_batch(),
         1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 011fc8833..2231aedb9 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -62,6 +62,7 @@ void parse_input_args(char **argv,
                       bool &verbose,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
+                      int &max_tokens_per_ssm_batch,
                       int &max_sequence_length,
                       int &max_spec_tree_token_num,
                       int &max_tree_width,
@@ -121,6 +122,10 @@ void parse_input_args(char **argv,
       max_tokens_per_batch = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-tokens-per-ssm-batch")) {
+      max_tokens_per_ssm_batch = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-sequence-length")) {
       max_sequence_length = std::stoi(argv[++i]);
       continue;
@@ -312,6 +317,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool verbose = false;
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
+  int max_tokens_per_ssm_batch = -1;
   int max_sequence_length = 512;
   int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
@@ -335,6 +341,7 @@ void FlexFlow::top_level_task(Task const *task,
                    verbose,
                    max_requests_per_batch,
                    max_tokens_per_batch,
+                    max_tokens_per_ssm_batch,
                    max_sequence_length,
                    max_spec_tree_token_num,
                    max_tree_width,
@@ -344,6 +351,9 @@ void FlexFlow::top_level_task(Task const *task,
                    do_sample,
                    sampling_seed,
                    streaming_cache);
+  if (max_tokens_per_ssm_batch == -1) {
+    max_tokens_per_ssm_batch = max_tokens_per_batch;
+  }
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -358,6 +368,7 @@ void FlexFlow::top_level_task(Task const *task,
   RequestManager *rm = RequestManager::get_request_manager();
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_spec_tree_token_num(max_spec_tree_token_num);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_tree_depth(max_tree_depth);
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 5f92b999b..c8649f596 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2589,6 +2589,14 @@ void flexflow_request_manager_set_max_tokens_per_batch(
   DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens);
 }
 
+void flexflow_request_manager_set_max_tokens_per_ssm_batch(
+    flexflow_request_manager_t handle_, int max_num_ssm_tokens) {
+  RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
+  handle->set_max_tokens_per_ssm_batch(max_num_ssm_tokens);
+  DEBUG_PRINT("[RequestManager] set max_tokens_per_ssm_batch %d",
+              max_num_ssm_tokens);
+}
+
 void flexflow_request_manager_set_max_spec_tree_token_num(
     flexflow_request_manager_t handle_, int max_num_tokens) {
   RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 123b2ee05..17e81b54b 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -14,6 +14,7 @@
  */
 
 #include "flexflow/ops/inc_multihead_self_attention.h"
+#include "flexflow/ffconst.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/decompress_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
@@ -950,7 +951,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = infer_mode == TREE_SEARCH_MODE
+                                   ? BatchConfig::max_tokens_per_ssm_batch()
+                                   : BatchConfig::max_tokens_per_batch();
     size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 81e4ec3f7..cf614fde6 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -476,7 +476,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = infer_mode == TREE_SEARCH_MODE
+                                   ? BatchConfig::max_tokens_per_ssm_batch()
+                                   : BatchConfig::max_tokens_per_batch();
     size_t qkv_max_proj_size =
         max_tokens_per_batch *
         (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads);
@@ -494,7 +496,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
         query_tmp_size =
-            num_q_heads * qk_dim * BatchConfig::max_tokens_per_batch();
+            num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
                          BatchConfig::max_requests_per_batch() * max_num_pages *
@@ -506,7 +508,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
           size_t max_post_pos_enc_pages =
               round_up_pages(BatchConfig::MAX_STREAMING_POS -
                              BatchConfig::get_max_tree_depth() +
-                             max(BatchConfig::max_tokens_per_batch(),
+                             max(max_tokens_per_batch,
                                  BatchConfig::max_spec_tree_token_num()));
           key_cache_size = num_kv_heads * qk_dim *
                            BatchConfig::max_requests_per_batch() *
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index df16d2979..9cfea2f61 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -467,7 +467,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m,
                       DT const *bias_ptr,
                       hipStream_t stream) {
   // here because we need postion info in infernece 1
-  int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
+  int max_tokens_per_batch = bc->max_tokens_per_ssm_batch();
   checkCUDA(
       hipMemcpyAsync(m->token_infos,
                      &(bc->tokensInfo),
@@ -618,7 +618,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = BatchConfig::max_tokens_per_ssm_batch();
     size_t beam_tokeninfo_size =
         max_tokens_per_batch * TreeSearchBatchConfig::MAX_BEAM_WIDTH;
     size_t requestinfo_size = TreeSearchBatchConfig::max_requests_per_batch();
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 308f468f5..fa2ae1cb0 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -89,9 +89,8 @@ int BatchConfig::max_tokens_per_batch() {
 }
 
 /*static*/
-int BatchConfig::max_verify_tokens_per_batch() {
-  return RequestManager::get_request_manager()
-      ->get_max_verify_tokens_per_batch();
+int BatchConfig::max_tokens_per_ssm_batch() {
+  return RequestManager::get_request_manager()->get_max_tokens_per_ssm_batch();
 }
 
 /*static*/
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index b7e3f16d9..f9dc552f1 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flexflow/batch_config.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/graph.h"
 #include "flexflow/inference.h"
@@ -55,11 +56,13 @@ bool parallel_tensor_list_overlaps(std::vector<ParallelTensor> const &list1,
   return false;
 }
 
-void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) {
+void InferenceManager::compile_model_and_allocate_buffer(FFModel *model,
+                                                         bool is_llm) {
   // TODO: currently assume there is a single data-parallel pipeline
   // (i.e., data-parallel-degree == 1)
   assert(model->config.data_parallelism_degree == 1);
-  model->config.batchSize = BatchConfig::max_tokens_per_batch();
+  model->config.batchSize = is_llm ? BatchConfig::max_tokens_per_batch()
+                                   : BatchConfig::max_tokens_per_ssm_batch();
   model->compile_inference();
   Context ctx = model->config.lg_ctx;
   Runtime *runtime = model->config.lg_hlr;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4aa845e81..773c8fc32 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -111,6 +111,7 @@ RequestManager::RequestManager()
   // ffmodel.compile()
   max_requests_per_batch = -1;
   max_tokens_per_batch = -1;
+  max_tokens_per_ssm_batch = -1;
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
   max_tree_depth = -1;
@@ -139,6 +140,13 @@ void RequestManager::set_max_tokens_per_batch(int max_num_tokens) {
   assert(max_tokens_per_batch <= BatchConfig::MAX_NUM_TOKENS);
 }
 
+void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) {
+  assert(max_tokens_per_ssm_batch == -1 ||
+         max_tokens_per_ssm_batch == max_num_ssm_tokens);
+  max_tokens_per_ssm_batch = max_num_ssm_tokens;
+  assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS);
+}
+
 void RequestManager::set_max_spec_tree_token_num(int max_num_tokens) {
   assert(max_spec_tree_token_num == -1 ||
          max_spec_tree_token_num == max_num_tokens);
@@ -151,16 +159,16 @@ int RequestManager::get_max_tokens_per_batch() {
   return max_tokens_per_batch;
 }
 
+int RequestManager::get_max_tokens_per_ssm_batch() {
+  assert(max_tokens_per_ssm_batch > 0);
+  return max_tokens_per_ssm_batch;
+}
+
 int RequestManager::get_max_spec_tree_token_num() {
   assert(max_spec_tree_token_num > 0);
   return max_spec_tree_token_num;
 }
 
-int RequestManager::get_max_verify_tokens_per_batch() {
-  assert(max_tokens_per_batch > 0);
-  return max_tokens_per_batch;
-}
-
 void RequestManager::set_max_sequence_length(int max_seq_length) {
   assert(max_sequence_length == -1 || max_sequence_length == max_seq_length);
   max_sequence_length = max_seq_length;
@@ -1024,7 +1032,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
   bc.requestsInfo[request_index].first_token_index_in_request =
       prefill_request->ssm_cache_size;
-  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
+  int num_tokens_in_batch = std::min(get_max_tokens_per_ssm_batch(),
                                      (int)prefill_request->tokens.size() -
                                          prefill_request->ssm_prefill_len);
   bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
@@ -2217,7 +2225,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   for (size_t i = 0; i < get_num_ssms(); i++) {
     // Compile the i-th ssm
     FFModel *ssm = get_ssm_model(i);
-    im->compile_model_and_allocate_buffer(ssm);
+    im->compile_model_and_allocate_buffer(ssm, false);
     assert(im->model_weights_loaders.find(ssm) !=
            im->model_weights_loaders.end());
     // Load model weights
@@ -2290,7 +2298,7 @@ void RequestManager::serve_spec_infer_sync(FFModel *llm) {
   for (size_t i = 0; i < get_num_ssms(); i++) {
     // Compile the i-th ssm
     FFModel *ssm = get_ssm_model(i);
-    im->compile_model_and_allocate_buffer(ssm);
+    im->compile_model_and_allocate_buffer(ssm, false);
     assert(im->model_weights_loaders.find(ssm) !=
            im->model_weights_loaders.end());
     // Load model weights
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index c3e3dcdf0..8766e6e1a 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -35,10 +35,18 @@ void RequestManager::load_tokens_task(
 
   // Extreme long prompts are not supported, only load up to
   // max_tokens_per_batch as prompt
-  if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch()) {
+  if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() &&
+      (batch_config->get_mode() == INC_DECODING_MODE ||
+       batch_config->get_mode() == TREE_VERIFY_MODE)) {
     printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
            BatchConfig::max_tokens_per_batch());
     printf("Got: %d tokens\n", batch_config->num_tokens);
+  } else if (batch_config->num_tokens >
+                 BatchConfig::max_tokens_per_ssm_batch() &&
+             batch_config->get_mode() == TREE_SEARCH_MODE) {
+    printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
+           BatchConfig::max_tokens_per_ssm_batch());
+    printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 
   for (int i = 0; i < batch_config->num_tokens; i++) {
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 48d79ea5c..15b0c9fb3 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -47,15 +47,16 @@ void RequestManager::load_tokens_task(
   // Extreme long prompts are not supported, only load up to
   // BatchConfig::max_tokens_per_batch() as prompt
   if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() &&
-      batch_config->get_mode() == INC_DECODING_MODE) {
+      (batch_config->get_mode() == INC_DECODING_MODE ||
+       batch_config->get_mode() == TREE_VERIFY_MODE)) {
     printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
            BatchConfig::max_tokens_per_batch());
     printf("Got: %d tokens\n", batch_config->num_tokens);
   } else if (batch_config->num_tokens >
-             BatchConfig::max_verify_tokens_per_batch()) {
-    printf("Warning: Speculative decoding. too many tokens in prompt, only "
-           "load up to %d tokens\n",
-           BatchConfig::max_verify_tokens_per_batch());
+                 BatchConfig::max_tokens_per_ssm_batch() &&
+             batch_config->get_mode() == TREE_SEARCH_MODE) {
+    printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
+           BatchConfig::max_tokens_per_ssm_batch());
     printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 

From d67d57726336963d23dc2065572927ebe4ab8de8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 10 Sep 2024 18:24:03 -0700
Subject: [PATCH 467/667] chore: remove redundant max_spec_tree_tokens

---
 include/flexflow/batch_config.h          |  9 +++++----
 include/flexflow/flexflow_c.h            |  3 ---
 include/flexflow/request_manager.h       |  1 -
 inference/incr_decoding/incr_decoding.cc |  1 -
 inference/spec_infer/spec_infer.cc       |  8 --------
 src/c/flexflow_c.cc                      |  8 --------
 src/runtime/request_manager.cc           | 15 ++++++++-------
 7 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index ec8171947..296182e77 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -80,12 +80,13 @@ class BatchConfig {
   // Maximum possible values for different parameters
   // These maximum values are used for copying BatchConfig
   // across workers
-  inline static int const MAX_NUM_REQUESTS = 8;
+  inline static int const MAX_NUM_REQUESTS = 64;
   inline static int const MAX_NUM_TOKENS = 1024;
-  inline static int const MAX_SPEC_TREE_TOKEN_NUM = 128;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 4;
   inline static int const MAX_TREE_DEPTH = 16;
-  inline static int const MAX_TREE_WIDTH = 64;
+  inline static int const MAX_TREE_WIDTH = 16;
+  inline static int const MAX_SPEC_TREE_TOKEN_NUM =
+      MAX_TREE_DEPTH * MAX_TREE_WIDTH + 1;
   inline static int const MAX_K_LOGITS = 16;
 
   // The Constants for the Streaming KVCache
@@ -152,7 +153,7 @@ class BatchConfig {
         std::fill(std::begin(bits), std::end(bits), 0);
       }
 
-      uint64_t bits[MAX_SPEC_TREE_TOKEN_NUM / 64];
+      uint64_t bits[(MAX_SPEC_TREE_TOKEN_NUM + 63) / 64];
     };
 
     Bitset bit_mask[MAX_SPEC_TREE_TOKEN_NUM];
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index ea94adf39..803637171 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -979,9 +979,6 @@ void flexflow_request_manager_set_max_tokens_per_batch(
 void flexflow_request_manager_set_max_tokens_per_ssm_batch(
     flexflow_request_manager_t handle_, int max_num_ssm_tokens);
 
-void flexflow_request_manager_set_max_spec_tree_token_num(
-    flexflow_request_manager_t handle_, int max_num_tokens);
-
 void flexflow_request_manager_set_max_sequence_length(
     flexflow_request_manager_t handle_, int max_seq_length);
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 54ee050c4..e1e8a5bc8 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -251,7 +251,6 @@ class RequestManager {
   int get_max_tokens_per_batch();
   void set_max_tokens_per_ssm_batch(int max_num_ssm_tokens);
   int get_max_tokens_per_ssm_batch();
-  void set_max_spec_tree_token_num(int max_num_tokens);
   int get_max_spec_tree_token_num();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 0541f7502..9c5fed08b 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -240,7 +240,6 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
-  rm->set_max_spec_tree_token_num(64);
   rm->set_max_tree_depth(8);
   rm->set_max_tree_width(16);
   rm->set_verbose(verbose);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 2231aedb9..3473f83f9 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -64,7 +64,6 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_batch,
                       int &max_tokens_per_ssm_batch,
                       int &max_sequence_length,
-                      int &max_spec_tree_token_num,
                       int &max_tree_width,
                       int &max_tree_depth,
                       int &expansion_degree,
@@ -130,10 +129,6 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
-    if (!strcmp(argv[i], "--max-spec-tree-token-num")) {
-      max_spec_tree_token_num = std::stoi(argv[++i]);
-      continue;
-    }
     if (!strcmp(argv[i], "--max-tree-width")) {
       max_tree_width = std::stoi(argv[++i]);
       continue;
@@ -319,7 +314,6 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_batch = 128;
   int max_tokens_per_ssm_batch = -1;
   int max_sequence_length = 512;
-  int max_spec_tree_token_num = 64;
   int expansion_degree = 3;
   int max_tree_depth = 8;
   int max_tree_width = 16;
@@ -343,7 +337,6 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_batch,
                     max_tokens_per_ssm_batch,
                    max_sequence_length,
-                   max_spec_tree_token_num,
                    max_tree_width,
                    max_tree_depth,
                    expansion_degree,
@@ -369,7 +362,6 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
-  rm->set_max_spec_tree_token_num(max_spec_tree_token_num);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index c8649f596..c69cd4435 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2597,14 +2597,6 @@ void flexflow_request_manager_set_max_tokens_per_ssm_batch(
               max_num_ssm_tokens);
 }
 
-void flexflow_request_manager_set_max_spec_tree_token_num(
-    flexflow_request_manager_t handle_, int max_num_tokens) {
-  RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
-  handle->set_max_spec_tree_token_num(max_num_tokens);
-  DEBUG_PRINT("[RequestManager] set max_spec_tree_token_num %d",
-              max_num_tokens);
-}
-
 void flexflow_request_manager_set_max_sequence_length(
     flexflow_request_manager_t handle_, int max_seq_length) {
   RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 773c8fc32..e568fffff 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -147,13 +147,6 @@ void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) {
   assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS);
 }
 
-void RequestManager::set_max_spec_tree_token_num(int max_num_tokens) {
-  assert(max_spec_tree_token_num == -1 ||
-         max_spec_tree_token_num == max_num_tokens);
-  max_spec_tree_token_num = max_num_tokens;
-  assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
-}
-
 int RequestManager::get_max_tokens_per_batch() {
   assert(max_tokens_per_batch > 0);
   return max_tokens_per_batch;
@@ -210,6 +203,10 @@ void RequestManager::set_max_tree_depth(int max_tree_depth) {
          max_tree_depth <= BatchConfig::MAX_TREE_DEPTH and
          "Invalid max_tree_depth");
   this->max_tree_depth = max_tree_depth;
+  if (max_tree_width > 0) {
+    max_spec_tree_token_num = max_tree_depth * max_tree_width + 1;
+    assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+  }
 }
 
 int RequestManager::get_max_tree_width() {
@@ -224,6 +221,10 @@ void RequestManager::set_max_tree_width(int max_tree_width) {
          max_tree_width <= BatchConfig::MAX_TREE_WIDTH and
          "Invalid max_tree_width");
   this->max_tree_width = max_tree_width;
+  if (max_tree_depth > 0) {
+    max_spec_tree_token_num = max_tree_depth * max_tree_width + 1;
+    assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+  }
 }
 
 void RequestManager::set_speculative_sampling(bool speculative_sampling_) {

From 1b5c66e9bf4d62c79d8ae91574bb9c3176981f59 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 10 Sep 2024 22:13:54 -0700
Subject: [PATCH 468/667] chore: minor

---
 include/flexflow/request_manager.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index e1e8a5bc8..d35344d3e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -361,9 +361,9 @@ class RequestManager {
   int max_tree_width;
   int k;
   // Profile based latency
-  double baseline_latency_ms = 1000;
-  double ssm_spec_latency_ms = 50;
-  double llm_verify_latency_ms = 50;
+  double baseline_latency_ms = 43;
+  double ssm_spec_latency_ms = 17;
+  double llm_verify_latency_ms = 65;
   double correction_factor = 1.05;
 
   State request_manager_status;

From d19cd7569372888c956cbf1fd06d9215b003703d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 10 Sep 2024 22:55:15 -0700
Subject: [PATCH 469/667] style: format

---
 inference/incr_decoding/incr_decoding.cc | 2 +-
 inference/models/falcon.cc               | 9 ++++-----
 inference/models/llama.cc                | 9 ++++-----
 inference/models/mpt.cc                  | 9 ++++-----
 inference/models/opt.cc                  | 9 ++++-----
 inference/models/starcoder.cc            | 9 ++++-----
 inference/spec_infer/spec_infer.cc       | 2 +-
 src/ops/inc_multihead_self_attention.cu  | 3 +--
 8 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 9c5fed08b..80e1fc2a4 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -171,7 +171,7 @@ void FlexFlow::top_level_task(Task const *task,
                    topp,
                    max_requests_per_batch,
                    max_tokens_per_batch,
-                    max_tokens_per_ssm_batch,
+                   max_tokens_per_ssm_batch,
                    max_sequence_length,
                    sampling_seed,
                    streaming_cache);
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index c8103e517..042f49bed 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -39,11 +39,10 @@ void FALCON::create_falcon_model(FFModel &ff,
   Tensor input;
   {
     // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {
-        mode == TREE_SEARCH_MODE 
-            ? BatchConfig::max_tokens_per_ssm_batch()
-            : BatchConfig::max_tokens_per_batch(),
-        1};
+    int const token_dims[] = {mode == TREE_SEARCH_MODE
+                                  ? BatchConfig::max_tokens_per_ssm_batch()
+                                  : BatchConfig::max_tokens_per_batch(),
+                              1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 3a02845b9..1acb3e684 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -42,11 +42,10 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor input;
   {
-    int const token_dims[] = {
-        mode == TREE_SEARCH_MODE 
-            ? BatchConfig::max_tokens_per_ssm_batch()
-            : BatchConfig::max_tokens_per_batch(),
-        1};
+    int const token_dims[] = {mode == TREE_SEARCH_MODE
+                                  ? BatchConfig::max_tokens_per_ssm_batch()
+                                  : BatchConfig::max_tokens_per_batch(),
+                              1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index c5aec6968..9b7628ce6 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -40,11 +40,10 @@ void MPT::create_mpt_model(FFModel &ff,
   //------------------------------ build the model --------------------------
   Tensor input;
   {
-    int const token_dims[] = {
-        mode == TREE_SEARCH_MODE 
-            ? BatchConfig::max_tokens_per_ssm_batch()
-            : BatchConfig::max_tokens_per_batch(),
-        1};
+    int const token_dims[] = {mode == TREE_SEARCH_MODE
+                                  ? BatchConfig::max_tokens_per_ssm_batch()
+                                  : BatchConfig::max_tokens_per_batch(),
+                              1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 3840432d8..7220c976c 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -42,11 +42,10 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor position_input;
   ff.set_position_offset(2);
   {
-    int const token_dims[] = {
-        mode == TREE_SEARCH_MODE 
-            ? BatchConfig::max_tokens_per_ssm_batch()
-            : BatchConfig::max_tokens_per_batch(),
-        1};
+    int const token_dims[] = {mode == TREE_SEARCH_MODE
+                                  ? BatchConfig::max_tokens_per_ssm_batch()
+                                  : BatchConfig::max_tokens_per_batch(),
+                              1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 20643224d..25677189e 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -48,11 +48,10 @@ void STARCODER::create_starcoder_model(
   ff.set_position_offset(0);
   {
     // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {
-        mode == TREE_SEARCH_MODE 
-            ? BatchConfig::max_tokens_per_ssm_batch()
-            : BatchConfig::max_tokens_per_batch(),
-        1};
+    int const token_dims[] = {mode == TREE_SEARCH_MODE
+                                  ? BatchConfig::max_tokens_per_ssm_batch()
+                                  : BatchConfig::max_tokens_per_batch(),
+                              1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 3473f83f9..439ff3d15 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -335,7 +335,7 @@ void FlexFlow::top_level_task(Task const *task,
                    verbose,
                    max_requests_per_batch,
                    max_tokens_per_batch,
-                    max_tokens_per_ssm_batch,
+                   max_tokens_per_ssm_batch,
                    max_sequence_length,
                    max_tree_width,
                    max_tree_depth,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index cf614fde6..b916e74d3 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -495,8 +495,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case INC_DECODING_MODE:
       case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
-        query_tmp_size =
-            num_q_heads * qk_dim * max_tokens_per_batch;
+        query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
                          BatchConfig::max_requests_per_batch() * max_num_pages *

From 6e37125ba15b2756569dcfcda67502522efe008a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 13 Sep 2024 22:39:42 -0700
Subject: [PATCH 470/667] chore: minor output

---
 src/ops/inc_multihead_self_attention.cu      | 2 +-
 src/ops/spec_inc_multihead_self_attention.cu | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index b916e74d3..755f6b6a1 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -193,7 +193,7 @@ void incr_attention(IncMultiHeadSelfAttentionMeta *m,
     }
     if (result != cudaSuccess) {
       throw std::runtime_error("Failed to run "
-                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               "IncrementalDecodingAttentionForwardKernel: " +
                                std::string(cudaGetErrorString(result)));
     }
   });
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 16dbe7476..41bbabe00 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -198,7 +198,7 @@ void tree_search_attention(SpecIncMultiHeadSelfAttentionMeta *m,
     }
     if (result != cudaSuccess) {
       throw std::runtime_error("Failed to run "
-                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               "TreeSearchAttentionForwardKernel: " +
                                std::string(cudaGetErrorString(result)));
     }
   });
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 8c384c1b0..a2272e5f2 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -289,7 +289,7 @@ void tree_verify_attention(TreeIncMultiHeadSelfAttentionMeta *m,
     }
     if (result != cudaSuccess) {
       throw std::runtime_error("Failed to run "
-                               "BatchPrefillWithPagedKVCacheWrapperDispatched" +
+                               "TreeVerifyAttentionKernel: " +
                                std::string(cudaGetErrorString(result)));
     }
   });

From 3c4e50eaadfc41a33f363068d39091e13ca92e89 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sat, 14 Sep 2024 17:12:18 -0400
Subject: [PATCH 471/667] Fix bugs in the scheduler.

---
 include/flexflow/batch_config.h    |  2 +-
 include/flexflow/request_manager.h | 27 +++-------
 src/runtime/request_manager.cc     | 85 +++++++++++++++++-------------
 3 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 296182e77..b65143395 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -82,7 +82,7 @@ class BatchConfig {
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
   inline static int const MAX_NUM_TOKENS = 1024;
-  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 4;
+  inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
   inline static int const MAX_TREE_DEPTH = 16;
   inline static int const MAX_TREE_WIDTH = 16;
   inline static int const MAX_SPEC_TREE_TOKEN_NUM =
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d35344d3e..40c8adb5b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -60,14 +60,14 @@ class InferenceManager {
 class TokenTreeNode {
 public:
   BatchConfig::TokenId id;
-  float log_accumulated_prob;
+  double log_accumulated_prob;
   int parent_pos;
   bool included = false;
   bool gumbel = false;
   float gumbel_logit = 0.0f;
 
   TokenTreeNode(BatchConfig::TokenId id,
-                float log_accumulated_prob,
+                double log_accumulated_prob,
                 int parent_pos,
                 bool gumbel = false,
                 float gumbel_logit = 0.0f)
@@ -82,7 +82,7 @@ bool operator<=(std::shared_ptr<TokenTreeNode> const &lhs,
                 std::shared_ptr<TokenTreeNode> const &rhs);
 
 // A comparator for std::shared_ptr<TokenTreeNode>
-// This is used to sort the token tree nodes in ascending order
+// This is used to construct a max heap for the token tree nodes
 struct SharedTokenTreeNodePtrLess {
   bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
                   std::shared_ptr<TokenTreeNode> const &rhs) const {
@@ -94,19 +94,6 @@ struct SharedTokenTreeNodePtrLess {
   }
 };
 
-// A comparator for std::shared_ptr<TokenTreeNode>
-// This is used in to sort the token tree nodes in descending order
-struct SharedTokenTreeNodePtrGreater {
-  bool operator()(std::shared_ptr<TokenTreeNode> const &lhs,
-                  std::shared_ptr<TokenTreeNode> const &rhs) const {
-    if (lhs->gumbel) {
-      assert(rhs->gumbel);
-      return lhs->gumbel_logit > rhs->gumbel_logit;
-    }
-    return lhs->log_accumulated_prob > rhs->log_accumulated_prob;
-  }
-};
-
 class TokenTree {
 public:
   std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
@@ -337,13 +324,13 @@ class RequestManager {
   int get_empty_request_index();
 
   // Comparters
-  struct SharedTokenTreeNodePtrRequestGuidWeightedGreater {
+  struct SharedTokenTreeNodePtrRequestGuidWeightedLess {
     bool operator()(
         std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
         std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
         const;
   };
-  struct SharedTokenTreeNodePtrRequestGuidGreater {
+  struct SharedTokenTreeNodePtrRequestGuidLess {
     bool operator()(
         std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
         std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
@@ -499,8 +486,8 @@ class RequestManager {
       reject_sampling(std::vector<std::pair<TokenId, float>> &D,
                       std::unordered_map<TokenId, float> &R,
                       int k);
-  void gumbel_conditioned_on_max(float target_max,
-                                 std::vector<std::pair<float, int>> &logits);
+  void gumbel_conditioned_on_max(double target_max,
+                                 std::vector<std::pair<double, int>> &logits);
 
   // Profiling related functions
   void reset_profiling_statistics();
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e568fffff..89054ba20 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -284,16 +284,14 @@ Request &RequestManager::get_request_with_guid(RequestGuid guid) {
   return all_requests[guid];
 }
 
-bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedGreater::
-    operator()(
-        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
-        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
-        const {
+bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedLess::operator()(
+    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
+    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs) const {
   if (lhs.first->gumbel) {
     assert(rhs.first->gumbel);
     return lhs.first->gumbel_logit * get_request_manager()
                                          ->get_request_with_guid(lhs.second)
-                                         .get_length_weight() >
+                                         .get_length_weight() <
            rhs.first->gumbel_logit * get_request_manager()
                                          ->get_request_with_guid(rhs.second)
                                          .get_length_weight();
@@ -301,21 +299,21 @@ bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedGreater::
   return lhs.first->log_accumulated_prob *
              get_request_manager()
                  ->get_request_with_guid(lhs.second)
-                 .get_length_weight() >
+                 .get_length_weight() <
          rhs.first->log_accumulated_prob *
              get_request_manager()
                  ->get_request_with_guid(rhs.second)
                  .get_length_weight();
 }
 
-bool RequestManager::SharedTokenTreeNodePtrRequestGuidGreater ::operator()(
+bool RequestManager::SharedTokenTreeNodePtrRequestGuidLess ::operator()(
     std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
     std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs) const {
   if (lhs.first->gumbel) {
     assert(rhs.first->gumbel);
-    return lhs.first->gumbel_logit > rhs.first->gumbel_logit;
+    return lhs.first->gumbel_logit < rhs.first->gumbel_logit;
   }
-  return lhs.first->log_accumulated_prob > rhs.first->log_accumulated_prob;
+  return lhs.first->log_accumulated_prob < rhs.first->log_accumulated_prob;
 }
 
 void RequestManager::register_tokenizer(ModelType type,
@@ -1344,6 +1342,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     std::cout
         << "\n############### prepare_verify_batch_config ###############\n";
   }
+  // TODO: REMOVE THIS OUTPUT
+  //   std::cout
+  //       << "\n############### prepare_verify_batch_config ###############\n";
   // This method does the following:
   // 1. Commit the verified tokens in the last iteration through the
   // BatchConfig. We can do this request by request.
@@ -1427,6 +1428,14 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
     request.num_tokens_in_batch = token_tree_index;
 
+    // TODO: REMOVE THIS OUTPUT
+    // std::cout << "Request " << request_index
+    //           << " token tree size: " << request.num_tokens_in_batch
+    //           << std::endl;
+    // std::cout << "Request " << guid << " token tree: " << std::endl;
+    // std::cout << request.speculative_token_trees[0];
+    // std::cout << std::endl;
+
     // Create the causal mask for the large model based on the small model
     // causal mask.
     new_bc.causalMask[request_index] = create_llm_bitmask(guid);
@@ -1736,12 +1745,12 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
-    float target_max, std::vector<std::pair<float, int>> &logits) {
+    double target_max, std::vector<std::pair<double, int>> &logits) {
   // Assume the logits are sorted in descending order
   if (logits.size() == 0) {
     return;
   }
-  float max_logit = logits[0].first;
+  double max_logit = logits[0].first;
   for (auto &logit_n_idx : logits) {
     logit_n_idx.first =
         -log(exp(-target_max) - exp(-max_logit) + exp(-logit_n_idx.first));
@@ -2512,11 +2521,11 @@ void RequestManager::add_root_to_spec_token_tree(
   TokenTree &speculative_token_tree = request.speculative_token_trees[0];
   speculative_token_tree.add_layer();
   auto node_ptr = std::make_shared<TokenTreeNode>(token_id, 0.0, -1);
+  node_ptr->included = true;
   if (speculative_sampling) {
     node_ptr->gumbel = true;
   }
   speculative_token_tree.tree_layers.front().push_back(node_ptr);
-  request.token_tree_nodes_pq.push(node_ptr);
 }
 
 void RequestManager::add_tokens_to_spec_token_tree(
@@ -2545,37 +2554,39 @@ void RequestManager::add_tokens_to_spec_token_tree(
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
       // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
-      float parent_log_prob = parent_ptr->log_accumulated_prob;
+      double parent_log_prob = parent_ptr->log_accumulated_prob;
       int child_start_idx =
           result_offset +
           parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
       // TODO: rename child_probs to child_logits after change the output of
       // argmax from prob to logprob
-      std::vector<std::pair<float, int>> child_probs(
-          BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+      std::vector<std::pair<double, int>> child_probs;
       for (int child_pos = 0;
            child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
            child_pos++) {
         int result_idx = child_start_idx + child_pos;
         if (!speculative_sampling) {
           // TODO: the argmax will return log prob instead of prob
-          if (log(ssm_inference_result.probs[result_idx]) !=
-              -std::numeric_limits<float>::infinity()) {
-            child_probs[child_pos] = std::make_pair(
-                log(ssm_inference_result.probs[result_idx]), result_idx);
+          double log_prob = log((double)ssm_inference_result.probs[result_idx]);
+          if (log_prob == 0.0) {
+            // Slightly perturb the log prob to make it strictly less than 0
+            log_prob -= 1e-10;
+          }
+          if (log_prob != -std::numeric_limits<double>::infinity()) {
+            child_probs.push_back(std::make_pair(log_prob, result_idx));
           }
         } else {
           // Use gumbel perturbed logits here
           // TODO: handle the case when the child logit is -inf
           // TODO: this branch is not tested
-          child_probs[child_pos] = std::make_pair(
-              ssm_inference_result.gumbel_logits[result_idx], result_idx);
+          child_probs.push_back(std::make_pair(
+              ssm_inference_result.gumbel_logits[result_idx], result_idx));
         }
       }
       // Sort in descending order
       std::sort(child_probs.begin(),
                 child_probs.end(),
-                std::greater<std::pair<float, int>>());
+                std::greater<std::pair<double, int>>());
       if (speculative_sampling) {
         // TODO: this branch is not tested
         // Condition the gumbel perturbed logits on the maximum
@@ -2583,11 +2594,11 @@ void RequestManager::add_tokens_to_spec_token_tree(
       }
 
       for (auto const &child_prob : child_probs) {
-        float logit = child_prob.first;
+        double logit = child_prob.first;
         // The value used to compare between tokens
-        float accumulated_log_prob = logit + parent_log_prob;
-        float gumbel_logit = 0.0f;
-        float cmp_value;
+        double accumulated_log_prob = logit + parent_log_prob;
+        double gumbel_logit = 0.0f;
+        double cmp_value;
         if (speculative_sampling) {
           cmp_value = gumbel_logit = logit;
         } else {
@@ -2595,9 +2606,6 @@ void RequestManager::add_tokens_to_spec_token_tree(
         }
         int result_idx = child_prob.second;
 
-        assert(logit != -std::numeric_limits<float>::infinity() &&
-               "Child log probability should not be -inf.");
-
         if (tokens.size() == max_tree_width and
             cmp_value <= (speculative_sampling
                               ? (*tokens.begin())->gumbel_logit
@@ -2638,6 +2646,7 @@ void RequestManager::add_tokens_to_spec_token_tree(
     spec_token_tree.add_layer();
     for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
          token_it++) {
+      assert((*token_it)->log_accumulated_prob != 0.0);
       spec_token_tree.tree_layers.back().push_back((*token_it));
       request.token_tree_nodes_pq.push((*token_it));
     }
@@ -2678,6 +2687,8 @@ void RequestManager::prune_token_tree() {
   }
 
   assert(budget >= 0);
+  // TODO: REMOVE THIS OUTPUT
+  //   std::cout << "Budget: " << budget << std::endl;
   if (budget > 0) {
     if (memory_occupancy) {
       add_tokens_toward_memory_occupancy(budget);
@@ -2693,10 +2704,9 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
                                 correction_factor /
                                 (baseline_latency_ms * request.get_slo_ratio());
 
+  // The root is already included
+  // In function add_root_to_spec_token_tree
   double current_added = 1.0;
-  // Include the root of every token tree
-  request.token_tree_nodes_pq.top()->included = true;
-  request.token_tree_nodes_pq.pop();
 
   while (budget > 0 and current_added < num_tokens_to_decode) {
     if (request.token_tree_nodes_pq.empty()) {
@@ -2705,7 +2715,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
     auto node_ptr = request.token_tree_nodes_pq.top();
     request.token_tree_nodes_pq.pop();
     node_ptr->included = true;
-    current_added += node_ptr->log_accumulated_prob;
+    current_added += exp(node_ptr->log_accumulated_prob);
     budget--;
   }
 }
@@ -2716,7 +2726,7 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
   std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
       std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      SharedTokenTreeNodePtrRequestGuidWeightedGreater>
+      SharedTokenTreeNodePtrRequestGuidWeightedLess>
       global_token_tree_node_pq;
 
   // Initialie the priority queue with the top element in each request's token
@@ -2773,7 +2783,7 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
   std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
       std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      SharedTokenTreeNodePtrRequestGuidGreater>
+      SharedTokenTreeNodePtrRequestGuidLess>
       global_token_tree_node_pq;
 
   // Initialie the priority queue with the top element in each request's token
@@ -2800,6 +2810,8 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     auto [node_ptr, guid] = global_token_tree_node_pq.top();
     global_token_tree_node_pq.pop();
     node_ptr->included = true;
+    // TODO: REMOVE THIS OUTPUT
+    // std::cout << node_ptr->log_accumulated_prob << std::endl;
     if (!get_request_with_guid(guid).token_tree_nodes_pq.empty()) {
       global_token_tree_node_pq.push(
           {get_request_with_guid(guid).token_tree_nodes_pq.top(), guid});
@@ -2832,6 +2844,7 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
     int token_pos = 0;
     for (auto const &node : layer) {
       if (node->included) {
+        os << std::fixed << std::setprecision(12);
         os << "token pos: " << token_pos << "\ttoken id: " << node->id
            << "\tparent pos: " << node->parent_pos
            << "\tlog prob: " << node->log_accumulated_prob << std::endl;

From 62ac7ed75dd24d30d2e157dd56cef322380af14f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 14 Sep 2024 14:40:27 -0700
Subject: [PATCH 472/667] feat: add max_tokens_per_prefilling_batch

---
 include/flexflow/batch_config.h          |  1 +
 include/flexflow/flexflow_c.h            |  3 +++
 include/flexflow/request_manager.h       |  5 ++++-
 inference/incr_decoding/incr_decoding.cc | 11 +++++++++++
 inference/models/falcon.cc               | 10 ++++++----
 inference/models/llama.cc                | 10 ++++++----
 inference/models/mpt.cc                  | 10 ++++++----
 inference/models/opt.cc                  | 10 ++++++----
 inference/models/starcoder.cc            | 10 ++++++----
 inference/spec_infer/spec_infer.cc       | 11 +++++++++++
 src/c/flexflow_c.cc                      |  8 ++++++++
 src/ops/inc_multihead_self_attention.cpp |  9 +++++----
 src/ops/inc_multihead_self_attention.cu  |  7 ++++---
 src/runtime/batch_config.cc              |  5 +++++
 src/runtime/inference_manager.cc         |  6 ++++--
 src/runtime/request_manager.cpp          | 18 ++++++++----------
 src/runtime/request_manager.cu           | 17 +++++++----------
 17 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 296182e77..7d27b76cc 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -68,6 +68,7 @@ class BatchConfig {
   static int max_requests_per_batch();
   static int max_tokens_per_batch();
   static int max_tokens_per_ssm_batch();
+  static int max_tokens_per_prefilling_batch();
   static int max_spec_tree_token_num();
   static int max_sequence_length();
   static int get_max_tree_depth();
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 803637171..1ac5e1fb6 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -979,6 +979,9 @@ void flexflow_request_manager_set_max_tokens_per_batch(
 void flexflow_request_manager_set_max_tokens_per_ssm_batch(
     flexflow_request_manager_t handle_, int max_num_ssm_tokens);
 
+void flexflow_request_manager_set_max_tokens_per_prefilling_batch(
+    flexflow_request_manager_t handle_, int max_num_prefilling_tokens);
+
 void flexflow_request_manager_set_max_sequence_length(
     flexflow_request_manager_t handle_, int max_seq_length);
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index d35344d3e..12b4c8123 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -251,6 +251,8 @@ class RequestManager {
   int get_max_tokens_per_batch();
   void set_max_tokens_per_ssm_batch(int max_num_ssm_tokens);
   int get_max_tokens_per_ssm_batch();
+  void set_max_tokens_per_prefilling_batch(int max_num_prefilling_tokens);
+  int get_max_tokens_per_prefilling_batch();
   int get_max_spec_tree_token_num();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
@@ -355,6 +357,7 @@ class RequestManager {
   int max_requests_per_batch;
   int max_tokens_per_batch;
   int max_tokens_per_ssm_batch;
+  int max_tokens_per_prefilling_batch;
   int max_spec_tree_token_num;
   int max_sequence_length;
   int max_tree_depth;
@@ -388,7 +391,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
-  Request *prefill_request = nullptr;
+  std::vector<Request *> prefill_requests;
 
   // Added to make the request manager stateful. During the processing of the
   // first small model inference results, the step equals to 1. That is, every
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 80e1fc2a4..61ef7e22e 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -48,6 +48,7 @@ void parse_input_args(char **argv,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_tokens_per_ssm_batch,
+                      int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &sampling_seed,
                       bool &streaming_cache) {
@@ -108,6 +109,10 @@ void parse_input_args(char **argv,
       max_tokens_per_ssm_batch = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) {
+      max_tokens_per_prefilling_batch = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-sequence-length")) {
       max_sequence_length = std::stoi(argv[++i]);
       continue;
@@ -151,6 +156,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_requests_per_batch = 1;
   int max_tokens_per_batch = 128;
   int max_tokens_per_ssm_batch = -1;
+  int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
@@ -172,12 +178,16 @@ void FlexFlow::top_level_task(Task const *task,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_tokens_per_ssm_batch,
+                   max_tokens_per_prefilling_batch,
                    max_sequence_length,
                    sampling_seed,
                    streaming_cache);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
+  if (max_tokens_per_prefilling_batch == -1) {
+    max_tokens_per_prefilling_batch = max_tokens_per_batch;
+  }
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -238,6 +248,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
+  rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
   rm->set_max_tree_depth(8);
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 042f49bed..4bd71421d 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -39,10 +39,12 @@ void FALCON::create_falcon_model(FFModel &ff,
   Tensor input;
   {
     // assert(falcon_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {mode == TREE_SEARCH_MODE
-                                  ? BatchConfig::max_tokens_per_ssm_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        std::max(mode == TREE_SEARCH_MODE
+                     ? BatchConfig::max_tokens_per_ssm_batch()
+                     : BatchConfig::max_tokens_per_batch(),
+                 BatchConfig::max_tokens_per_prefilling_batch()),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 1acb3e684..81e255d83 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -42,10 +42,12 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor input;
   {
-    int const token_dims[] = {mode == TREE_SEARCH_MODE
-                                  ? BatchConfig::max_tokens_per_ssm_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        std::max(mode == TREE_SEARCH_MODE
+                     ? BatchConfig::max_tokens_per_ssm_batch()
+                     : BatchConfig::max_tokens_per_batch(),
+                 BatchConfig::max_tokens_per_prefilling_batch()),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index 9b7628ce6..d13fb6bae 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -40,10 +40,12 @@ void MPT::create_mpt_model(FFModel &ff,
   //------------------------------ build the model --------------------------
   Tensor input;
   {
-    int const token_dims[] = {mode == TREE_SEARCH_MODE
-                                  ? BatchConfig::max_tokens_per_ssm_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        std::max(mode == TREE_SEARCH_MODE
+                     ? BatchConfig::max_tokens_per_ssm_batch()
+                     : BatchConfig::max_tokens_per_batch(),
+                 BatchConfig::max_tokens_per_prefilling_batch()),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
 
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 7220c976c..837c8de0c 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -42,10 +42,12 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor position_input;
   ff.set_position_offset(2);
   {
-    int const token_dims[] = {mode == TREE_SEARCH_MODE
-                                  ? BatchConfig::max_tokens_per_ssm_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        std::max(mode == TREE_SEARCH_MODE
+                     ? BatchConfig::max_tokens_per_ssm_batch()
+                     : BatchConfig::max_tokens_per_batch(),
+                 BatchConfig::max_tokens_per_prefilling_batch()),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 25677189e..dbce90b7c 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -48,10 +48,12 @@ void STARCODER::create_starcoder_model(
   ff.set_position_offset(0);
   {
     // assert(startcoder_config.max_num_tokens <= BatchConfig::MAX_NUM_TOKENS);
-    int const token_dims[] = {mode == TREE_SEARCH_MODE
-                                  ? BatchConfig::max_tokens_per_ssm_batch()
-                                  : BatchConfig::max_tokens_per_batch(),
-                              1};
+    int const token_dims[] = {
+        std::max(mode == TREE_SEARCH_MODE
+                     ? BatchConfig::max_tokens_per_ssm_batch()
+                     : BatchConfig::max_tokens_per_batch(),
+                 BatchConfig::max_tokens_per_prefilling_batch()),
+        1};
     input = ff.create_tensor<2>(token_dims, DT_INT32);
     position_input = ff.create_tensor<2>(token_dims, DT_INT32);
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 439ff3d15..741fa3d77 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -63,6 +63,7 @@ void parse_input_args(char **argv,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_tokens_per_ssm_batch,
+                      int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &max_tree_width,
                       int &max_tree_depth,
@@ -125,6 +126,10 @@ void parse_input_args(char **argv,
       max_tokens_per_ssm_batch = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-tokens-per-prefilling-batch")) {
+      max_tokens_per_prefilling_batch = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-sequence-length")) {
       max_sequence_length = std::stoi(argv[++i]);
       continue;
@@ -313,6 +318,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
   int max_tokens_per_ssm_batch = -1;
+  int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 512;
   int expansion_degree = 3;
   int max_tree_depth = 8;
@@ -336,6 +342,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_tokens_per_ssm_batch,
+                   max_tokens_per_prefilling_batch,
                    max_sequence_length,
                    max_tree_width,
                    max_tree_depth,
@@ -347,6 +354,9 @@ void FlexFlow::top_level_task(Task const *task,
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
+  if (max_tokens_per_prefilling_batch == -1) {
+    max_tokens_per_prefilling_batch = max_tokens_per_batch;
+  }
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
@@ -362,6 +372,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_requests_per_batch(max_requests_per_batch);
   rm->set_max_tokens_per_batch(max_tokens_per_batch);
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
+  rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index c69cd4435..e0bdf31e1 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2597,6 +2597,14 @@ void flexflow_request_manager_set_max_tokens_per_ssm_batch(
               max_num_ssm_tokens);
 }
 
+void flexflow_request_manager_set_max_tokens_per_prefilling_batch(
+    flexflow_request_manager_t handle_, int max_num_prefilling_tokens) {
+  RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
+  handle->set_max_tokens_per_prefilling_batch(max_num_prefilling_tokens);
+  DEBUG_PRINT("[RequestManager] set max_tokens_per_prefilling_batch %d",
+              max_num_prefilling_tokens);
+}
+
 void flexflow_request_manager_set_max_sequence_length(
     flexflow_request_manager_t handle_, int max_seq_length) {
   RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 17e81b54b..59f337067 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
-#include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ffconst.h"
 #include "flexflow/ffconst_utils.h"
+#include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/kernels/decompress_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/utils/hip_helper.h"
@@ -951,9 +951,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = infer_mode == TREE_SEARCH_MODE
-                                   ? BatchConfig::max_tokens_per_ssm_batch()
-                                   : BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = std::max(
+        infer_mode == TREE_SEARCH_MODE ? BatchConfig::max_tokens_per_ssm_batch()
+                                       : BatchConfig::max_tokens_per_batch(),
+        BatchConfig::max_tokens_per_prefilling_batch());
     size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads +
                                                        kProjSize * num_q_heads +
                                                        vProjSize * num_q_heads);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 755f6b6a1..4e4f249ea 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -476,9 +476,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // allocate memory for the seqArray and reserve space
   {
-    int max_tokens_per_batch = infer_mode == TREE_SEARCH_MODE
-                                   ? BatchConfig::max_tokens_per_ssm_batch()
-                                   : BatchConfig::max_tokens_per_batch();
+    int max_tokens_per_batch = std::max(
+        infer_mode == TREE_SEARCH_MODE ? BatchConfig::max_tokens_per_ssm_batch()
+                                       : BatchConfig::max_tokens_per_batch(),
+        BatchConfig::max_tokens_per_prefilling_batch());
     size_t qkv_max_proj_size =
         max_tokens_per_batch *
         (qk_dim * num_q_heads + qk_dim * num_q_heads + v_dim * num_q_heads);
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index fa2ae1cb0..d932da493 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -93,6 +93,11 @@ int BatchConfig::max_tokens_per_ssm_batch() {
   return RequestManager::get_request_manager()->get_max_tokens_per_ssm_batch();
 }
 
+/*static*/
+int BatchConfig::max_tokens_per_prefilling_batch() {
+  return RequestManager::get_request_manager()->get_max_tokens_per_prefilling_batch();
+}
+
 /*static*/
 int BatchConfig::max_sequence_length() {
   return RequestManager::get_request_manager()->get_max_sequence_length();
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index f9dc552f1..4fd5c4846 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -61,8 +61,10 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model,
   // TODO: currently assume there is a single data-parallel pipeline
   // (i.e., data-parallel-degree == 1)
   assert(model->config.data_parallelism_degree == 1);
-  model->config.batchSize = is_llm ? BatchConfig::max_tokens_per_batch()
-                                   : BatchConfig::max_tokens_per_ssm_batch();
+  model->config.batchSize =
+      std::max(is_llm ? BatchConfig::max_tokens_per_batch()
+                      : BatchConfig::max_tokens_per_ssm_batch(),
+               BatchConfig::max_tokens_per_prefilling_batch());
   model->compile_inference();
   Context ctx = model->config.lg_ctx;
   Runtime *runtime = model->config.lg_hlr;
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index 8766e6e1a..7fa82a653 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flexflow/ffconst.h"
 #include "flexflow/request_manager.h"
 #include "flexflow/utils/hip_helper.h"
 #include <hip/hip_runtime.h>
@@ -35,17 +36,14 @@ void RequestManager::load_tokens_task(
 
   // Extreme long prompts are not supported, only load up to
   // max_tokens_per_batch as prompt
-  if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() &&
-      (batch_config->get_mode() == INC_DECODING_MODE ||
-       batch_config->get_mode() == TREE_VERIFY_MODE)) {
+  int max_tokens_per_batch =
+      std::max(batch_config->get_mode() == TREE_SEARCH_MODE
+                   ? BatchConfig::max_tokens_per_ssm_batch()
+                   : BatchConfig::max_tokens_per_batch(),
+               BatchConfig::max_tokens_per_prefilling_batch());
+  if (batch_config->num_tokens > max_tokens_per_batch) {
     printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
-           BatchConfig::max_tokens_per_batch());
-    printf("Got: %d tokens\n", batch_config->num_tokens);
-  } else if (batch_config->num_tokens >
-                 BatchConfig::max_tokens_per_ssm_batch() &&
-             batch_config->get_mode() == TREE_SEARCH_MODE) {
-    printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
-           BatchConfig::max_tokens_per_ssm_batch());
+           max_tokens_per_batch);
     printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 15b0c9fb3..733cca745 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -46,17 +46,14 @@ void RequestManager::load_tokens_task(
 
   // Extreme long prompts are not supported, only load up to
   // BatchConfig::max_tokens_per_batch() as prompt
-  if (batch_config->num_tokens > BatchConfig::max_tokens_per_batch() &&
-      (batch_config->get_mode() == INC_DECODING_MODE ||
-       batch_config->get_mode() == TREE_VERIFY_MODE)) {
+  int max_tokens_per_batch =
+      std::max(batch_config->get_mode() == TREE_SEARCH_MODE
+                   ? BatchConfig::max_tokens_per_ssm_batch()
+                   : BatchConfig::max_tokens_per_batch(),
+               BatchConfig::max_tokens_per_prefilling_batch());
+  if (batch_config->num_tokens > max_tokens_per_batch) {
     printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
-           BatchConfig::max_tokens_per_batch());
-    printf("Got: %d tokens\n", batch_config->num_tokens);
-  } else if (batch_config->num_tokens >
-                 BatchConfig::max_tokens_per_ssm_batch() &&
-             batch_config->get_mode() == TREE_SEARCH_MODE) {
-    printf("Warning: too many tokens in prompt, only load up to %d tokens\n",
-           BatchConfig::max_tokens_per_ssm_batch());
+           max_tokens_per_batch);
     printf("Got: %d tokens\n", batch_config->num_tokens);
   }
 

From da91d84bfc9735e9b30185d46af875570468561b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 14 Sep 2024 15:50:10 -0700
Subject: [PATCH 473/667] feat: support batched prefilling

---
 src/runtime/request_manager.cc | 305 ++++++++++++++++++---------------
 1 file changed, 171 insertions(+), 134 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e568fffff..272564413 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
-#include "flexflow/request_manager.h"
 #include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
+#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -112,6 +112,7 @@ RequestManager::RequestManager()
   max_requests_per_batch = -1;
   max_tokens_per_batch = -1;
   max_tokens_per_ssm_batch = -1;
+  max_tokens_per_prefilling_batch = -1;
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
   max_tree_depth = -1;
@@ -147,6 +148,13 @@ void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) {
   assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS);
 }
 
+void RequestManager::set_max_tokens_per_prefilling_batch(int max_num_prefilling_tokens) {
+  assert(max_tokens_per_prefilling_batch == -1 ||
+         max_tokens_per_prefilling_batch == max_num_prefilling_tokens);
+  max_tokens_per_prefilling_batch = max_num_prefilling_tokens;
+  assert(max_tokens_per_prefilling_batch <= BatchConfig::MAX_NUM_TOKENS);
+}
+
 int RequestManager::get_max_tokens_per_batch() {
   assert(max_tokens_per_batch > 0);
   return max_tokens_per_batch;
@@ -157,6 +165,11 @@ int RequestManager::get_max_tokens_per_ssm_batch() {
   return max_tokens_per_ssm_batch;
 }
 
+int RequestManager::get_max_tokens_per_prefilling_batch() {
+  assert(max_tokens_per_prefilling_batch > 0);
+  return max_tokens_per_prefilling_batch;
+}
+
 int RequestManager::get_max_spec_tree_token_num() {
   assert(max_spec_tree_token_num > 0);
   return max_spec_tree_token_num;
@@ -521,7 +534,8 @@ BatchConfig RequestManager::get_next_batch_config_task(
   RequestManager *rm = *((RequestManager **)task->args);
   if (rm->request_manager_status == PREFILLING and rm->prefill_model == SSM and
       rm->current_ssm_step != 0) {
-    // Return an empty batch config
+    // Return an empty batch config, because we only need on step for SSM
+    // prefilling, and the rest is placeholder for scheduling
     return rm->get_next_batch_config(InferenceResult());
   } else if (rm->request_manager_status == SSM_SPEC and rm->ssm_completed) {
     return rm->get_next_batch_config(InferenceResult());
@@ -557,26 +571,30 @@ bool RequestManager::load_pending_request_to_batch() {
   }
   std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
   assert(!pending_request_queue.empty() && "No pending request to process.");
-  RequestGuid guid = pending_request_queue.front().guid;
-  pending_request_queue.pop();
-
-  prefill_request = &all_requests[guid];
-  prefill_request->status = Request::RUNNING;
-
-  // Find an empty slot
-  int request_index = get_empty_request_index();
-  assert(request_index != -1 && "No empty request slot to load the request.");
-  // Load request into batch
-  prefill_request->batch_index = request_index;
-  guid_of_requests[request_index] = guid;
-  request_available[request_index] = true;
-  num_available_requests++;
-  // Initialize the bitmask for the new request with its prompt length
-  init_bitmask_prompt(guid, prefill_request->tokens.size());
-
-  profiling_requests[guid] = RequestProfileInfo();
-  profiling_requests[guid].start_time =
-      Realm::Clock::current_time_in_microseconds();
+  while (num_available_requests < get_max_requests_per_batch() &&
+         !pending_request_queue.empty()) {
+    RequestGuid guid = pending_request_queue.front().guid;
+    pending_request_queue.pop();
+    Request *request = &all_requests[guid];
+
+    request->status = Request::RUNNING;
+    // Find an empty slot
+    int request_index = get_empty_request_index();
+    assert(request_index != -1 && "No empty request slot to load the request.");
+    // Load request into batch
+    request->batch_index = request_index;
+    guid_of_requests[request_index] = guid;
+    request_available[request_index] = true;
+    num_available_requests++;
+    // Initialize the bitmask for the new request with its prompt length
+    init_bitmask_prompt(guid, request->tokens.size());
+
+    prefill_requests.push_back(request);
+
+    profiling_requests[guid] = RequestProfileInfo();
+    profiling_requests[guid].start_time =
+        Realm::Clock::current_time_in_microseconds();
+  }
   return true;
 }
 
@@ -692,8 +710,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       if (decoding_mode == INCREMENTAL_DECODING) {
         if (update_llm_prefill_results(result)) {
           // This indicates that the prefilling of the current request
-          // finishes Reset the prefill_request
-          prefill_request = nullptr;
+          // finishes
 
           // Check if there are more empty slots
           if (num_available_requests < get_max_requests_per_batch() &&
@@ -723,7 +740,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         } else if (prefill_model == LLM) {
           if (update_llm_prefill_results(result)) {
             // This indicates that the prefilling phase finishes
-            prefill_request = nullptr;
+
             // Check if there are more empty slots
             if (num_available_requests < get_max_requests_per_batch() &&
                 load_pending_request_to_batch()) {
@@ -798,45 +815,53 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 }
 
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
-  bool prefill_completed = false;
-  if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
-    prefill_request->streaming_cache_info.commit_cache(
-        prefill_request->num_tokens_in_batch);
-    prefill_request->llm_cache_size =
-        prefill_request->streaming_cache_info.commit_len;
-  } else {
-    prefill_request->llm_cache_size += prefill_request->num_tokens_in_batch;
-  }
-  prefill_request->llm_prefill_len += prefill_request->num_tokens_in_batch;
+  int num_tokens = 0;
+  std::vector<Request *> incomplete_requests;
+  incomplete_requests.reserve(prefill_requests.size());
+  for (Request *request : prefill_requests) {
+    if (request->num_tokens_in_batch > 0) {
+      if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
+        request->streaming_cache_info.commit_cache(
+            request->num_tokens_in_batch);
+        request->llm_cache_size = request->streaming_cache_info.commit_len;
+      } else {
+        request->llm_cache_size += request->num_tokens_in_batch;
+      }
+      request->llm_prefill_len += request->num_tokens_in_batch;
 
-  if (prefill_request->llm_prefill_len == prefill_request->tokens.size()) {
-    // Indicates that the LLM prefilling phase finishes
-    prefill_request->tokens.push_back(
-        result.token_ids[prefill_request->num_tokens_in_batch - 1]);
-    prefill_completed = true;
+      if (request->llm_prefill_len == request->tokens.size()) {
+        // Indicates that the LLM prefilling phase finishes
+        request->tokens.push_back(
+            result.token_ids[num_tokens + request->num_tokens_in_batch - 1]);
 
-    if (prefill_request->tokens.back() == eos_token_id) {
-      request_complete_clean_up(prefill_request->batch_index, true);
-    }
+        if (request->tokens.back() == eos_token_id) {
+          request_complete_clean_up(request->batch_index, true);
+        }
 
-    if (decoding_mode == SPECULATIVE_DECODING) {
-      // Add the last token to the token tree
-      assert(prefill_request->committed_tokens.empty() &&
-             "The committed tokens should be empty.");
-      prefill_request->committed_tokens.push_back(
-          Request::CommittedToken{-1,
-                                  (int)prefill_request->tokens.size() - 1,
-                                  prefill_request->tokens.back()});
-      init_token_tree(prefill_request->guid);
-      add_root_to_spec_token_tree(prefill_request->guid,
-                                  prefill_request->tokens.back());
-      update_bitmask_prompt(prefill_request->guid, 1);
+        if (decoding_mode == SPECULATIVE_DECODING) {
+          // Add the last token to the token tree
+          assert(request->committed_tokens.empty() &&
+                 "The committed tokens should be empty.");
+          request->committed_tokens.push_back(Request::CommittedToken{
+              -1, (int)request->tokens.size() - 1, request->tokens.back()});
+          init_token_tree(request->guid);
+          add_root_to_spec_token_tree(request->guid, request->tokens.back());
+          update_bitmask_prompt(request->guid, 1);
+        }
+      } else {
+        // Next phase will still be prefilling
+        incomplete_requests.push_back(request);
+      }
+      profiling_requests[request->guid].llm_prefilling_steps++;
+      num_tokens += request->num_tokens_in_batch;
+    } else if (request->llm_prefill_len < request->tokens.size()) {
+      // The request is not completed, continue prefilling
+      incomplete_requests.push_back(request);
     }
   }
 
-  profiling_requests[prefill_request->guid].llm_prefilling_steps++;
-
-  return prefill_completed;
+  prefill_requests.swap(incomplete_requests);
+  return prefill_requests.empty();
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
@@ -888,17 +913,20 @@ void RequestManager::update_ssm_prefill_results(
   // This function is called by update_inference_results when the
   // request_manager_status is PREFILLING and the prefill_model is SSM.
   // There's no results to update, but we should update ssm_cache_size.
-  if (streaming_cache) {
-    prefill_request->streaming_cache_info.commit_cache(
-        prefill_request->num_tokens_in_batch);
-    prefill_request->ssm_cache_size =
-        prefill_request->streaming_cache_info.commit_len;
-  } else {
-    prefill_request->ssm_cache_size += prefill_request->num_tokens_in_batch;
-  }
-  prefill_request->ssm_prefill_len += prefill_request->num_tokens_in_batch;
+  for (Request *request : prefill_requests) {
+    if (request->num_tokens_in_batch > 0) {
+      if (streaming_cache) {
+        request->streaming_cache_info.commit_cache(
+            request->num_tokens_in_batch);
+        request->ssm_cache_size = request->streaming_cache_info.commit_len;
+      } else {
+        request->ssm_cache_size += request->num_tokens_in_batch;
+      }
+      request->ssm_prefill_len += request->num_tokens_in_batch;
 
-  profiling_requests[prefill_request->guid].ssm_prefilling_steps++;
+      profiling_requests[request->guid].ssm_prefilling_steps++;
+    }
+  }
 }
 
 BatchConfig RequestManager::prepare_next_batch() {
@@ -955,7 +983,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     std::cout << "\n############### prepare_llm_prefilling_batch "
                  "##############\n";
   }
-  assert(prefill_request != nullptr &&
+  assert(prefill_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
@@ -965,42 +993,45 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
   }
   bc.prompt_phase = true;
-  bc.request_available[prefill_request->batch_index] = true;
-  bc.num_available_requests = 1;
+  int num_tokens = 0;
+  for (Request *request : prefill_requests) {
+    int request_index = request->batch_index;
+    bc.request_available[request_index] = true;
 
-  int request_index = prefill_request->batch_index;
-  RequestGuid guid = guid_of_requests[request_index];
-  Request &request = all_requests[guid];
-  assert(request.status == Request::RUNNING);
-
-  // Request Info
-  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].first_token_index_in_request =
-      prefill_request->llm_cache_size;
-  int num_tokens_in_batch = std::min(get_max_tokens_per_batch(),
-                                     (int)prefill_request->tokens.size() -
-                                         prefill_request->llm_prefill_len);
-  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
-
-  // Copy the streaming cache info
-  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
-
-  prefill_request->first_token_offset_in_batch = 0;
-  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
-
-  // Token Info
-  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
-    int abs_idx = prefill_request->llm_cache_size + token_idx;
-    assert(abs_idx < prefill_request->tokens.size());
-
-    bc.tokensInfo[token_idx].request_index = request_index;
-    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-    bc.tokensInfo[token_idx].token_id =
-        prefill_request->tokens[prefill_request->llm_prefill_len + token_idx];
+    assert(request->status == Request::RUNNING);
 
-    bc.num_tokens++;
+    // Request Info
+    bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens;
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        request->llm_cache_size;
+    int num_tokens_in_batch =
+        std::min(get_max_tokens_per_prefilling_batch() - num_tokens,
+                 (int)request->tokens.size() - request->llm_prefill_len);
+    num_tokens_in_batch = std::max(num_tokens_in_batch, 0);
+    bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
+
+    // Copy the streaming cache info
+    bc.streamingCacheInfo[request_index] = request->streaming_cache_info;
+
+    request->first_token_offset_in_batch = num_tokens;
+    request->num_tokens_in_batch = num_tokens_in_batch;
+
+    // Token Info
+    for (int idx = 0; idx < num_tokens_in_batch; idx++) {
+      int token_idx = num_tokens + idx;
+      int abs_idx = request->llm_cache_size + idx;
+
+      bc.tokensInfo[token_idx].request_index = request_index;
+      bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+      bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+      assert(request->llm_prefill_len + idx < request->tokens.size());
+      bc.tokensInfo[token_idx].token_id =
+          request->tokens[request->llm_prefill_len + idx];
+    }
+    num_tokens += num_tokens_in_batch;
   }
+  bc.num_available_requests = prefill_requests.size();
+  bc.num_tokens = num_tokens;
 
   if (verbose) {
     std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
@@ -1018,45 +1049,51 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     std::cout << "\n############### prepare_ssm_prefilling_batch "
                  "##############\n";
   }
-  assert(prefill_request != nullptr &&
+  assert(prefill_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
   bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   bc.prompt_phase = true;
-  // Only set the prefilling request to be available
-  bc.request_available[prefill_request->batch_index] = true;
-  bc.num_available_requests = 1;
-
-  int request_index = prefill_request->batch_index;
-  // Request Info
-  bc.requestsInfo[request_index].first_token_offset_in_batch = 0;
-  bc.requestsInfo[request_index].first_token_index_in_request =
-      prefill_request->ssm_cache_size;
-  int num_tokens_in_batch = std::min(get_max_tokens_per_ssm_batch(),
-                                     (int)prefill_request->tokens.size() -
-                                         prefill_request->ssm_prefill_len);
-  bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
-
-  // Copy the streaming cache info
-  bc.streamingCacheInfo[request_index] = prefill_request->streaming_cache_info;
-
-  prefill_request->first_token_offset_in_batch = 0;
-  prefill_request->num_tokens_in_batch = num_tokens_in_batch;
-
-  // Token Info
-  for (int token_idx = 0; token_idx < num_tokens_in_batch; token_idx++) {
-    int abs_idx = prefill_request->ssm_cache_size + token_idx;
-    assert(abs_idx < prefill_request->tokens.size());
-
-    bc.tokensInfo[token_idx].request_index = request_index;
-    bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
-    bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
-    bc.tokensInfo[token_idx].token_id =
-        prefill_request->tokens[prefill_request->ssm_prefill_len + token_idx];
+  int num_tokens = 0;
+  for (Request *request : prefill_requests) {
+    int request_index = request->batch_index;
+    // Only set the prefilling request to be available
+    bc.request_available[request_index] = true;
+
+    // Request Info
+    bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens;
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        request->ssm_cache_size;
+    int num_tokens_in_batch =
+        std::min(get_max_tokens_per_prefilling_batch() - num_tokens,
+                 (int)request->tokens.size() - request->ssm_prefill_len);
+    num_tokens_in_batch = std::max(num_tokens_in_batch, 0);
+    bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
 
-    bc.num_tokens++;
+    // Copy the streaming cache info
+    bc.streamingCacheInfo[request_index] =
+        request->streaming_cache_info;
+
+    request->first_token_offset_in_batch = num_tokens;
+    request->num_tokens_in_batch = num_tokens_in_batch;
+
+    // Token Info
+    for (int idx = 0; idx < num_tokens_in_batch; idx++) {
+      int token_idx = num_tokens + idx;
+      int abs_idx = request->ssm_cache_size + idx;
+
+      bc.tokensInfo[token_idx].request_index = request_index;
+      bc.tokensInfo[token_idx].abs_index_in_request = abs_idx;
+      bc.tokensInfo[token_idx].abs_depth_in_request = abs_idx;
+      assert(request->ssm_prefill_len + idx < request->tokens.size());
+      bc.tokensInfo[token_idx].token_id =
+          request->tokens[request->ssm_prefill_len + idx];
+    }
+    num_tokens += num_tokens_in_batch;
   }
+  bc.num_available_requests = prefill_requests.size();
+  bc.num_tokens = num_tokens;
 
   if (verbose) {
     std::cout << "prepare_ssm_prefilling_batch NEW batchconfig:" << std::endl;

From 1637ed494b0edc060305ceb86e7a027e642b5dcc Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 14 Sep 2024 18:52:17 -0700
Subject: [PATCH 474/667] style: format

---
 src/ops/inc_multihead_self_attention.cpp | 2 +-
 src/runtime/batch_config.cc              | 3 ++-
 src/runtime/request_manager.cc           | 8 ++++----
 src/runtime/request_manager.cpp          | 2 +-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 59f337067..ed2caea7e 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
+#include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ffconst.h"
 #include "flexflow/ffconst_utils.h"
-#include "flexflow/ops/inc_multihead_self_attention.h"
 #include "flexflow/ops/kernels/decompress_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/utils/hip_helper.h"
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index d932da493..5981c6ce9 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -95,7 +95,8 @@ int BatchConfig::max_tokens_per_ssm_batch() {
 
 /*static*/
 int BatchConfig::max_tokens_per_prefilling_batch() {
-  return RequestManager::get_request_manager()->get_max_tokens_per_prefilling_batch();
+  return RequestManager::get_request_manager()
+      ->get_max_tokens_per_prefilling_batch();
 }
 
 /*static*/
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8c6d64337..6115e0ba0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
+#include "flexflow/request_manager.h"
 #include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
-#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -148,7 +148,8 @@ void RequestManager::set_max_tokens_per_ssm_batch(int max_num_ssm_tokens) {
   assert(max_tokens_per_ssm_batch <= BatchConfig::MAX_NUM_TOKENS);
 }
 
-void RequestManager::set_max_tokens_per_prefilling_batch(int max_num_prefilling_tokens) {
+void RequestManager::set_max_tokens_per_prefilling_batch(
+    int max_num_prefilling_tokens) {
   assert(max_tokens_per_prefilling_batch == -1 ||
          max_tokens_per_prefilling_batch == max_num_prefilling_tokens);
   max_tokens_per_prefilling_batch = max_num_prefilling_tokens;
@@ -1070,8 +1071,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     bc.requestsInfo[request_index].num_tokens_in_batch = num_tokens_in_batch;
 
     // Copy the streaming cache info
-    bc.streamingCacheInfo[request_index] =
-        request->streaming_cache_info;
+    bc.streamingCacheInfo[request_index] = request->streaming_cache_info;
 
     request->first_token_offset_in_batch = num_tokens;
     request->num_tokens_in_batch = num_tokens_in_batch;
diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp
index 7fa82a653..e3e5a5d5f 100644
--- a/src/runtime/request_manager.cpp
+++ b/src/runtime/request_manager.cpp
@@ -13,8 +13,8 @@
  * limitations under the License.
  */
 
-#include "flexflow/ffconst.h"
 #include "flexflow/request_manager.h"
+#include "flexflow/ffconst.h"
 #include "flexflow/utils/hip_helper.h"
 #include <hip/hip_runtime.h>
 

From bcb028c97d044f82c057ecd8daf593e41e017608 Mon Sep 17 00:00:00 2001
From: zikun-li <lizikunzk@gmail.com>
Date: Sun, 15 Sep 2024 01:01:22 -0400
Subject: [PATCH 475/667] Add a switch for early termination based on slo
 attainment.

---
 include/flexflow/request_manager.h |  3 +++
 inference/spec_infer/spec_infer.cc | 39 +++++++++++++++++++++++++++---
 src/runtime/request_manager.cc     | 24 ++++++++++--------
 3 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 40c8adb5b..a8ec3b946 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -261,6 +261,8 @@ class RequestManager {
   void set_streaming_cache(bool streaming_cache);
   bool get_memory_occupancy();
   void set_memory_occupancy(bool memory_occupancy);
+  void
+      set_slo_violation_early_termination(bool slo_violation_early_termination);
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
@@ -361,6 +363,7 @@ class RequestManager {
   // specify if enable streaming cache for incremental decoding or draft model
   bool streaming_cache = false;
   bool memory_occupancy = false;
+  bool slo_violation_early_termination = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 439ff3d15..2742c8491 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -70,7 +70,11 @@ void parse_input_args(char **argv,
                       bool &spec_sampling,
                       bool &do_sample,
                       int &sampling_seed,
-                      bool &streaming_cache) {
+                      bool &streaming_cache,
+                      bool &slo_attainment_early_termination,
+                      int &baseline_latency_ms,
+                      int &ssm_spec_latency_ms,
+                      int &llm_verify_latency_ms) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -158,6 +162,22 @@ void parse_input_args(char **argv,
       streaming_cache = true;
       continue;
     }
+    if (!strcmp(argv[i], "--slo-attainment-early-termination")) {
+      slo_attainment_early_termination = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--baseline-latency-ms")) {
+      baseline_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--ssm-spec-latency-ms")) {
+      ssm_spec_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--llm-verify-latency-ms")) {
+      llm_verify_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -323,6 +343,10 @@ void FlexFlow::top_level_task(Task const *task,
   bool do_sample = false;
   int sampling_seed = 0;
   bool streaming_cache = false;
+  bool slo_attainment_early_termination = false;
+  int baseline_latency_ms = 50;
+  int ssm_spec_latency_ms = 20;
+  int llm_verify_latency_ms = 50;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -343,7 +367,11 @@ void FlexFlow::top_level_task(Task const *task,
                    spec_sampling,
                    do_sample,
                    sampling_seed,
-                   streaming_cache);
+                   streaming_cache,
+                   slo_attainment_early_termination,
+                   baseline_latency_ms,
+                   ssm_spec_latency_ms,
+                   llm_verify_latency_ms);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -372,6 +400,10 @@ void FlexFlow::top_level_task(Task const *task,
                          model_metadata.eos_token_id,
                          model_metadata.llm_tokenizer_path);
   rm->set_decoding_mode(decoding_mode);
+  rm->set_slo_violation_early_termination(slo_attainment_early_termination);
+  rm->set_baseline_latency(baseline_latency_ms);
+  rm->set_ssm_spec_latency(ssm_spec_latency_ms);
+  rm->set_llm_verify_latency(llm_verify_latency_ms);
   rm->register_output_filepath(file_paths.output_file_path);
 
   // Create LLM model
@@ -483,7 +515,8 @@ void FlexFlow::top_level_task(Task const *task,
       total_num_requests++;
       requests.push_back(GenerationRequest(text, slo_ratio));
     }
-    PoissonEmissionMachine emission_machine(1.0);
+    // PoissonEmissionMachine emission_machine(1.0);
+    ConstantEmissionMachine emission_machine(-1);
     tree_model.generate(requests, emission_machine);
   }
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 89054ba20..c13ce1172 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -275,6 +275,11 @@ void RequestManager::set_memory_occupancy(bool memory_occupancy_) {
   memory_occupancy = memory_occupancy_;
 }
 
+void RequestManager::set_slo_violation_early_termination(
+    bool slo_violation_early_termination_) {
+  slo_violation_early_termination = slo_violation_early_termination_;
+}
+
 double RequestManager::get_request_expected_latency(Request &request) {
   return request.get_slo_ratio() * baseline_latency_ms *
          (request.tokens.size() - request.llm_prefill_len);
@@ -1429,7 +1434,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     request.num_tokens_in_batch = token_tree_index;
 
     // TODO: REMOVE THIS OUTPUT
-    // std::cout << "Request " << request_index
+    // std::cout << "Request " << request_index << " Guid " << guid
     //           << " token tree size: " << request.num_tokens_in_batch
     //           << std::endl;
     // std::cout << "Request " << guid << " token tree: " << std::endl;
@@ -1538,8 +1543,9 @@ bool RequestManager::update_llm_verify_results(
       // Request is completed
       request_completed = true;
       request_complete_clean_up(request_index, true);
-    } else if (request.decode_latency_ms >
-               get_request_expected_latency(request)) {
+    } else if (slo_violation_early_termination and
+               request.decode_latency_ms >
+                   get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
       request_completed = true;
       request_complete_clean_up(request_index, false);
@@ -2669,7 +2675,6 @@ void RequestManager::prune_token_tree() {
     assert(request.status == Request::RUNNING);
     double spare_latency =
         get_request_expected_latency(request) - request.decode_latency_ms;
-    assert(spare_latency >= 0.0);
     spare_latency_2_request_index.push_back(
         std::make_pair(spare_latency, request_index));
   }
@@ -2843,12 +2848,11 @@ std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
     os << "Layer: " << layer_idx << std::endl;
     int token_pos = 0;
     for (auto const &node : layer) {
-      if (node->included) {
-        os << std::fixed << std::setprecision(12);
-        os << "token pos: " << token_pos << "\ttoken id: " << node->id
-           << "\tparent pos: " << node->parent_pos
-           << "\tlog prob: " << node->log_accumulated_prob << std::endl;
-      }
+      os << std::fixed << std::setprecision(12);
+      os << "token pos: " << token_pos << "\ttoken id: " << node->id
+         << "\tparent pos: " << node->parent_pos
+         << "\tlog prob: " << node->log_accumulated_prob
+         << (node->included ? " included" : " not included") << std::endl;
       token_pos++;
     }
     layer_idx++;

From 06d332cc09e339d2d331d6da75bda50c53fdea49 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 14 Sep 2024 23:09:10 -0700
Subject: [PATCH 476/667] fix: memory misalignment

---
 include/flexflow/attention_config.h |  5 +++--
 include/flexflow/batch_config.h     |  4 ++++
 include/flexflow/config.h           | 10 ++++++----
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h
index 7144b7ab3..558246867 100644
--- a/include/flexflow/attention_config.h
+++ b/include/flexflow/attention_config.h
@@ -114,8 +114,9 @@ class AttentionMetaData {
     workspace_size =
         float_workspace_size + int_workspace_size; // float + int workspace
 
-    mem_size_ = sizeof(int32_t) * indices_size +
-                sizeof(uint8_t) * custom_mask_size + workspace_size;
+    mem_size_ = alignTo(sizeof(int32_t) * indices_size +
+                            sizeof(uint8_t) * custom_mask_size + workspace_size,
+                        16);
     return mem_size_;
   }
 
diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index abae06184..3cc5f650a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -22,6 +22,10 @@
 
 namespace FlexFlow {
 
+inline int alignTo(int x, int y) {
+  return ((x + y - 1) / y) * y;
+}
+
 class InferenceResult;
 
 using BatchConfigFuture = Legion::Future;
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 48b0450b6..e1f3f1904 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -83,11 +83,13 @@ struct FFHandler {
   AttentionMetaData *tree_search_attention_metadata;
   AttentionMetaData *tree_verify_attention_metadata;
 
-  size_t batch_config_metadata_size =
+  size_t batch_config_metadata_size = alignTo(
       sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) +
-      sizeof(BatchConfig::request_available) + sizeof(BatchConfig::causalMask) +
-      sizeof(BatchConfig::streamingCacheInfo) +
-      sizeof(BatchConfig::committed_tokens) + sizeof(int);
+          sizeof(BatchConfig::request_available) +
+          sizeof(BatchConfig::causalMask) +
+          sizeof(BatchConfig::streamingCacheInfo) +
+          sizeof(BatchConfig::committed_tokens) + sizeof(int),
+      16);
 
   void *offload_reserve_space;
   size_t offload_reserve_space_size;

From 5ddeb1176707031a785d2fafa8c96b34fa4667b6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 15 Sep 2024 19:30:34 -0700
Subject: [PATCH 477/667] chore: minor

---
 include/flexflow/batch_config.h | 4 ++--
 src/runtime/request_manager.cc  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 3cc5f650a..ff48bb17f 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -88,10 +88,10 @@ class BatchConfig {
   inline static int const MAX_NUM_REQUESTS = 64;
   inline static int const MAX_NUM_TOKENS = 1024;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
-  inline static int const MAX_TREE_DEPTH = 16;
+  inline static int const MAX_TREE_DEPTH = 8;
   inline static int const MAX_TREE_WIDTH = 16;
   inline static int const MAX_SPEC_TREE_TOKEN_NUM =
-      MAX_TREE_DEPTH * MAX_TREE_WIDTH + 1;
+      MAX_TREE_DEPTH * MAX_TREE_WIDTH;
   inline static int const MAX_K_LOGITS = 16;
 
   // The Constants for the Streaming KVCache
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54e3e2a2b..61d91f173 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -218,7 +218,7 @@ void RequestManager::set_max_tree_depth(int max_tree_depth) {
          "Invalid max_tree_depth");
   this->max_tree_depth = max_tree_depth;
   if (max_tree_width > 0) {
-    max_spec_tree_token_num = max_tree_depth * max_tree_width + 1;
+    max_spec_tree_token_num = max_tree_depth * max_tree_width;
     assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
   }
 }
@@ -236,7 +236,7 @@ void RequestManager::set_max_tree_width(int max_tree_width) {
          "Invalid max_tree_width");
   this->max_tree_width = max_tree_width;
   if (max_tree_depth > 0) {
-    max_spec_tree_token_num = max_tree_depth * max_tree_width + 1;
+    max_spec_tree_token_num = max_tree_depth * max_tree_width;
     assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
   }
 }

From fd6eb7b4b9eff0d3c7019f2c4ba67d089f4a3225 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 15 Sep 2024 19:36:30 -0700
Subject: [PATCH 478/667] Reimplemented add_tokens_to_spec_token_tree.

---
 include/flexflow/request_manager.h |   7 +-
 src/runtime/request_manager.cc     | 148 +++++++++++------------------
 2 files changed, 61 insertions(+), 94 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7530dd047..23e325278 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -96,14 +96,19 @@ struct SharedTokenTreeNodePtrLess {
 
 class TokenTree {
 public:
-  std::list<std::list<shared_ptr<TokenTreeNode>>> tree_layers = {};
+  std::vector<std::vector<std::shared_ptr<TokenTreeNode>>> tree_layers = {};
   void add_layer() {
     tree_layers.emplace_back();
+    tree_layers.back().reserve(BatchConfig::MAX_TREE_WIDTH);
   }
 
   void clear() {
     tree_layers.clear();
   }
+
+  TokenTree() {
+    tree_layers.reserve(BatchConfig::MAX_TREE_DEPTH + 1);
+  }
 };
 
 std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54e3e2a2b..54c477e93 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -196,12 +196,14 @@ void RequestManager::set_verbose(bool verbose_) {
 }
 
 int RequestManager::get_k() {
-  assert(k > 0 and k <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM and "Invalid k");
+  assert(k > 0 and k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and
+         "Invalid k");
   return k;
 }
 
 void RequestManager::set_k(int _k) {
-  assert(_k > 0 and _k <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM and "Invalid k");
+  assert(_k > 0 and _k <= BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES and
+         "Invalid k");
   k = _k;
 }
 
@@ -1329,7 +1331,7 @@ BatchConfig RequestManager::prepare_next_spec_batch_config() {
       request.first_token_offset_in_batch = new_bc.num_tokens;
       continue;
     } else {
-      std::list<std::shared_ptr<TokenTreeNode>> &current_layer =
+      std::vector<std::shared_ptr<TokenTreeNode>> &current_layer =
           token_tree.tree_layers.back();
       // Exclude the current layer from the token tree, because we want the
       // start index
@@ -1721,7 +1723,7 @@ void RequestManager::append_bitmask(RequestGuid guid) {
     // inference steps, skip it
     return;
   }
-  std::list<std::shared_ptr<TokenTreeNode>> &tree_layer =
+  std::vector<std::shared_ptr<TokenTreeNode>> &tree_layer =
       request.speculative_token_trees[0].tree_layers.back();
   int new_layer_size = tree_layer.size();
   int last_layer_size = bitmask.current_layer_size;
@@ -1902,7 +1904,7 @@ void RequestManager::get_verify_results_sample(
     ++layer_it;
     for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
       // We skip the first layer
-      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
+      std::vector<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
       std::vector<std::pair<TokenId, float>> D;
       std::unordered_map<TokenId, float> R;
       // Data format: <current_token_index, current_token_index_in_layer,
@@ -2038,11 +2040,11 @@ void RequestManager::get_verify_results_greedy(
     int last_accepted_token_index = 0;
 
     int current_token_index = 1; // Because we skip the root
-    auto layer_it = token_tree.tree_layers.begin();
-    ++layer_it;
-    for (; layer_it != token_tree.tree_layers.end(); ++layer_it) {
-      // We skip the first layer
-      std::list<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
+                                 // We skip the first layer
+    for (auto layer_it = token_tree.tree_layers.begin() + 1;
+         layer_it != token_tree.tree_layers.end();
+         ++layer_it) {
+      std::vector<std::shared_ptr<TokenTreeNode>> const &tree_layer = *layer_it;
 
       bool token_accepted_this_layer = false;
       int current_token_index_in_layer = 0;
@@ -2568,11 +2570,13 @@ void RequestManager::add_root_to_spec_token_tree(
   if (speculative_sampling) {
     node_ptr->gumbel = true;
   }
-  speculative_token_tree.tree_layers.front().push_back(node_ptr);
+  speculative_token_tree.tree_layers[0].push_back(node_ptr);
 }
 
 void RequestManager::add_tokens_to_spec_token_tree(
     InferenceResult const &ssm_inference_result) {
+  // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
+  // TODO: support gumbel sampling
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -2588,110 +2592,68 @@ void RequestManager::add_tokens_to_spec_token_tree(
     if (parent_num == 0) {
       continue;
     }
+
     int result_offset = request.first_token_offset_in_batch *
                         BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
-    std::list<std::shared_ptr<TokenTreeNode>> &last_layer =
+    std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();
-    std::set<std::shared_ptr<TokenTreeNode>, SharedTokenTreeNodePtrLess> tokens;
+    std::priority_queue<std::pair<double, int>,
+                        std::vector<std::pair<double, int>>,
+                        std::greater<std::pair<double, int>>>
+        child_probs_pq;
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
-      // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
       double parent_log_prob = parent_ptr->log_accumulated_prob;
       int child_start_idx =
           result_offset +
           parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-      // TODO: rename child_probs to child_logits after change the output of
-      // argmax from prob to logprob
-      std::vector<std::pair<double, int>> child_probs;
       for (int child_pos = 0;
            child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
            child_pos++) {
         int result_idx = child_start_idx + child_pos;
-        if (!speculative_sampling) {
-          // TODO: the argmax will return log prob instead of prob
-          double log_prob = log((double)ssm_inference_result.probs[result_idx]);
-          if (log_prob == 0.0) {
-            // Slightly perturb the log prob to make it strictly less than 0
-            log_prob -= 1e-10;
-          }
-          if (log_prob != -std::numeric_limits<double>::infinity()) {
-            child_probs.push_back(std::make_pair(log_prob, result_idx));
-          }
-        } else {
-          // Use gumbel perturbed logits here
-          // TODO: handle the case when the child logit is -inf
-          // TODO: this branch is not tested
-          child_probs.push_back(std::make_pair(
-              ssm_inference_result.gumbel_logits[result_idx], result_idx));
+        double log_prob = log((double)ssm_inference_result.probs[result_idx]);
+        if (log_prob == -std::numeric_limits<double>::infinity()) {
+          continue;
         }
-      }
-      // Sort in descending order
-      std::sort(child_probs.begin(),
-                child_probs.end(),
-                std::greater<std::pair<double, int>>());
-      if (speculative_sampling) {
-        // TODO: this branch is not tested
-        // Condition the gumbel perturbed logits on the maximum
-        gumbel_conditioned_on_max(parent_ptr->gumbel_logit, child_probs);
-      }
-
-      for (auto const &child_prob : child_probs) {
-        double logit = child_prob.first;
-        // The value used to compare between tokens
-        double accumulated_log_prob = logit + parent_log_prob;
-        double gumbel_logit = 0.0f;
-        double cmp_value;
-        if (speculative_sampling) {
-          cmp_value = gumbel_logit = logit;
-        } else {
-          cmp_value = accumulated_log_prob;
+        if (log_prob == 0.0) {
+          // Slightly perturb the log prob to make it strictly less than 0
+          log_prob -= 1e-10;
         }
-        int result_idx = child_prob.second;
-
-        if (tokens.size() == max_tree_width and
-            cmp_value <= (speculative_sampling
-                              ? (*tokens.begin())->gumbel_logit
-                              : (*tokens.begin())->log_accumulated_prob)) {
-          // The current layer is full, and the new token has a lower compare
-          // value than the minimum node in tokens, we don't need to add the
-          // new token and the following tokens belong to the same parent to
-          // it, because the tokens are sorted by their compare value
-          break;
-        } else {
-          std::shared_ptr<TokenTreeNode> node_ptr(nullptr);
-          if (speculative_sampling) {
-            node_ptr = std::make_shared<TokenTreeNode>(
-                ssm_inference_result.token_ids[result_idx],
-                accumulated_log_prob,
-                parent_pos,
-                true,
-                gumbel_logit);
-          } else {
-            node_ptr = std::make_shared<TokenTreeNode>(
-                ssm_inference_result.token_ids[result_idx],
-                accumulated_log_prob,
-                parent_pos);
-          }
-          if (tokens.size() == max_tree_width) {
-            // The current layer is full, and the new token has a higher
-            // compare value than the minimum node in tokens, we need to
-            // remove the minimum node from tokens and add the new token to it
-            tokens.erase(tokens.begin());
-          }
-          tokens.insert(node_ptr);
+
+        double accumulated_log_prob = log_prob + parent_log_prob;
+        if (child_probs_pq.size() == get_max_tree_width() and
+            accumulated_log_prob > child_probs_pq.top().first) {
+          // The current layer is full, and the new token has a higher
+          // log prob than the minimum node in tokens, we don't need to add
+          // the new token to the priority queue, and remove the minimum node
+          // from the priority queue
+          child_probs_pq.pop();
+        } else if (child_probs_pq.size() == get_max_tree_width()) {
+          // The current layer is full, and the new token has a lower log prob
+          // than the minimum node in tokens, we don't need to add the new token
+          // to the priority queue
+          continue;
         }
+        child_probs_pq.push(std::make_pair(accumulated_log_prob, result_idx));
       }
       parent_pos++;
     }
 
-    // Now add all tokens in the set to the token tree
     spec_token_tree.add_layer();
-    for (auto token_it = tokens.cbegin(); token_it != tokens.cend();
-         token_it++) {
-      assert((*token_it)->log_accumulated_prob != 0.0);
-      spec_token_tree.tree_layers.back().push_back((*token_it));
-      request.token_tree_nodes_pq.push((*token_it));
+    while (!child_probs_pq.empty()) {
+      std::pair<double, int> child_pair = child_probs_pq.top();
+      child_probs_pq.pop();
+      double accumulated_log_prob = child_pair.first;
+      int result_idx = child_pair.second;
+      int parent_pos = (result_idx - result_offset) /
+                       BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      std::shared_ptr<TokenTreeNode> node_ptr = std::make_shared<TokenTreeNode>(
+          ssm_inference_result.token_ids[result_idx],
+          accumulated_log_prob,
+          parent_pos);
+      spec_token_tree.tree_layers.back().push_back(node_ptr);
+      request.token_tree_nodes_pq.push(node_ptr);
     }
   }
 }

From 5623fc5a892ae0116100a89b8b1804e503c73bf5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 15 Sep 2024 22:30:56 -0700
Subject: [PATCH 479/667] chore: refactor lock

---
 include/flexflow/request_manager.h |  4 +--
 src/runtime/request_manager.cc     | 41 +++++++++++++++++++-----------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7530dd047..e0616e4b0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -378,6 +378,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, Request> all_requests;
   std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
   std::mutex request_queue_mutex;
+  std::mutex request_result_mutex;
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
@@ -394,9 +395,6 @@ class RequestManager {
   int num_available_requests = 0;
   int ssm_completed = true;
 
-  // rm state
-  std::mutex rm_state_mutex;
-
   // Multi-model support
   std::vector<FFModel *> ssm_models;
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 61d91f173..5656b378e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -404,7 +404,6 @@ size_t RequestManager::get_num_ssms() {
 
 RequestManager::RequestGuid
     RequestManager::register_new_request(GenerationRequest const &req) {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
   // Add a new request
   Request request;
   request.status = Request::PENDING;
@@ -443,12 +442,26 @@ RequestManager::RequestGuid
       BatchConfig::MAX_STREAMING_POS - BatchConfig::SINK_SIZE -
           BatchConfig::get_max_tree_depth());
 
-  pending_request_queue.push(request);
-  all_requests[request.guid] = request;
+  GenerationResult gr;
+  gr.guid = request.guid;
+  gr.input_text = req.prompt;
+  gr.input_tokens = request.tokens;
+  gr.output_text = req.prompt;
+  gr.output_tokens = request.tokens;
+
+  {
+    std::lock_guard<std::mutex> const lock(request_queue_mutex);
+    pending_request_queue.push(request);
+    all_requests[request.guid] = request;
+  }
   {
     std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
   }
+  {
+    std::lock_guard<std::mutex> const lock(request_result_mutex);
+    request_generation_results[request.guid] = gr;
+  }
 
   {
     std::string output = "New request tokens:";
@@ -460,13 +473,6 @@ RequestManager::RequestGuid
     write_to_output_file("", output);
   }
 
-  GenerationResult gr;
-  gr.guid = request.guid;
-  gr.input_text = req.prompt;
-  gr.input_tokens = request.tokens;
-  gr.output_text = req.prompt;
-  gr.output_tokens = request.tokens;
-  request_generation_results[request.guid] = gr;
   return request.guid;
 }
 
@@ -491,7 +497,7 @@ GenerationResult
   future.get();
   // Get the generation result
   {
-    std::lock_guard<std::mutex> const lock(request_queue_mutex);
+    std::lock_guard<std::mutex> const lock(request_result_mutex);
     assert(request_generation_results.find(guid) !=
            request_generation_results.end());
     return request_generation_results[guid];
@@ -627,6 +633,15 @@ void RequestManager::request_complete_clean_up(int batch_index, bool attained) {
   std::string output =
       this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
 
+  {
+    std::lock_guard<std::mutex> const lock(request_result_mutex);
+    request_generation_results[guid].output_text = output;
+    request_generation_results[guid].output_tokens =
+        std::vector<int>(bos_it, eos_it);
+  }
+
+  trigger_request_completion_future(guid);
+
   std::cout << "Request " << guid << " completed: " << std::endl << std::endl;
   std::cout << "<bos>" << output;
   if (eos_rit != request.tokens.rend()) {
@@ -689,14 +704,10 @@ void RequestManager::request_complete_clean_up(int batch_index, bool attained) {
   //         std::to_string(profile_info.ssm_decoding_steps) + ")";
   // }
   // write_to_output_file("", str);
-
-  trigger_request_completion_future(guid);
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
-  std::lock_guard<std::mutex> const rm_state_lock(rm_state_mutex);
-
   if (num_available_requests == 0) {
     // Update nothing
     // Load the pending request to the batch

From f524aac884f24a33d49f9e1569b9df5ba9c7015c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 16 Sep 2024 18:35:29 -0700
Subject: [PATCH 480/667] fix: request per batch

---
 src/runtime/request_manager.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5656b378e..d5c3272e6 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1008,6 +1008,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     bc.inference_mode = InferenceMode::TREE_VERIFY_MODE;
   }
   bc.prompt_phase = true;
+  bc.num_available_requests = 0;
   int num_tokens = 0;
   for (Request *request : prefill_requests) {
     int request_index = request->batch_index;
@@ -1044,8 +1045,10 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
           request->tokens[request->llm_prefill_len + idx];
     }
     num_tokens += num_tokens_in_batch;
+    if (num_tokens_in_batch > 0) {
+      bc.num_available_requests++;
+    }
   }
-  bc.num_available_requests = prefill_requests.size();
   bc.num_tokens = num_tokens;
 
   if (verbose) {
@@ -1070,6 +1073,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   BatchConfig bc;
   bc.inference_mode = InferenceMode::TREE_SEARCH_MODE;
   bc.prompt_phase = true;
+  bc.num_available_requests = 0;
   int num_tokens = 0;
   for (Request *request : prefill_requests) {
     int request_index = request->batch_index;
@@ -1105,8 +1109,10 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
           request->tokens[request->ssm_prefill_len + idx];
     }
     num_tokens += num_tokens_in_batch;
+    if (num_tokens_in_batch > 0) {
+      bc.num_available_requests++;
+    }
   }
-  bc.num_available_requests = prefill_requests.size();
   bc.num_tokens = num_tokens;
 
   if (verbose) {

From 0b7a02f8b42b76ee5cf6fbda2cf452518343e3e3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 17 Sep 2024 21:53:01 -0700
Subject: [PATCH 481/667] Optimizes CPU performance of the scheduler

---
 include/flexflow/request_manager.h |  30 +++++++--
 src/runtime/request_manager.cc     | 102 +++++++++++++++++++----------
 2 files changed, 94 insertions(+), 38 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b9aff165a..79852263b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -94,6 +94,16 @@ struct SharedTokenTreeNodePtrLess {
   }
 };
 
+// A comparator for std::pair<std::shared_ptr<TokenTreeNode>, double>
+// This is used to construct a max heap for the token tree nodes
+struct SharedTokenTreeNodePtrDoubleLess {
+  bool operator()(
+      std::pair<std::shared_ptr<TokenTreeNode>, double> const &lhs,
+      std::pair<std::shared_ptr<TokenTreeNode>, double> const &rhs) const {
+    return lhs.second < rhs.second;
+  }
+};
+
 class TokenTree {
 public:
   std::vector<std::vector<std::shared_ptr<TokenTreeNode>>> tree_layers = {};
@@ -196,14 +206,26 @@ struct Request {
   // 2. Committing phase after the target model verification
   StreamingCacheInfo streaming_cache_info;
 
-  std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                      std::vector<std::shared_ptr<TokenTreeNode>>,
-                      SharedTokenTreeNodePtrLess>
-      token_tree_nodes_pq;
+  std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, double>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
+      SharedTokenTreeNodePtrDoubleLess>
+      token_tree_nodes_acc_prob_pair_pq;
 
   double get_length_weight();
   void set_slo_ratio(double slo_ratio_);
   double get_slo_ratio();
+
+  Request() {
+    std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
+        _prealloc_vector;
+    _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+    token_tree_nodes_acc_prob_pair_pq = std::priority_queue<
+        std::pair<std::shared_ptr<TokenTreeNode>, double>,
+        std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
+        SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
+                                          std::move(_prealloc_vector));
+  }
 };
 
 class RequestManager {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 39e754a4f..54db94c3f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2615,20 +2615,23 @@ void RequestManager::add_tokens_to_spec_token_tree(
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();
+    std::vector<std::pair<double, int>> preallocated_vector;
+    preallocated_vector.reserve(get_max_tree_width());
     std::priority_queue<std::pair<double, int>,
                         std::vector<std::pair<double, int>>,
                         std::greater<std::pair<double, int>>>
-        child_probs_pq;
+        child_probs_pq(std::greater<std::pair<double, int>>(),
+                       std::move(preallocated_vector));
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
       double parent_log_prob = parent_ptr->log_accumulated_prob;
       int child_start_idx =
           result_offset +
           parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-      for (int child_pos = 0;
-           child_pos < BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
-           child_pos++) {
-        int result_idx = child_start_idx + child_pos;
+      for (int result_idx = child_start_idx;
+           result_idx <
+           child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+           result_idx++) {
         double log_prob = log((double)ssm_inference_result.probs[result_idx]);
         if (log_prob == -std::numeric_limits<double>::infinity()) {
           continue;
@@ -2639,6 +2642,7 @@ void RequestManager::add_tokens_to_spec_token_tree(
         }
 
         double accumulated_log_prob = log_prob + parent_log_prob;
+
         if (child_probs_pq.size() == get_max_tree_width() and
             accumulated_log_prob > child_probs_pq.top().first) {
           // The current layer is full, and the new token has a higher
@@ -2670,7 +2674,8 @@ void RequestManager::add_tokens_to_spec_token_tree(
           accumulated_log_prob,
           parent_pos);
       spec_token_tree.tree_layers.back().push_back(node_ptr);
-      request.token_tree_nodes_pq.push(node_ptr);
+      request.token_tree_nodes_acc_prob_pair_pq.push(
+          std::make_pair(node_ptr, accumulated_log_prob));
     }
   }
 }
@@ -2730,13 +2735,14 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
   double current_added = 1.0;
 
   while (budget > 0 and current_added < num_tokens_to_decode) {
-    if (request.token_tree_nodes_pq.empty()) {
+    if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
       break;
     }
-    auto node_ptr = request.token_tree_nodes_pq.top();
-    request.token_tree_nodes_pq.pop();
+    auto [node_ptr, log_acc_prob] =
+        request.token_tree_nodes_acc_prob_pair_pq.top();
+    request.token_tree_nodes_acc_prob_pair_pq.pop();
     node_ptr->included = true;
-    current_added += exp(node_ptr->log_accumulated_prob);
+    current_added += exp(log_acc_prob);
     budget--;
   }
 }
@@ -2744,11 +2750,15 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
 void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
+  std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>
+      global_token_tree_node_vector;
+  global_token_tree_node_vector.reserve(get_max_requests_per_batch());
   std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
       std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
       SharedTokenTreeNodePtrRequestGuidWeightedLess>
-      global_token_tree_node_pq;
+      global_token_tree_node_pq(SharedTokenTreeNodePtrRequestGuidWeightedLess(),
+                                std::move(global_token_tree_node_vector));
 
   // Initialie the priority queue with the top element in each request's token
   // tree
@@ -2760,12 +2770,13 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    if (request.token_tree_nodes_pq.empty()) {
+    if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
       continue;
     }
-    if (!request.token_tree_nodes_pq.empty()) {
-      global_token_tree_node_pq.push({request.token_tree_nodes_pq.top(), guid});
-      request.token_tree_nodes_pq.pop();
+    if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_acc_prob_pair_pq.top().first, guid});
+      request.token_tree_nodes_acc_prob_pair_pq.pop();
     }
   }
 
@@ -2774,10 +2785,14 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
     auto [node_ptr, guid] = global_token_tree_node_pq.top();
     global_token_tree_node_pq.pop();
     node_ptr->included = true;
-    if (!get_request_with_guid(guid).token_tree_nodes_pq.empty()) {
+    if (!get_request_with_guid(guid)
+             .token_tree_nodes_acc_prob_pair_pq.empty()) {
       global_token_tree_node_pq.push(
-          {get_request_with_guid(guid).token_tree_nodes_pq.top(), guid});
-      get_request_with_guid(guid).token_tree_nodes_pq.pop();
+          {get_request_with_guid(guid)
+               .token_tree_nodes_acc_prob_pair_pq.top()
+               .first,
+           guid});
+      get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop();
     }
     budget--;
   }
@@ -2791,21 +2806,30 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                        std::vector<std::shared_ptr<TokenTreeNode>>,
-                        SharedTokenTreeNodePtrLess>()
-        .swap(request.token_tree_nodes_pq);
+    std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
+        _prealloc_vector;
+    _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+    std::priority_queue<
+        std::pair<std::shared_ptr<TokenTreeNode>, double>,
+        std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
+        SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
+                                          std::move(_prealloc_vector))
+        .swap(request.token_tree_nodes_acc_prob_pair_pq);
   }
 }
 
 void RequestManager::add_tokens_toward_goodput(int budget) {
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
+  std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>
+      global_token_tree_node_vector;
+  global_token_tree_node_vector.reserve(get_max_requests_per_batch());
   std::priority_queue<
       std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
       std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
       SharedTokenTreeNodePtrRequestGuidLess>
-      global_token_tree_node_pq;
+      global_token_tree_node_pq(SharedTokenTreeNodePtrRequestGuidLess(),
+                                std::move(global_token_tree_node_vector));
 
   // Initialie the priority queue with the top element in each request's token
   // tree
@@ -2817,12 +2841,13 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    if (request.token_tree_nodes_pq.empty()) {
+    if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
       continue;
     }
-    if (!request.token_tree_nodes_pq.empty()) {
-      global_token_tree_node_pq.push({request.token_tree_nodes_pq.top(), guid});
-      request.token_tree_nodes_pq.pop();
+    if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) {
+      global_token_tree_node_pq.push(
+          {request.token_tree_nodes_acc_prob_pair_pq.top().first, guid});
+      request.token_tree_nodes_acc_prob_pair_pq.pop();
     }
   }
 
@@ -2833,10 +2858,14 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     node_ptr->included = true;
     // TODO: REMOVE THIS OUTPUT
     // std::cout << node_ptr->log_accumulated_prob << std::endl;
-    if (!get_request_with_guid(guid).token_tree_nodes_pq.empty()) {
+    if (!get_request_with_guid(guid)
+             .token_tree_nodes_acc_prob_pair_pq.empty()) {
       global_token_tree_node_pq.push(
-          {get_request_with_guid(guid).token_tree_nodes_pq.top(), guid});
-      get_request_with_guid(guid).token_tree_nodes_pq.pop();
+          {get_request_with_guid(guid)
+               .token_tree_nodes_acc_prob_pair_pq.top()
+               .first,
+           guid});
+      get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop();
     }
     budget--;
   }
@@ -2850,10 +2879,15 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    std::priority_queue<std::shared_ptr<TokenTreeNode>,
-                        std::vector<std::shared_ptr<TokenTreeNode>>,
-                        SharedTokenTreeNodePtrLess>()
-        .swap(request.token_tree_nodes_pq);
+    std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
+        _prealloc_vector;
+    _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+    std::priority_queue<
+        std::pair<std::shared_ptr<TokenTreeNode>, double>,
+        std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
+        SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
+                                          std::move(_prealloc_vector))
+        .swap(request.token_tree_nodes_acc_prob_pair_pq);
   }
 }
 

From fa13afaff7f77e1f69c0bc519bf9cf1de12e5877 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 08:09:56 -0700
Subject: [PATCH 482/667] chore: incr decode add slo attainment

---
 src/runtime/request_manager.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54db94c3f..231444579 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -884,6 +884,10 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   bool request_completed = false;
   int nb_requests_decoded = 0;
+  long long int current_time = Realm::Clock::current_time_in_microseconds();
+  profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) *
+                                     1e-3);
+
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -902,12 +906,20 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     request.tokens.push_back(
         result.token_ids[request.first_token_offset_in_batch]);
 
+    request.decode_latency_ms =
+        (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
     profiling_requests[guid].llm_decoding_steps++;
     nb_requests_decoded++;
     if (request.tokens.back() == eos_token_id or
         request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
       request_complete_clean_up(request_index, true);
+    } else if (slo_violation_early_termination and
+               request.decode_latency_ms >
+                   get_request_expected_latency(request)) {
+      // The request violates the SLO, drop that request
+      request_completed = true;
+      request_complete_clean_up(request_index, false);
     }
 
     if (verbose) {
@@ -916,10 +928,6 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
                 << output << std::endl;
     }
   }
-  profiling.llm_step_times.push_back(
-      (Realm::Clock::current_time_in_microseconds() -
-       profiling.llm_step_start) *
-      1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
   profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
   return request_completed;

From 86f95dcb89ca2265d828d050a90d8e74287d1428 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 08:38:13 -0700
Subject: [PATCH 483/667] Optimized some usage of priority queues.

---
 include/flexflow/request_manager.h |  9 +++--
 src/runtime/request_manager.cc     | 59 ++++++++++++------------------
 2 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 79852263b..b0368249f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -361,11 +361,12 @@ class RequestManager {
         std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
         const;
   };
-  struct SharedTokenTreeNodePtrRequestGuidLess {
+  struct SharedTokenTreeNodePtrDoubleRequestGuidLess {
     bool operator()(
-        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
-        std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs)
-        const;
+        std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid> const
+            &lhs,
+        std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid> const
+            &rhs) const;
   };
 
 private:
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 54db94c3f..67db63deb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -327,14 +327,11 @@ bool RequestManager::SharedTokenTreeNodePtrRequestGuidWeightedLess::operator()(
                  .get_length_weight();
 }
 
-bool RequestManager::SharedTokenTreeNodePtrRequestGuidLess ::operator()(
-    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &lhs,
-    std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid> const &rhs) const {
-  if (lhs.first->gumbel) {
-    assert(rhs.first->gumbel);
-    return lhs.first->gumbel_logit < rhs.first->gumbel_logit;
-  }
-  return lhs.first->log_accumulated_prob < rhs.first->log_accumulated_prob;
+bool RequestManager::SharedTokenTreeNodePtrDoubleRequestGuidLess ::operator()(
+    std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid> const &lhs,
+    std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid> const &rhs)
+    const {
+  return std::get<1>(lhs) < std::get<1>(rhs);
 }
 
 void RequestManager::register_tokenizer(ModelType type,
@@ -1403,9 +1400,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     std::cout
         << "\n############### prepare_verify_batch_config ###############\n";
   }
-  // TODO: REMOVE THIS OUTPUT
-  //   std::cout
-  //       << "\n############### prepare_verify_batch_config ###############\n";
   // This method does the following:
   // 1. Commit the verified tokens in the last iteration through the
   // BatchConfig. We can do this request by request.
@@ -1489,14 +1483,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
     request.num_tokens_in_batch = token_tree_index;
 
-    // TODO: REMOVE THIS OUTPUT
-    // std::cout << "Request " << request_index << " Guid " << guid
-    //           << " token tree size: " << request.num_tokens_in_batch
-    //           << std::endl;
-    // std::cout << "Request " << guid << " token tree: " << std::endl;
-    // std::cout << request.speculative_token_trees[0];
-    // std::cout << std::endl;
-
     // Create the causal mask for the large model based on the small model
     // causal mask.
     new_bc.causalMask[request_index] = create_llm_bitmask(guid);
@@ -2686,6 +2672,7 @@ void RequestManager::prune_token_tree() {
   assert(budget >= 0);
 
   std::vector<std::pair<double, int>> spare_latency_2_request_index;
+  spare_latency_2_request_index.reserve(get_max_requests_per_batch());
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -2713,8 +2700,6 @@ void RequestManager::prune_token_tree() {
   }
 
   assert(budget >= 0);
-  // TODO: REMOVE THIS OUTPUT
-  //   std::cout << "Budget: " << budget << std::endl;
   if (budget > 0) {
     if (memory_occupancy) {
       add_tokens_toward_memory_occupancy(budget);
@@ -2809,26 +2794,26 @@ void RequestManager::add_tokens_toward_memory_occupancy(int budget) {
     std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
         _prealloc_vector;
     _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
-    std::priority_queue<
+    request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue<
         std::pair<std::shared_ptr<TokenTreeNode>, double>,
         std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
         SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
-                                          std::move(_prealloc_vector))
-        .swap(request.token_tree_nodes_acc_prob_pair_pq);
+                                          std::move(_prealloc_vector));
   }
 }
 
 void RequestManager::add_tokens_toward_goodput(int budget) {
   // This is a helper data structure to store help the pruning of the token
   // trees across different requests.
-  std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>
+  std::vector<std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid>>
       global_token_tree_node_vector;
   global_token_tree_node_vector.reserve(get_max_requests_per_batch());
   std::priority_queue<
-      std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>,
-      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, RequestGuid>>,
-      SharedTokenTreeNodePtrRequestGuidLess>
-      global_token_tree_node_pq(SharedTokenTreeNodePtrRequestGuidLess(),
+      std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid>,
+      std::vector<
+          std::tuple<std::shared_ptr<TokenTreeNode>, double, RequestGuid>>,
+      SharedTokenTreeNodePtrDoubleRequestGuidLess>
+      global_token_tree_node_pq(SharedTokenTreeNodePtrDoubleRequestGuidLess(),
                                 std::move(global_token_tree_node_vector));
 
   // Initialie the priority queue with the top element in each request's token
@@ -2846,24 +2831,27 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     }
     if (!request.token_tree_nodes_acc_prob_pair_pq.empty()) {
       global_token_tree_node_pq.push(
-          {request.token_tree_nodes_acc_prob_pair_pq.top().first, guid});
+          {request.token_tree_nodes_acc_prob_pair_pq.top().first,
+           request.token_tree_nodes_acc_prob_pair_pq.top().second,
+           guid});
       request.token_tree_nodes_acc_prob_pair_pq.pop();
     }
   }
 
   // Perform dequeue and enqueue until the budget is used up
   while (budget > 0 and !global_token_tree_node_pq.empty()) {
-    auto [node_ptr, guid] = global_token_tree_node_pq.top();
+    auto [node_ptr, acc_log_prob, guid] = global_token_tree_node_pq.top();
     global_token_tree_node_pq.pop();
     node_ptr->included = true;
-    // TODO: REMOVE THIS OUTPUT
-    // std::cout << node_ptr->log_accumulated_prob << std::endl;
     if (!get_request_with_guid(guid)
              .token_tree_nodes_acc_prob_pair_pq.empty()) {
       global_token_tree_node_pq.push(
           {get_request_with_guid(guid)
                .token_tree_nodes_acc_prob_pair_pq.top()
                .first,
+           get_request_with_guid(guid)
+               .token_tree_nodes_acc_prob_pair_pq.top()
+               .second,
            guid});
       get_request_with_guid(guid).token_tree_nodes_acc_prob_pair_pq.pop();
     }
@@ -2882,12 +2870,11 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
     std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
         _prealloc_vector;
     _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
-    std::priority_queue<
+    request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue<
         std::pair<std::shared_ptr<TokenTreeNode>, double>,
         std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
         SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
-                                          std::move(_prealloc_vector))
-        .swap(request.token_tree_nodes_acc_prob_pair_pq);
+                                          std::move(_prealloc_vector));
   }
 }
 

From f1698125ffe122c5817a1206f8edcc2867b5bb65 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 12:23:44 -0700
Subject: [PATCH 484/667] feat: support slo ratio sampling

---
 include/flexflow/inference.h             | 28 ++++++++++-----
 inference/incr_decoding/incr_decoding.cc | 38 +++++++++++++++------
 inference/spec_infer/spec_infer.cc       | 43 ++++++++++++++++--------
 src/c/flexflow_c.cc                      |  4 ++-
 src/runtime/inference_manager.cc         | 14 ++++++++
 src/runtime/request_manager.cc           | 11 ++++--
 6 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 1871406b9..273019208 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -66,21 +66,32 @@ class EmissionMachine {
   EmissionMode mode;
   double last_request_time_ms;
   double req_per_s;
-
-  EmissionMachine(EmissionMode mode_, double req_per_s_)
-      : mode(mode_), last_request_time_ms(0), req_per_s(req_per_s_) {}
+  std::vector<std::pair<double, double>> slo_ratios;
+
+  EmissionMachine(EmissionMode mode_,
+                  double req_per_s_,
+                  std::vector<std::pair<double, double>> slo_ratios_)
+      : mode(mode_), last_request_time_ms(0), req_per_s(req_per_s_),
+        slo_ratios(slo_ratios_) {
+    // cumulate the slo ratios for sampling
+    for (size_t i = 1; i < slo_ratios.size(); i++) {
+      slo_ratios[i].second += slo_ratios[i - 1].second;
+    }
+  }
   void wait_until_next_request();
 
   // Simulate next request arrival time
   virtual double get_next_interval_ms() = 0;
+  double sample_slo_ratio();
 };
 
 class ConstantEmissionMachine : public EmissionMachine {
 public:
   double interval_ms;
 
-  ConstantEmissionMachine(double req_per_s_)
-      : EmissionMachine(EmissionMode::Constant, req_per_s_),
+  ConstantEmissionMachine(double req_per_s_,
+                          std::vector<std::pair<double, double>> slo_ratios_)
+      : EmissionMachine(EmissionMode::Constant, req_per_s_, slo_ratios_),
         interval_ms(req_per_s_ > 0 ? 1e3 / req_per_s_ : 0) {}
 
   double get_next_interval_ms() override;
@@ -90,9 +101,10 @@ class PoissonEmissionMachine : public EmissionMachine {
 public:
   double lambda;
 
-  PoissonEmissionMachine(double req_per_s_)
-      : EmissionMachine(EmissionMode::Poisson, req_per_s_), lambda(req_per_s_) {
-  }
+  PoissonEmissionMachine(double req_per_s_,
+                         std::vector<std::pair<double, double>> slo_ratios_)
+      : EmissionMachine(EmissionMode::Poisson, req_per_s_, slo_ratios_),
+        lambda(req_per_s_) {}
 
   double get_next_interval_ms() override;
 };
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 61ef7e22e..98b18b6da 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -20,6 +20,7 @@
 #include "models/mpt.h"
 #include "models/opt.h"
 #include "models/starcoder.h"
+#include <cassert>
 #include <wordexp.h>
 
 #include <nlohmann/json.hpp>
@@ -300,7 +301,6 @@ void FlexFlow::top_level_task(Task const *task,
 
   rm->start_background_server(&model);
 
-  int total_num_requests = 0;
   {
     using json = nlohmann::json;
     std::ifstream file_handle(file_paths.prompt_file_path);
@@ -309,18 +309,34 @@ void FlexFlow::top_level_task(Task const *task,
                                    /*parser_callback_t */ nullptr,
                                    /*allow_exceptions */ true,
                                    /*ignore_comments */ true);
+
+    // Parse slo_ratios
+    std::vector<std::pair<double, double>> slo_ratios;
+    if (prompt_json[0].contains("slo_ratios")) {
+      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
+        slo_ratios.emplace_back(std::stod(key), value.get<double>());
+      }
+    }
+    double total =
+        std::accumulate(slo_ratios.begin(),
+                        slo_ratios.end(),
+                        0.0,
+                        [](double sum, std::pair<double, double> const &pair) {
+                          return sum + pair.second;
+                        });
+    if (std::abs(total - 1.0) > 1e-6) {
+      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
+                << total << std::endl;
+      assert(false);
+    }
+
     std::vector<GenerationRequest> requests;
-    for (auto &prompt : prompt_json) {
-      std::string text = prompt["prompt"].get<std::string>();
-      double slo_ratio = prompt["slo_ratio"].get<double>();
-      printf("Prompt[%d] with slo %.3f: %s\n",
-             total_num_requests,
-             slo_ratio,
-             text.c_str());
-      total_num_requests++;
-      requests.push_back(GenerationRequest(text, slo_ratio));
+    for (size_t i = 1; i < prompt_json.size(); ++i) {
+      requests.push_back(
+          GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
     }
-    PoissonEmissionMachine emission_machine(1.0);
+    // PoissonEmissionMachine emission_machine(1.0, slo_ratios);
+    ConstantEmissionMachine emission_machine(-1, slo_ratios);
     std::vector<GenerationResult> result =
         model.generate(requests, emission_machine);
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 254d58af0..0f3e7f247 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -18,6 +18,7 @@
 #include "models/llama.h"
 #include "models/mpt.h"
 #include "models/opt.h"
+#include <cassert>
 #include <filesystem>
 #include <nlohmann/json.hpp>
 #include <wordexp.h>
@@ -505,7 +506,6 @@ void FlexFlow::top_level_task(Task const *task,
   rm->start_background_server(&tree_model);
 
   // Register requests from prompt file
-  int total_num_requests = 0;
   {
     using json = nlohmann::json;
     std::ifstream file_handle(file_paths.prompt_file_path);
@@ -515,20 +515,35 @@ void FlexFlow::top_level_task(Task const *task,
                                    /*allow_exceptions */ true,
                                    /*ignore_comments */ true);
 
+    // Parse slo_ratios
+    std::vector<std::pair<double, double>> slo_ratios;
+    if (prompt_json[0].contains("slo_ratios")) {
+      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
+        slo_ratios.emplace_back(std::stod(key), value.get<double>());
+      }
+    }
+    double total =
+        std::accumulate(slo_ratios.begin(),
+                        slo_ratios.end(),
+                        0.0,
+                        [](double sum, std::pair<double, double> const &pair) {
+                          return sum + pair.second;
+                        });
+    if (std::abs(total - 1.0) > 1e-6) {
+      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
+                << total << std::endl;
+      assert(false);
+    }
+
     std::vector<GenerationRequest> requests;
-    for (auto &prompt : prompt_json) {
-      std::string text = prompt["prompt"].get<std::string>();
-      double slo_ratio = prompt["slo_ratio"].get<double>();
-      printf("Prompt[%d] with slo %.3f: %s\n",
-             total_num_requests,
-             slo_ratio,
-             text.c_str());
-      total_num_requests++;
-      requests.push_back(GenerationRequest(text, slo_ratio));
-    }
-    // PoissonEmissionMachine emission_machine(1.0);
-    ConstantEmissionMachine emission_machine(-1);
-    tree_model.generate(requests, emission_machine);
+    for (size_t i = 1; i < prompt_json.size(); ++i) {
+      requests.push_back(
+          GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
+    }
+    // PoissonEmissionMachine emission_machine(1.0, slo_ratios);
+    ConstantEmissionMachine emission_machine(-1, slo_ratios);
+    std::vector<GenerationResult> result =
+        tree_model.generate(requests, emission_machine);
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index e0bdf31e1..bba5a3882 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -18,6 +18,7 @@
 #include "flexflow/mapper.h"
 #include "flexflow/request_manager.h"
 #include "flexflow/utils/file_loader.h"
+#include <vector>
 
 using namespace Legion;
 using namespace FlexFlow;
@@ -1606,7 +1607,8 @@ void flexflow_model_generate(flexflow_model_t handle_,
                 text_str.c_str(),
                 max_seq_length);
   }
-  ConstantEmissionMachine emission_machine(1.0);
+  std::vector<std::pair<double, double>> slo_ratios = {std::pair(10.0, 1.0)};
+  ConstantEmissionMachine emission_machine(1.0, slo_ratios);
   std::vector<GenerationResult> results =
       handle->generate(prompts, emission_machine);
   // If the prompt exceeds max seq len, check that we return the prompt with no
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 4fd5c4846..b2a231b8f 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -707,4 +707,18 @@ double PoissonEmissionMachine::get_next_interval_ms() {
   static std::exponential_distribution<double> distribution(lambda);
   return distribution(generator) * 1e3;
 }
+
+double EmissionMachine::sample_slo_ratio() {
+  static std::default_random_engine generator(
+      std::chrono::system_clock::now().time_since_epoch().count());
+  static std::uniform_real_distribution<double> distribution(0.0, 1.0);
+  double r = distribution(generator);
+
+  for (auto const &pair : slo_ratios) {
+    if (r < pair.second) {
+      return pair.first;
+    }
+  }
+  return slo_ratios.back().first;
+}
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 231444579..8c87ce638 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2158,8 +2158,13 @@ std::vector<GenerationResult>
     std::this_thread::sleep_for(std::chrono::milliseconds(100));
   }
 
-  for (GenerationRequest &request : requests) {
-    RequestManager::RequestGuid guid = rm->register_new_request(request);
+  for (size_t i = 0; i < requests.size(); i++) {
+    requests[i].slo_ratio = emission_machine.sample_slo_ratio();
+    printf("Prompt[%ld] with slo %.3f: %s\n",
+           i,
+           requests[i].slo_ratio,
+           requests[i].prompt.c_str());
+    RequestManager::RequestGuid guid = rm->register_new_request(requests[i]);
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
     }
@@ -2177,7 +2182,7 @@ std::vector<GenerationResult>
                       EmissionMachine &emission_machine) {
   std::vector<GenerationRequest> requests;
   for (std::string &prompt : prompts) {
-    requests.push_back(GenerationRequest(prompt, 1.0));
+    requests.push_back(GenerationRequest(prompt, -1.0));
   }
   return generate(requests, emission_machine);
 }

From 7ae7edd57e155843dd9ccffc1aea616aa50d1025 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 17:19:51 -0700
Subject: [PATCH 485/667] fix: incr_decode doesn'y have slo attainment metric

---
 inference/incr_decoding/incr_decoding.cc | 36 ++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 98b18b6da..8e2a11cd3 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -52,7 +52,11 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &sampling_seed,
-                      bool &streaming_cache) {
+                      bool &streaming_cache,
+                      bool &slo_attainment_early_termination,
+                      int &baseline_latency_ms,
+                      int &ssm_spec_latency_ms,
+                      int &llm_verify_latency_ms) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -126,6 +130,22 @@ void parse_input_args(char **argv,
       streaming_cache = true;
       continue;
     }
+    if (!strcmp(argv[i], "--slo-attainment-early-termination")) {
+      slo_attainment_early_termination = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--baseline-latency-ms")) {
+      baseline_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--ssm-spec-latency-ms")) {
+      ssm_spec_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--llm-verify-latency-ms")) {
+      llm_verify_latency_ms = std::stoi(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -163,6 +183,10 @@ void FlexFlow::top_level_task(Task const *task,
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
   bool streaming_cache = false;
+  bool slo_attainment_early_termination = false;
+  int baseline_latency_ms = 50;
+  int ssm_spec_latency_ms = 20;
+  int llm_verify_latency_ms = 50;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -182,7 +206,11 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_prefilling_batch,
                    max_sequence_length,
                    sampling_seed,
-                   streaming_cache);
+                   streaming_cache,
+                   slo_attainment_early_termination,
+                   baseline_latency_ms,
+                   ssm_spec_latency_ms,
+                   llm_verify_latency_ms);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -252,6 +280,10 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_decoding_mode(decoding_mode);
+  rm->set_slo_violation_early_termination(slo_attainment_early_termination);
+  rm->set_baseline_latency(baseline_latency_ms);
+  rm->set_ssm_spec_latency(ssm_spec_latency_ms);
+  rm->set_llm_verify_latency(llm_verify_latency_ms);
   rm->set_max_tree_depth(8);
   rm->set_max_tree_width(16);
   rm->set_verbose(verbose);

From ff3af26b863072c68eedd29ef582e6bef3e536f2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 17:36:22 -0700
Subject: [PATCH 486/667] feat: support early_drop switch

---
 include/flexflow/request_manager.h |  5 ++--
 src/runtime/request_manager.cc     | 38 ++++++++++++++++++------------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b0368249f..af8a80ee3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -138,7 +138,7 @@ struct Request {
   double decode_latency_ms = 0.0;
   int ssm_prefill_len = 0;
   int llm_prefill_len = 0;
-  bool attained = false;
+  bool attained = true;
 
   int first_token_offset_in_batch = 0;
   int num_tokens_in_batch = 0;
@@ -463,7 +463,8 @@ class RequestManager {
   std::unordered_map<RequestGuid, RequestProfileInfo> profiling_requests;
   double total_request_run_time;
   bool load_pending_request_to_batch();
-  void request_complete_clean_up(int batch_index, bool attained);
+  void request_update_attainment(int index, bool attained);
+  void request_complete_clean_up(int batch_index);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 3e4bc6043..1b57bb24e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -607,7 +607,12 @@ bool RequestManager::load_pending_request_to_batch() {
   return true;
 }
 
-void RequestManager::request_complete_clean_up(int batch_index, bool attained) {
+void RequestManager::request_update_attainment(int batch_index, bool attained) {
+  Request &request = all_requests[guid_of_requests[batch_index]];
+  request.attained &= attained;
+}
+
+void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
   profiling_requests[guid].finish_time =
       Realm::Clock::current_time_in_microseconds();
@@ -616,7 +621,6 @@ void RequestManager::request_complete_clean_up(int batch_index, bool attained) {
   request_available[batch_index] = false;
   num_available_requests--;
   request.status = Request::COMPLETED;
-  request.attained = attained;
 
   // Find the sos and eos in the sequence
   auto bos_it = std::find(
@@ -849,7 +853,7 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
             result.token_ids[num_tokens + request->num_tokens_in_batch - 1]);
 
         if (request->tokens.back() == eos_token_id) {
-          request_complete_clean_up(request->batch_index, true);
+          request_complete_clean_up(request->batch_index);
         }
 
         if (decoding_mode == SPECULATIVE_DECODING) {
@@ -910,13 +914,15 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     if (request.tokens.back() == eos_token_id or
         request.tokens.size() >= get_max_sequence_length()) {
       request_completed = true;
-      request_complete_clean_up(request_index, true);
-    } else if (slo_violation_early_termination and
-               request.decode_latency_ms >
-                   get_request_expected_latency(request)) {
+      request_complete_clean_up(request_index);
+    } else if (request.decode_latency_ms >
+               get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
-      request_completed = true;
-      request_complete_clean_up(request_index, false);
+      request_update_attainment(request_index, false);
+      if (slo_violation_early_termination) {
+        request_completed = true;
+        request_complete_clean_up(request_index);
+      }
     }
 
     if (verbose) {
@@ -1592,13 +1598,15 @@ bool RequestManager::update_llm_verify_results(
     if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
       request_completed = true;
-      request_complete_clean_up(request_index, true);
-    } else if (slo_violation_early_termination and
-               request.decode_latency_ms >
-                   get_request_expected_latency(request)) {
+      request_complete_clean_up(request_index);
+    } else if (request.decode_latency_ms >
+               get_request_expected_latency(request)) {
       // The request violates the SLO, drop that request
-      request_completed = true;
-      request_complete_clean_up(request_index, false);
+      request_update_attainment(request_index, false);
+      if (slo_violation_early_termination) {
+        request_completed = true;
+        request_complete_clean_up(request_index);
+      }
     } else {
       update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
     }

From 1a1dc569d554e31e294cb75efa4b187eb0b34345 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 18 Sep 2024 21:53:09 -0700
Subject: [PATCH 487/667] chore: add request_per_second param

---
 inference/incr_decoding/incr_decoding.cc | 15 +++++++++++----
 inference/spec_infer/spec_infer.cc       | 15 +++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 8e2a11cd3..274db6286 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -56,7 +56,8 @@ void parse_input_args(char **argv,
                       bool &slo_attainment_early_termination,
                       int &baseline_latency_ms,
                       int &ssm_spec_latency_ms,
-                      int &llm_verify_latency_ms) {
+                      int &llm_verify_latency_ms,
+                      double &request_per_second) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -146,6 +147,10 @@ void parse_input_args(char **argv,
       llm_verify_latency_ms = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--request-per-second")) {
+      request_per_second = std::stod(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -187,6 +192,7 @@ void FlexFlow::top_level_task(Task const *task,
   int baseline_latency_ms = 50;
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
+  double request_per_second = 1.0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -210,7 +216,8 @@ void FlexFlow::top_level_task(Task const *task,
                    slo_attainment_early_termination,
                    baseline_latency_ms,
                    ssm_spec_latency_ms,
-                   llm_verify_latency_ms);
+                   llm_verify_latency_ms,
+                   request_per_second);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -367,8 +374,8 @@ void FlexFlow::top_level_task(Task const *task,
       requests.push_back(
           GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
     }
-    // PoissonEmissionMachine emission_machine(1.0, slo_ratios);
-    ConstantEmissionMachine emission_machine(-1, slo_ratios);
+    PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
+    // ConstantEmissionMachine emission_machine(-1, slo_ratios);
     std::vector<GenerationResult> result =
         model.generate(requests, emission_machine);
   }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 0f3e7f247..5d7f0bb1b 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -76,7 +76,8 @@ void parse_input_args(char **argv,
                       bool &slo_attainment_early_termination,
                       int &baseline_latency_ms,
                       int &ssm_spec_latency_ms,
-                      int &llm_verify_latency_ms) {
+                      int &llm_verify_latency_ms,
+                      double &request_per_second) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -184,6 +185,10 @@ void parse_input_args(char **argv,
       llm_verify_latency_ms = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--request-per-second")) {
+      request_per_second = std::stod(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -354,6 +359,7 @@ void FlexFlow::top_level_task(Task const *task,
   int baseline_latency_ms = 50;
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
+  double request_per_second = 1.0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -379,7 +385,8 @@ void FlexFlow::top_level_task(Task const *task,
                    slo_attainment_early_termination,
                    baseline_latency_ms,
                    ssm_spec_latency_ms,
-                   llm_verify_latency_ms);
+                   llm_verify_latency_ms,
+                   request_per_second);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -540,8 +547,8 @@ void FlexFlow::top_level_task(Task const *task,
       requests.push_back(
           GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
     }
-    // PoissonEmissionMachine emission_machine(1.0, slo_ratios);
-    ConstantEmissionMachine emission_machine(-1, slo_ratios);
+    PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
+    // ConstantEmissionMachine emission_machine(-1, slo_ratios);
     std::vector<GenerationResult> result =
         tree_model.generate(requests, emission_machine);
   }

From 9f034a4e1d4a54cd6bc56186de1bdde5cd356a51 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 19 Sep 2024 12:57:52 -0700
Subject: [PATCH 488/667] chore: change early drop logic

---
 src/runtime/request_manager.cc | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1b57bb24e..349d647b4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -909,20 +909,20 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
 
     request.decode_latency_ms =
         (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
+    bool attained =
+        request.decode_latency_ms <= get_request_expected_latency(request);
     profiling_requests[guid].llm_decoding_steps++;
     nb_requests_decoded++;
     if (request.tokens.back() == eos_token_id or
         request.tokens.size() >= get_max_sequence_length()) {
+      request_update_attainment(request_index, attained);
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    } else if (!attained and slo_violation_early_termination) {
+      // Early drop that request
+      request_update_attainment(request_index, attained);
       request_completed = true;
       request_complete_clean_up(request_index);
-    } else if (request.decode_latency_ms >
-               get_request_expected_latency(request)) {
-      // The request violates the SLO, drop that request
-      request_update_attainment(request_index, false);
-      if (slo_violation_early_termination) {
-        request_completed = true;
-        request_complete_clean_up(request_index);
-      }
     }
 
     if (verbose) {
@@ -1578,6 +1578,8 @@ bool RequestManager::update_llm_verify_results(
 
     request.decode_latency_ms =
         (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
+    bool attained =
+        request.decode_latency_ms <= get_request_expected_latency(request);
 
     // Initialize the token tree for the request
     init_token_tree(guid);
@@ -1597,16 +1599,14 @@ bool RequestManager::update_llm_verify_results(
     }
     if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
+      request_update_attainment(request_index, attained);
+      request_completed = true;
+      request_complete_clean_up(request_index);
+    } else if (!attained and slo_violation_early_termination) {
+      // Early drop that request
+      request_update_attainment(request_index, attained);
       request_completed = true;
       request_complete_clean_up(request_index);
-    } else if (request.decode_latency_ms >
-               get_request_expected_latency(request)) {
-      // The request violates the SLO, drop that request
-      request_update_attainment(request_index, false);
-      if (slo_violation_early_termination) {
-        request_completed = true;
-        request_complete_clean_up(request_index);
-      }
     } else {
       update_bitmask_prompt(guid, request.committed_tokens.size() - 1);
     }

From fe55382161d1c38ffe599dcb6a6073ae9d0a3271 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 20 Sep 2024 15:30:48 -0700
Subject: [PATCH 489/667] feat: add emission output

---
 include/flexflow/inference.h             | 16 +++++++++----
 inference/incr_decoding/incr_decoding.cc | 29 +++++++++++++++++++----
 inference/spec_infer/spec_infer.cc       | 30 ++++++++++++++++++++----
 src/runtime/inference_manager.cc         |  5 ++++
 src/runtime/request_manager.cc           |  5 +++-
 5 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 273019208..282b9f5c7 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -43,9 +43,13 @@ struct GenerationConfig {
 struct GenerationRequest {
   std::string prompt;
   double slo_ratio;
+  double emission_time_ms;
 
-  GenerationRequest(std::string const &prompt_, double slo_ratio_)
-      : prompt(prompt_), slo_ratio(slo_ratio_) {}
+  GenerationRequest(std::string const &prompt_,
+                    double slo_ratio_,
+                    double emission_time_ms_)
+      : prompt(prompt_), slo_ratio(slo_ratio_),
+        emission_time_ms(emission_time_ms_) {}
 };
 
 struct GenerationResult {
@@ -56,6 +60,8 @@ struct GenerationResult {
   std::string output_text;
   std::vector<TokenId> input_tokens;
   std::vector<TokenId> output_tokens;
+  double slo_ratio;
+  double emission_time_ms;
 };
 
 // Contains the configuration for how to emit requests to the server,
@@ -64,6 +70,7 @@ class EmissionMachine {
 public:
   enum class EmissionMode { Constant, Poisson, Trace };
   EmissionMode mode;
+  double elapsed_time_ms;
   double last_request_time_ms;
   double req_per_s;
   std::vector<std::pair<double, double>> slo_ratios;
@@ -71,8 +78,8 @@ class EmissionMachine {
   EmissionMachine(EmissionMode mode_,
                   double req_per_s_,
                   std::vector<std::pair<double, double>> slo_ratios_)
-      : mode(mode_), last_request_time_ms(0), req_per_s(req_per_s_),
-        slo_ratios(slo_ratios_) {
+      : mode(mode_), elapsed_time_ms(0), last_request_time_ms(0),
+        req_per_s(req_per_s_), slo_ratios(slo_ratios_) {
     // cumulate the slo ratios for sampling
     for (size_t i = 1; i < slo_ratios.size(); i++) {
       slo_ratios[i].second += slo_ratios[i - 1].second;
@@ -83,6 +90,7 @@ class EmissionMachine {
   // Simulate next request arrival time
   virtual double get_next_interval_ms() = 0;
   double sample_slo_ratio();
+  double get_elapsed_time_ms();
 };
 
 class ConstantEmissionMachine : public EmissionMachine {
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 274db6286..fb203fd9b 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -57,7 +57,8 @@ void parse_input_args(char **argv,
                       int &baseline_latency_ms,
                       int &ssm_spec_latency_ms,
                       int &llm_verify_latency_ms,
-                      double &request_per_second) {
+                      double &request_per_second,
+                      std::string &emission_file_path) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -151,6 +152,10 @@ void parse_input_args(char **argv,
       request_per_second = std::stod(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--emission-file-path")) {
+      emission_file_path = std::string(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -193,6 +198,7 @@ void FlexFlow::top_level_task(Task const *task,
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
+  std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -217,7 +223,8 @@ void FlexFlow::top_level_task(Task const *task,
                    baseline_latency_ms,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
-                   request_per_second);
+                   request_per_second,
+                   emission_file_path);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -371,13 +378,27 @@ void FlexFlow::top_level_task(Task const *task,
 
     std::vector<GenerationRequest> requests;
     for (size_t i = 1; i < prompt_json.size(); ++i) {
-      requests.push_back(
-          GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
+      requests.push_back(GenerationRequest(
+          prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
     }
     PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
     // ConstantEmissionMachine emission_machine(-1, slo_ratios);
     std::vector<GenerationResult> result =
         model.generate(requests, emission_machine);
+
+    // output generation results as json
+    if (!emission_file_path.empty()) {
+      json output_json;
+      for (size_t i = 0; i < result.size(); ++i) {
+        json result_json;
+        result_json["prompt"] = requests[i].prompt;
+        result_json["slo_ratio"] = result[i].slo_ratio;
+        result_json["emission_time_ms"] = result[i].emission_time_ms;
+        output_json.push_back(result_json);
+      }
+      std::ofstream emission_file_handle(emission_file_path);
+      emission_file_handle << output_json.dump(2) << std::endl;
+    }
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 5d7f0bb1b..c1315fd48 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <filesystem>
 #include <nlohmann/json.hpp>
+#include <string>
 #include <wordexp.h>
 
 using namespace FlexFlow;
@@ -77,7 +78,8 @@ void parse_input_args(char **argv,
                       int &baseline_latency_ms,
                       int &ssm_spec_latency_ms,
                       int &llm_verify_latency_ms,
-                      double &request_per_second) {
+                      double &request_per_second,
+                      std::string &emission_file_path) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -189,6 +191,10 @@ void parse_input_args(char **argv,
       request_per_second = std::stod(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--emission-file-path")) {
+      emission_file_path = std::string(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -360,6 +366,7 @@ void FlexFlow::top_level_task(Task const *task,
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
+  std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -386,7 +393,8 @@ void FlexFlow::top_level_task(Task const *task,
                    baseline_latency_ms,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
-                   request_per_second);
+                   request_per_second,
+                   emission_file_path);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -544,13 +552,27 @@ void FlexFlow::top_level_task(Task const *task,
 
     std::vector<GenerationRequest> requests;
     for (size_t i = 1; i < prompt_json.size(); ++i) {
-      requests.push_back(
-          GenerationRequest(prompt_json[i]["prompt"].get<std::string>(), -1.0));
+      requests.push_back(GenerationRequest(
+          prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
     }
     PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
     // ConstantEmissionMachine emission_machine(-1, slo_ratios);
     std::vector<GenerationResult> result =
         tree_model.generate(requests, emission_machine);
+
+    // output generation results as json
+    if (!emission_file_path.empty()) {
+      json output_json;
+      for (size_t i = 0; i < result.size(); ++i) {
+        json result_json;
+        result_json["prompt"] = requests[i].prompt;
+        result_json["slo_ratio"] = result[i].slo_ratio;
+        result_json["emission_time_ms"] = result[i].emission_time_ms;
+        output_json.push_back(result_json);
+      }
+      std::ofstream emission_file_handle(emission_file_path);
+      emission_file_handle << output_json.dump(2) << std::endl;
+    }
   }
 
   // terminate the request manager by stopping the background thread
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index b2a231b8f..019a7cb3d 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -691,10 +691,15 @@ void EmissionMachine::wait_until_next_request() {
       get_next_interval_ms() - (current_time - last_request_time_ms);
   if (time_to_sleep > 0) {
     usleep(static_cast<useconds_t>(time_to_sleep * 1e3));
+    elapsed_time_ms += time_to_sleep;
   }
   last_request_time_ms = Realm::Clock::current_time_in_microseconds() * 1e-3;
 }
 
+double EmissionMachine::get_elapsed_time_ms() {
+  return elapsed_time_ms;
+}
+
 double ConstantEmissionMachine::get_next_interval_ms() {
   return interval_ms;
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 349d647b4..44a067dcd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -447,6 +447,8 @@ RequestManager::RequestGuid
   gr.input_tokens = request.tokens;
   gr.output_text = req.prompt;
   gr.output_tokens = request.tokens;
+  gr.slo_ratio = req.slo_ratio;
+  gr.emission_time_ms = req.emission_time_ms;
 
   {
     std::lock_guard<std::mutex> const lock(request_queue_mutex);
@@ -2154,6 +2156,7 @@ std::vector<GenerationResult>
 
   for (size_t i = 0; i < requests.size(); i++) {
     requests[i].slo_ratio = emission_machine.sample_slo_ratio();
+    requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms();
     printf("Prompt[%ld] with slo %.3f: %s\n",
            i,
            requests[i].slo_ratio,
@@ -2176,7 +2179,7 @@ std::vector<GenerationResult>
                       EmissionMachine &emission_machine) {
   std::vector<GenerationRequest> requests;
   for (std::string &prompt : prompts) {
-    requests.push_back(GenerationRequest(prompt, -1.0));
+    requests.push_back(GenerationRequest(prompt, -1.0, 0));
   }
   return generate(requests, emission_machine);
 }

From 0420199342d8fa488ee2560df0172ac2f96e510a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 20 Sep 2024 21:15:51 -0700
Subject: [PATCH 490/667] Dynamically control tree width to not exceed
 max_tokens_per_ssm_batch.

---
 src/runtime/request_manager.cc | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 44a067dcd..d7cd0afbf 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -510,13 +510,7 @@ size_t RequestManager::get_num_processed_requests() {
 }
 
 int RequestManager::get_num_active_requests() {
-  int count = 0;
-  for (int i = 0; i < get_max_requests_per_batch(); i++) {
-    if (guid_of_requests[i] != INVALID_GUID) {
-      count++;
-    }
-  }
-  return count;
+  return num_available_requests;
 }
 
 int RequestManager::get_empty_request_index() {
@@ -2604,6 +2598,10 @@ void RequestManager::add_tokens_to_spec_token_tree(
     InferenceResult const &ssm_inference_result) {
   // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
   // TODO: support gumbel sampling
+  int tree_width =
+      min(get_max_tokens_per_ssm_batch() / get_num_active_requests(),
+          get_max_tree_width());
+  assert(tree_width >= 1);
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -2626,7 +2624,7 @@ void RequestManager::add_tokens_to_spec_token_tree(
     std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();
     std::vector<std::pair<double, int>> preallocated_vector;
-    preallocated_vector.reserve(get_max_tree_width());
+    preallocated_vector.reserve(tree_width);
     std::priority_queue<std::pair<double, int>,
                         std::vector<std::pair<double, int>>,
                         std::greater<std::pair<double, int>>>
@@ -2653,14 +2651,14 @@ void RequestManager::add_tokens_to_spec_token_tree(
 
         double accumulated_log_prob = log_prob + parent_log_prob;
 
-        if (child_probs_pq.size() == get_max_tree_width() and
+        if (child_probs_pq.size() == tree_width and
             accumulated_log_prob > child_probs_pq.top().first) {
           // The current layer is full, and the new token has a higher
           // log prob than the minimum node in tokens, we don't need to add
           // the new token to the priority queue, and remove the minimum node
           // from the priority queue
           child_probs_pq.pop();
-        } else if (child_probs_pq.size() == get_max_tree_width()) {
+        } else if (child_probs_pq.size() == tree_width) {
           // The current layer is full, and the new token has a lower log prob
           // than the minimum node in tokens, we don't need to add the new token
           // to the priority queue

From 7c7376aeb48647cd6978afacc57eb3e9ebfdf9f7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 22 Sep 2024 14:36:04 -0700
Subject: [PATCH 491/667] Simplified the method to add tokens to the token
 trees.

---
 src/runtime/request_manager.cc | 40 ++++++++++++----------------------
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d7cd0afbf..837184d0d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2623,13 +2623,9 @@ void RequestManager::add_tokens_to_spec_token_tree(
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();
-    std::vector<std::pair<double, int>> preallocated_vector;
-    preallocated_vector.reserve(tree_width);
-    std::priority_queue<std::pair<double, int>,
-                        std::vector<std::pair<double, int>>,
-                        std::greater<std::pair<double, int>>>
-        child_probs_pq(std::greater<std::pair<double, int>>(),
-                       std::move(preallocated_vector));
+    std::vector<std::pair<double, int>> child_probs_v;
+    child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES *
+                          get_max_tree_width());
     int parent_pos = 0;
     for (auto const &parent_ptr : last_layer) {
       double parent_log_prob = parent_ptr->log_accumulated_prob;
@@ -2651,30 +2647,22 @@ void RequestManager::add_tokens_to_spec_token_tree(
 
         double accumulated_log_prob = log_prob + parent_log_prob;
 
-        if (child_probs_pq.size() == tree_width and
-            accumulated_log_prob > child_probs_pq.top().first) {
-          // The current layer is full, and the new token has a higher
-          // log prob than the minimum node in tokens, we don't need to add
-          // the new token to the priority queue, and remove the minimum node
-          // from the priority queue
-          child_probs_pq.pop();
-        } else if (child_probs_pq.size() == tree_width) {
-          // The current layer is full, and the new token has a lower log prob
-          // than the minimum node in tokens, we don't need to add the new token
-          // to the priority queue
-          continue;
-        }
-        child_probs_pq.push(std::make_pair(accumulated_log_prob, result_idx));
+        child_probs_v.emplace_back(accumulated_log_prob, result_idx);
       }
       parent_pos++;
     }
 
     spec_token_tree.add_layer();
-    while (!child_probs_pq.empty()) {
-      std::pair<double, int> child_pair = child_probs_pq.top();
-      child_probs_pq.pop();
-      double accumulated_log_prob = child_pair.first;
-      int result_idx = child_pair.second;
+    int actual_width = min(tree_width, (int)child_probs_v.size());
+    if (actual_width == 0) {
+      continue;
+    }
+    std::partial_sort(child_probs_v.begin(),
+                      child_probs_v.begin() + actual_width,
+                      child_probs_v.end(),
+                      std::greater<std::pair<double, int>>());
+    for (int i = 0; i < actual_width; i++) {
+      auto [accumulated_log_prob, result_idx] = child_probs_v[i];
       int parent_pos = (result_idx - result_offset) /
                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
       std::shared_ptr<TokenTreeNode> node_ptr = std::make_shared<TokenTreeNode>(

From 4396fc996f29de48a49471fd330b92ba8b5b2f3a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 23 Sep 2024 20:24:57 -0700
Subject: [PATCH 492/667] Dynamic max tree depth control

---
 include/flexflow/request_manager.h |  2 ++
 src/runtime/request_manager.cc     | 11 ++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index af8a80ee3..4df918562 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -422,6 +422,7 @@ class RequestManager {
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
   int num_available_requests = 0;
   int ssm_completed = true;
+  int ssm_tree_depth = 0;
 
   // Multi-model support
   std::vector<FFModel *> ssm_models;
@@ -510,6 +511,7 @@ class RequestManager {
   void add_tokens_toward_slo(RequestGuid guid, int &budget);
   void add_tokens_toward_memory_occupancy(int budget);
   void add_tokens_toward_goodput(int budget);
+  void update_token_tree_depth();
 
   /* ---------- Spec Decoding Helper Functions ---------- */
   void renormalize(std::vector<std::pair<TokenId, float>> &D,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 837184d0d..476da6a44 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -705,6 +705,11 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   // write_to_output_file("", str);
 }
 
+void RequestManager::update_token_tree_depth() {
+  ssm_tree_depth = min(get_max_tokens_per_batch() / get_num_active_requests(),
+                       get_max_tree_depth());
+}
+
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
   if (num_available_requests == 0) {
@@ -818,6 +823,7 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
       if (!ssm_completed) {
         ssm_completed = update_ssm_inference_results(result);
       }
+      // If the ssm speculation is completed, we do nothing
 
       if (current_ssm_step == get_max_tree_depth()) {
         request_manager_status = LLM_VERIFY;
@@ -1304,6 +1310,7 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     }
     profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
   }
+  update_token_tree_depth();
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
               << std::endl;
@@ -1653,7 +1660,8 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  if (current_ssm_step == get_max_tree_depth()) {
+  //   if (current_ssm_step == get_max_tree_depth()) {
+  if (current_ssm_step == ssm_tree_depth) {
     // Prune the token tree at the last step
     prune_token_tree();
     // Update profiling statistics before returning
@@ -2319,6 +2327,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
 
   request_manager_status = PREFILLING;
   prefill_model = SSM;
+  ssm_tree_depth = get_max_tree_depth();
 
   std::queue<InferenceResultFuture> infer_result_future_pipeline;
   infer_result_future_pipeline.push(irf_0);

From eee85fe503b458831674f89a73aba0c87ae1d638 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 24 Sep 2024 00:22:15 -0700
Subject: [PATCH 493/667] feat: update raft dependency (select_k)

---
 .gitmodules         | 13 +++----------
 config/config.linux |  3 ++-
 config/raft.patch   | 11 -----------
 deps/raft           |  2 +-
 deps/rmm            |  1 -
 deps/spdlog         |  1 -
 6 files changed, 6 insertions(+), 25 deletions(-)
 delete mode 100644 config/raft.patch
 delete mode 160000 deps/rmm
 delete mode 160000 deps/spdlog

diff --git a/.gitmodules b/.gitmodules
index 2ec860704..6b437e036 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,16 +23,9 @@
 	path = deps/tokenizers-cpp
 	url = https://github.com/mlc-ai/tokenizers-cpp.git
 	fetchRecurseSubmodules = true
-[submodule "deps/spdlog"]
-	path = deps/spdlog
-	url = https://github.com/gabime/spdlog.git
-[submodule "deps/rmm"]
-	path = deps/rmm
-	url = https://github.com/rapidsai/rmm.git
-[submodule "deps/raft"]
-	path = deps/raft
-	url = https://github.com/rapidsai/raft.git
-	ignore = dirty
 [submodule "deps/flashinfer"]
 	path = deps/flashinfer
 	url = https://github.com/flashinfer-ai/flashinfer.git
+[submodule "deps/raft"]
+	path = deps/raft
+	url = https://github.com/rapidsai/raft.git
diff --git a/config/config.linux b/config/config.linux
index 15e9c8821..e99dccd8b 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -111,7 +111,8 @@ function get_build_configs() {
     BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
 }
 
-patch -p0 --batch $(dirname $0)/../deps/raft/cpp/include/raft/matrix/detail/select_radix.cuh $(dirname $0)/../config/raft.patch
+#install raft
+INSTALL_PREFIX=./install $(dirname $0)/../deps/raft/build.sh libraft > /dev/null
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc
diff --git a/config/raft.patch b/config/raft.patch
deleted file mode 100644
index e587a590c..000000000
--- a/config/raft.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- raft/cpp/include/raft/matrix/detail/select_radix.cuh	2023-04-12 07:29:14.000000000 -0700
-+++ raft/cpp/include/raft/matrix/detail/select_radix_update.cuh	2023-04-20 19:06:53.323031000 -0700
-@@ -110,7 +110,7 @@
-   // When writing is not skipped, read `in_buf`(T) and `in_idx_buf`(IdxT), and write `out_buf`(T)
-   // and `out_idx_buf`(IdxT).
-   // The ratio between these cases determines whether to skip writing and hence the buffer size.
--  constexpr float ratio = 2 + sizeof(IdxT) * 2.0 / sizeof(T);
-+  constexpr float ratio = 128;
-   return len / ratio;
- }
- 
diff --git a/deps/raft b/deps/raft
index 7d1057e77..b79f15d2f 160000
--- a/deps/raft
+++ b/deps/raft
@@ -1 +1 @@
-Subproject commit 7d1057e77c71c0cb9d28043e3f1db036995ffe56
+Subproject commit b79f15d2f229849bc02425b2e4ffd7bd3db89d4c
diff --git a/deps/rmm b/deps/rmm
deleted file mode 160000
index 6797909d5..000000000
--- a/deps/rmm
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6797909d5304be6ee56c09c0156252e19f712639
diff --git a/deps/spdlog b/deps/spdlog
deleted file mode 160000
index 100f30043..000000000
--- a/deps/spdlog
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 100f30043f33277122e0991c83845a2617172ffd

From 7caaf725046605de50be075f5284a902fb1b9e16 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 24 Sep 2024 00:23:18 -0700
Subject: [PATCH 494/667] feat: raft build file

---
 CMakeLists.txt      | 10 +++++-----
 config/config.linux |  2 ++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00f257b6e..aa3c33790 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.10)
-project(FlexFlow)
+project(FlexFlow LANGUAGES CXX CUDA)
 
 
 include(ExternalProject)
@@ -204,12 +204,10 @@ if(NOT BUILD_LEGION_ONLY)
   # optional
   include(optional)
 
+  set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/build/install)
+  find_package(raft)
   list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/raft/cpp/include)
 
-  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/rmm/include)
-
-  list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/spdlog/include)
-
   list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)
 
   if (FF_GPU_BACKEND STREQUAL "cuda")
@@ -414,6 +412,8 @@ if(NOT BUILD_LEGION_ONLY)
     target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
   endif()
 
+  target_link_libraries(flexflow raft::raft)
+
   #library api version, bump from time to time
   set(SOVERSION 1)
 
diff --git a/config/config.linux b/config/config.linux
index e99dccd8b..8eb4f3087 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -112,7 +112,9 @@ function get_build_configs() {
 }
 
 #install raft
+echo "Building raft dependency ..."
 INSTALL_PREFIX=./install $(dirname $0)/../deps/raft/build.sh libraft > /dev/null
+echo "Building raft dependency ... Done"
 
 if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
     . $(dirname $0)/config.inc

From 2ab10b1dfd6a5fd4a599c9961a3c68afe495a618 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 24 Sep 2024 13:29:08 -0700
Subject: [PATCH 495/667] chore: minor

---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa3c33790..892b3c114 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,15 @@
 cmake_minimum_required(VERSION 3.10)
-project(FlexFlow LANGUAGES CXX CUDA)
+project(FlexFlow)
 
 
 include(ExternalProject)
 
+enable_language(CXX)
+enable_language(CUDA)
+if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
+    message(FATAL_ERROR "Your C++ compiler is too old. Please upgrade to version 8 or higher.")
+endif()
+
 # Set policy CMP0074 to eliminate cmake warnings
 cmake_policy(SET CMP0074 NEW)
 cmake_policy(SET CMP0077 NEW)

From 57f6378741099f00379573f490eb45fad6ea4bfa Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 24 Sep 2024 13:30:57 -0700
Subject: [PATCH 496/667] feat: update argTopk op

---
 src/ops/arg_topk.cu      | 82 ++++++++++++----------------------------
 src/ops/select_k_impl.cu | 35 +++++++++++++++++
 2 files changed, 59 insertions(+), 58 deletions(-)
 create mode 100644 src/ops/select_k_impl.cu

diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 099273da4..da09f30f9 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -15,60 +15,13 @@
 
 #include "flexflow/ops/arg_topk.h"
 #include "flexflow/utils/cuda_helper.h"
-#include "raft/matrix/detail/select_radix.cuh"
+#include "raft/core/device_resources.hpp"
+#include "raft/matrix/detail/select_k.cuh"
 
 namespace FlexFlow {
 // declare Legion names
 using Legion::coord_t;
 
-// Adopted from Raft's select_k
-// https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
-template <typename T, typename idxT>
-void raft_radix_11bits_kernel(T const *in,
-                              int batch_size,
-                              idxT len,
-                              idxT k,
-                              T *out,
-                              idxT *out_idx = nullptr,
-                              bool greater = true,
-                              cudaStream_t stream = 0) {
-  raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
-      in,
-      static_cast<idxT *>(nullptr),
-      batch_size,
-      len,
-      k,
-      out,
-      out_idx,
-      !greater,
-      true, // fused_last_filter
-      stream);
-}
-
-// Adopted from Raft's select_k
-// https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/matrix/detail/select_radix.cuh#L1113
-template <typename T, typename idxT>
-void raft_radix_11bits_extra_pass_kernel(T const *in,
-                                         int batch_size,
-                                         idxT len,
-                                         idxT k,
-                                         T *out,
-                                         idxT *out_idx = nullptr,
-                                         bool greater = true,
-                                         cudaStream_t stream = 0) {
-  raft::matrix::detail::select::radix::select_k<T, idxT, 11, 512>(
-      in,
-      static_cast<idxT *>(nullptr),
-      batch_size,
-      len,
-      k,
-      out,
-      out_idx,
-      !greater,
-      false, // fused_last_filter
-      stream);
-}
-
 __global__ void half2float_kernel(half const *__restrict__ in,
                                   float *__restrict__ out,
                                   int size) {
@@ -126,6 +79,9 @@ __global__ void renormalize_kernel(DT *topk_values,
   }
 }
 
+// Adopted from Raft's select_k
+// https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/matrix/detail/select_k.cuh
+
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
@@ -141,15 +97,25 @@ void ArgTopK::forward_kernel(
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
   assert(bc->num_active_requests() >= 0);
-  raft_radix_11bits_extra_pass_kernel<DT, int>(
-      input_ptr, batch_size, length, k, output_ptr, indices_ptr, true, stream);
-  if (sorted) {
-    assert(output_ptr != nullptr);
-    insertion_sort_kernel<<<GET_BLOCKS(batch_size),
-                            min((size_t)CUDA_NUM_THREADS, batch_size),
-                            0,
-                            stream>>>(output_ptr, indices_ptr, batch_size, k);
-  }
+  raft::device_resources handle(stream);
+  raft::matrix::detail::select_k(handle,
+                                 input_ptr,
+                                 (int *)nullptr,
+                                 batch_size,
+                                 (size_t)length,
+                                 k,
+                                 output_ptr,
+                                 indices_ptr,
+                                 /*select_min=*/false,
+                                 sorted);
+  // if (sorted) {
+  //   assert(output_ptr != nullptr);
+  //   insertion_sort_kernel<<<GET_BLOCKS(batch_size),
+  //                           min((size_t)CUDA_NUM_THREADS, batch_size),
+  //                           0,
+  //                           stream>>>(output_ptr, indices_ptr, batch_size,
+  //                           k);
+  // }
   if (renormalize) {
     assert(output_ptr != nullptr);
     renormalize_kernel<<<GET_BLOCKS(batch_size),
diff --git a/src/ops/select_k_impl.cu b/src/ops/select_k_impl.cu
new file mode 100644
index 000000000..18c7f762f
--- /dev/null
+++ b/src/ops/select_k_impl.cu
@@ -0,0 +1,35 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "raft/matrix/detail/select_k-inl.cuh"
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
+  template void raft::matrix::detail::select_k(raft::resources const& handle, \
+                                               const T* in_val,               \
+                                               const IdxT* in_idx,            \
+                                               size_t batch_size,             \
+                                               size_t len,                    \
+                                               int k,                         \
+                                               T* out_val,                    \
+                                               IdxT* out_idx,                 \
+                                               bool select_min,               \
+                                               bool sorted,                   \
+                                               raft::matrix::SelectAlgo algo, \
+                                               const IdxT* len_i)
+
+instantiate_raft_matrix_detail_select_k(half, int);
+instantiate_raft_matrix_detail_select_k(float, int);
+
+#undef instantiate_raft_matrix_detail_select_k

From 9fa1f4e670cc72e134ff9169832e031750f02ead Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 24 Sep 2024 16:12:03 -0700
Subject: [PATCH 497/667] chore: update emission trace

---
 inference/incr_decoding/incr_decoding.cc | 2 ++
 inference/spec_infer/spec_infer.cc       | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index fb203fd9b..4ed2158f6 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -392,6 +392,8 @@ void FlexFlow::top_level_task(Task const *task,
       for (size_t i = 0; i < result.size(); ++i) {
         json result_json;
         result_json["prompt"] = requests[i].prompt;
+        result_json["input_length"] = result[i].input_tokens.size();
+        result_json["output_length"] = result[i].output_tokens.size();
         result_json["slo_ratio"] = result[i].slo_ratio;
         result_json["emission_time_ms"] = result[i].emission_time_ms;
         output_json.push_back(result_json);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index c1315fd48..36d57d4c5 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -566,6 +566,8 @@ void FlexFlow::top_level_task(Task const *task,
       for (size_t i = 0; i < result.size(); ++i) {
         json result_json;
         result_json["prompt"] = requests[i].prompt;
+        result_json["input_length"] = result[i].input_tokens.size();
+        result_json["output_length"] = result[i].output_tokens.size();
         result_json["slo_ratio"] = result[i].slo_ratio;
         result_json["emission_time_ms"] = result[i].emission_time_ms;
         output_json.push_back(result_json);

From 0a516c6eec619d954391fa52489066677c5ef0b2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 26 Sep 2024 11:40:20 -0700
Subject: [PATCH 498/667] feat: add TraceEmissionMachine

---
 include/flexflow/inference.h             |  43 ++++++++-
 inference/incr_decoding/incr_decoding.cc | 106 +++++++++++++----------
 inference/spec_infer/spec_infer.cc       | 106 +++++++++++++----------
 src/ops/select_k_impl.cu                 |  26 +++---
 src/runtime/inference_manager.cc         |  38 ++++++++
 5 files changed, 217 insertions(+), 102 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 282b9f5c7..5a6dd8f2b 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -15,9 +15,12 @@
 
 #pragma once
 #include "flexflow/batch_config.h"
+#include <nlohmann/json.hpp>
 #include <string>
 #include <vector>
 
+using json = nlohmann::json;
+
 namespace FlexFlow {
 
 struct GenerationConfig {
@@ -89,10 +92,34 @@ class EmissionMachine {
 
   // Simulate next request arrival time
   virtual double get_next_interval_ms() = 0;
-  double sample_slo_ratio();
+  virtual double sample_slo_ratio();
   double get_elapsed_time_ms();
 };
 
+class EmissionTrace {
+public:
+  std::string prompt;
+  int input_length, output_length;
+  double slo_ratio;
+  double emission_time_ms;
+
+  EmissionTrace(std::string prompt_,
+                int input_length_,
+                int output_length_,
+                double slo_ratio_,
+                double emission_time_ms_)
+      : prompt(prompt_), input_length(input_length_),
+        output_length(output_length_), slo_ratio(slo_ratio_),
+        emission_time_ms(emission_time_ms_) {}
+  EmissionTrace(GenerationResult const &result)
+      : prompt(result.input_text), input_length(result.input_tokens.size()),
+        output_length(result.output_tokens.size()), slo_ratio(result.slo_ratio),
+        emission_time_ms(result.emission_time_ms) {}
+  EmissionTrace(json const &json_obj);
+
+  json to_json() const;
+};
+
 class ConstantEmissionMachine : public EmissionMachine {
 public:
   double interval_ms;
@@ -117,6 +144,20 @@ class PoissonEmissionMachine : public EmissionMachine {
   double get_next_interval_ms() override;
 };
 
+class TraceEmissionMachine : public EmissionMachine {
+public:
+  std::vector<double> timestamps, ratios;
+  size_t idx;
+
+  TraceEmissionMachine(std::vector<double> const &timestamps_,
+                       std::vector<double> const &ratios_)
+      : EmissionMachine(EmissionMode::Trace, 0, {}), timestamps(timestamps_),
+        ratios(ratios_), idx(0) {}
+
+  double get_next_interval_ms() override;
+  double sample_slo_ratio() override;
+};
+
 #include <string>
 #include <vector>
 
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 4ed2158f6..e289ae4ce 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -23,8 +23,6 @@
 #include <cassert>
 #include <wordexp.h>
 
-#include <nlohmann/json.hpp>
-
 using namespace FlexFlow;
 using namespace Legion;
 using json = nlohmann::json;
@@ -34,6 +32,7 @@ LegionRuntime::Logger::Category log_app("llama");
 struct FilePaths {
   std::string cache_folder_path;
   std::string prompt_file_path;
+  std::string trace_file_path;
   std::string output_file_path;
 };
 
@@ -78,6 +77,11 @@ void parse_input_args(char **argv,
       paths.prompt_file_path = std::string(argv[++i]);
       continue;
     }
+    // traces
+    if (!strcmp(argv[i], "-trace")) {
+      paths.trace_file_path = std::string(argv[++i]);
+      continue;
+    }
     // output file
     if (!strcmp(argv[i], "-output-file")) {
       paths.output_file_path = std::string(argv[++i]);
@@ -348,55 +352,69 @@ void FlexFlow::top_level_task(Task const *task,
   rm->start_background_server(&model);
 
   {
-    using json = nlohmann::json;
-    std::ifstream file_handle(file_paths.prompt_file_path);
-    assert(file_handle.good() && "Prompt file does not exist.");
-    json prompt_json = json::parse(file_handle,
-                                   /*parser_callback_t */ nullptr,
-                                   /*allow_exceptions */ true,
-                                   /*ignore_comments */ true);
+    std::vector<GenerationRequest> requests;
+    std::vector<GenerationResult> results;
 
-    // Parse slo_ratios
-    std::vector<std::pair<double, double>> slo_ratios;
-    if (prompt_json[0].contains("slo_ratios")) {
-      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
-        slo_ratios.emplace_back(std::stod(key), value.get<double>());
-      }
-    }
-    double total =
-        std::accumulate(slo_ratios.begin(),
-                        slo_ratios.end(),
-                        0.0,
-                        [](double sum, std::pair<double, double> const &pair) {
-                          return sum + pair.second;
-                        });
-    if (std::abs(total - 1.0) > 1e-6) {
-      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
-                << total << std::endl;
-      assert(false);
-    }
+    if (!file_paths.prompt_file_path.empty()) {
+      std::ifstream file_handle(file_paths.prompt_file_path);
+      assert(file_handle.good() && "Prompt file does not exist.");
+      json prompt_json = json::parse(file_handle,
+                                     /*parser_callback_t */ nullptr,
+                                     /*allow_exceptions */ true,
+                                     /*ignore_comments */ true);
 
-    std::vector<GenerationRequest> requests;
-    for (size_t i = 1; i < prompt_json.size(); ++i) {
-      requests.push_back(GenerationRequest(
-          prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+      // Parse slo_ratios
+      std::vector<std::pair<double, double>> slo_ratios;
+      if (prompt_json[0].contains("slo_ratios")) {
+        for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
+          slo_ratios.emplace_back(std::stod(key), value.get<double>());
+        }
+      }
+      double total = std::accumulate(
+          slo_ratios.begin(),
+          slo_ratios.end(),
+          0.0,
+          [](double sum, std::pair<double, double> const &pair) {
+            return sum + pair.second;
+          });
+      if (std::abs(total - 1.0) > 1e-6) {
+        std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
+                  << total << std::endl;
+        assert(false);
+      }
+      for (size_t i = 1; i < prompt_json.size(); ++i) {
+        requests.push_back(GenerationRequest(
+            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+      }
+      PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
+      // ConstantEmissionMachine emission_machine(-1, slo_ratios);
+      results = model.generate(requests, emission_machine);
+    } else if (!file_paths.trace_file_path.empty()) {
+      std::ifstream file_handle(file_paths.trace_file_path);
+      assert(file_handle.good() && "Trace file does not exist.");
+      json trace_json = json::parse(file_handle,
+                                    /*parser_callback_t */ nullptr,
+                                    /*allow_exceptions */ true,
+                                    /*ignore_comments */ true);
+      std::vector<double> timestamps, ratios;
+      for (auto const &json_obj : trace_json) {
+        EmissionTrace trace(json_obj);
+        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0));
+        timestamps.push_back(trace.emission_time_ms);
+        ratios.push_back(trace.slo_ratio);
+      }
+      TraceEmissionMachine emission_machine(timestamps, ratios);
+      results = model.generate(requests, emission_machine);
+    } else {
+      assert(false && "No prompt or trace file provided.");
     }
-    PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
-    // ConstantEmissionMachine emission_machine(-1, slo_ratios);
-    std::vector<GenerationResult> result =
-        model.generate(requests, emission_machine);
 
     // output generation results as json
     if (!emission_file_path.empty()) {
       json output_json;
-      for (size_t i = 0; i < result.size(); ++i) {
-        json result_json;
-        result_json["prompt"] = requests[i].prompt;
-        result_json["input_length"] = result[i].input_tokens.size();
-        result_json["output_length"] = result[i].output_tokens.size();
-        result_json["slo_ratio"] = result[i].slo_ratio;
-        result_json["emission_time_ms"] = result[i].emission_time_ms;
-        output_json.push_back(result_json);
+      for (size_t i = 0; i < results.size(); ++i) {
+        EmissionTrace trace(results[i]);
+        output_json.push_back(trace.to_json());
       }
       std::ofstream emission_file_handle(emission_file_path);
       emission_file_handle << output_json.dump(2) << std::endl;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 36d57d4c5..aa5e1693b 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -20,7 +20,6 @@
 #include "models/opt.h"
 #include <cassert>
 #include <filesystem>
-#include <nlohmann/json.hpp>
 #include <string>
 #include <wordexp.h>
 
@@ -33,6 +32,7 @@ LegionRuntime::Logger::Category log_app("llama");
 struct FilePaths {
   std::string cache_folder_path;
   std::string prompt_file_path;
+  std::string trace_file_path;
   std::string output_file_path;
 };
 
@@ -108,6 +108,11 @@ void parse_input_args(char **argv,
       paths.prompt_file_path = std::string(argv[++i]);
       continue;
     }
+    // traces
+    if (!strcmp(argv[i], "-trace")) {
+      paths.trace_file_path = std::string(argv[++i]);
+      continue;
+    }
     // output file
     if (!strcmp(argv[i], "-output-file")) {
       paths.output_file_path = std::string(argv[++i]);
@@ -522,55 +527,68 @@ void FlexFlow::top_level_task(Task const *task,
 
   // Register requests from prompt file
   {
-    using json = nlohmann::json;
-    std::ifstream file_handle(file_paths.prompt_file_path);
-    assert(file_handle.good() && "Prompt file does not exist.");
-    json prompt_json = json::parse(file_handle,
-                                   /*parser_callback_t */ nullptr,
-                                   /*allow_exceptions */ true,
-                                   /*ignore_comments */ true);
-
-    // Parse slo_ratios
-    std::vector<std::pair<double, double>> slo_ratios;
-    if (prompt_json[0].contains("slo_ratios")) {
-      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
-        slo_ratios.emplace_back(std::stod(key), value.get<double>());
-      }
-    }
-    double total =
-        std::accumulate(slo_ratios.begin(),
-                        slo_ratios.end(),
-                        0.0,
-                        [](double sum, std::pair<double, double> const &pair) {
-                          return sum + pair.second;
-                        });
-    if (std::abs(total - 1.0) > 1e-6) {
-      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
-                << total << std::endl;
-      assert(false);
-    }
-
     std::vector<GenerationRequest> requests;
-    for (size_t i = 1; i < prompt_json.size(); ++i) {
-      requests.push_back(GenerationRequest(
-          prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+    std::vector<GenerationResult> results;
+
+    if (!file_paths.prompt_file_path.empty()) {
+      std::ifstream file_handle(file_paths.prompt_file_path);
+      assert(file_handle.good() && "Prompt file does not exist.");
+      json prompt_json = json::parse(file_handle,
+                                     /*parser_callback_t */ nullptr,
+                                     /*allow_exceptions */ true,
+                                     /*ignore_comments */ true);
+      // Parse slo_ratios
+      std::vector<std::pair<double, double>> slo_ratios;
+      if (prompt_json[0].contains("slo_ratios")) {
+        for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
+          slo_ratios.emplace_back(std::stod(key), value.get<double>());
+        }
+      }
+      double total = std::accumulate(
+          slo_ratios.begin(),
+          slo_ratios.end(),
+          0.0,
+          [](double sum, std::pair<double, double> const &pair) {
+            return sum + pair.second;
+          });
+      if (std::abs(total - 1.0) > 1e-6) {
+        std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
+                  << total << std::endl;
+        assert(false);
+      }
+      for (size_t i = 1; i < prompt_json.size(); ++i) {
+        requests.push_back(GenerationRequest(
+            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+      }
+      PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
+      // ConstantEmissionMachine emission_machine(-1, slo_ratios);
+      results = tree_model.generate(requests, emission_machine);
+    } else if (!file_paths.trace_file_path.empty()) {
+      std::ifstream file_handle(file_paths.trace_file_path);
+      assert(file_handle.good() && "Trace file does not exist.");
+      json trace_json = json::parse(file_handle,
+                                    /*parser_callback_t */ nullptr,
+                                    /*allow_exceptions */ true,
+                                    /*ignore_comments */ true);
+      std::vector<double> timestamps, ratios;
+      for (auto const &json_obj : trace_json) {
+        EmissionTrace trace(json_obj);
+        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0));
+        timestamps.push_back(trace.emission_time_ms);
+        ratios.push_back(trace.slo_ratio);
+      }
+      TraceEmissionMachine emission_machine(timestamps, ratios);
+      results = tree_model.generate(requests, emission_machine);
+    } else {
+      assert(false && "No prompt or trace file provided.");
     }
-    PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
-    // ConstantEmissionMachine emission_machine(-1, slo_ratios);
-    std::vector<GenerationResult> result =
-        tree_model.generate(requests, emission_machine);
 
     // output generation results as json
     if (!emission_file_path.empty()) {
       json output_json;
-      for (size_t i = 0; i < result.size(); ++i) {
-        json result_json;
-        result_json["prompt"] = requests[i].prompt;
-        result_json["input_length"] = result[i].input_tokens.size();
-        result_json["output_length"] = result[i].output_tokens.size();
-        result_json["slo_ratio"] = result[i].slo_ratio;
-        result_json["emission_time_ms"] = result[i].emission_time_ms;
-        output_json.push_back(result_json);
+      for (size_t i = 0; i < results.size(); ++i) {
+        EmissionTrace trace(results[i]);
+        output_json.push_back(trace.to_json());
       }
       std::ofstream emission_file_handle(emission_file_path);
       emission_file_handle << output_json.dump(2) << std::endl;
diff --git a/src/ops/select_k_impl.cu b/src/ops/select_k_impl.cu
index 18c7f762f..9fcdbb719 100644
--- a/src/ops/select_k_impl.cu
+++ b/src/ops/select_k_impl.cu
@@ -15,19 +15,19 @@
 
 #include "raft/matrix/detail/select_k-inl.cuh"
 
-#define instantiate_raft_matrix_detail_select_k(T, IdxT)                      \
-  template void raft::matrix::detail::select_k(raft::resources const& handle, \
-                                               const T* in_val,               \
-                                               const IdxT* in_idx,            \
-                                               size_t batch_size,             \
-                                               size_t len,                    \
-                                               int k,                         \
-                                               T* out_val,                    \
-                                               IdxT* out_idx,                 \
-                                               bool select_min,               \
-                                               bool sorted,                   \
-                                               raft::matrix::SelectAlgo algo, \
-                                               const IdxT* len_i)
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                       \
+  template void raft::matrix::detail::select_k(raft::resources const &handle,  \
+                                               const T *in_val,                \
+                                               const IdxT *in_idx,             \
+                                               size_t batch_size,              \
+                                               size_t len,                     \
+                                               int k,                          \
+                                               T *out_val,                     \
+                                               IdxT *out_idx,                  \
+                                               bool select_min,                \
+                                               bool sorted,                    \
+                                               raft::matrix::SelectAlgo algo,  \
+                                               const IdxT *len_i)
 
 instantiate_raft_matrix_detail_select_k(half, int);
 instantiate_raft_matrix_detail_select_k(float, int);
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 019a7cb3d..28ec523b9 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -22,6 +22,7 @@
 #include "flexflow/ops/noop.h"
 #include "flexflow/parallel_ops/parallel_op.h"
 #include "flexflow/request_manager.h"
+#include <cassert>
 #include <random>
 
 namespace FlexFlow {
@@ -700,6 +701,24 @@ double EmissionMachine::get_elapsed_time_ms() {
   return elapsed_time_ms;
 }
 
+EmissionTrace::EmissionTrace(json const &json_obj) {
+  prompt = json_obj["prompt"].get<std::string>();
+  input_length = json_obj["input_length"].get<int>();
+  output_length = json_obj["output_length"].get<int>();
+  slo_ratio = json_obj["slo_ratio"].get<double>();
+  emission_time_ms = json_obj["emission_time_ms"].get<double>();
+}
+
+json EmissionTrace::to_json() const {
+  json json_obj;
+  json_obj["prompt"] = prompt;
+  json_obj["input_length"] = input_length;
+  json_obj["output_length"] = output_length;
+  json_obj["slo_ratio"] = slo_ratio;
+  json_obj["emission_time_ms"] = emission_time_ms;
+  return json_obj;
+}
+
 double ConstantEmissionMachine::get_next_interval_ms() {
   return interval_ms;
 }
@@ -713,7 +732,17 @@ double PoissonEmissionMachine::get_next_interval_ms() {
   return distribution(generator) * 1e3;
 }
 
+double TraceEmissionMachine::get_next_interval_ms() {
+  if (timestamps.empty()) {
+    return 0;
+  }
+  double next_interval = timestamps[idx] - elapsed_time_ms;
+  idx++;
+  return next_interval;
+}
+
 double EmissionMachine::sample_slo_ratio() {
+  assert(!slo_ratios.empty());
   static std::default_random_engine generator(
       std::chrono::system_clock::now().time_since_epoch().count());
   static std::uniform_real_distribution<double> distribution(0.0, 1.0);
@@ -726,4 +755,13 @@ double EmissionMachine::sample_slo_ratio() {
   }
   return slo_ratios.back().first;
 }
+
+double TraceEmissionMachine::sample_slo_ratio() {
+  // NOTE: Should be called before wait_until_next_request.
+  if (ratios.empty()) {
+    return 1.0;
+  }
+  double next_slo_ratio = ratios[idx];
+  return next_slo_ratio;
+}
 }; // namespace FlexFlow

From 2071273019f830e73f68219c985fb8b0afdadbfd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 27 Sep 2024 22:24:42 -0700
Subject: [PATCH 499/667] Add back old scheduler

---
 include/flexflow/request_manager.h |   5 ++
 src/runtime/request_manager.cc     | 104 +++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4df918562..09d38e673 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -292,6 +292,8 @@ class RequestManager {
   void set_memory_occupancy(bool memory_occupancy);
   void
       set_slo_violation_early_termination(bool slo_violation_early_termination);
+  void set_spec_infer_old_version(bool spec_infer_old_version);
+  bool get_spec_infer_old_version();
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
@@ -395,6 +397,7 @@ class RequestManager {
   bool streaming_cache = false;
   bool memory_occupancy = false;
   bool slo_violation_early_termination = false;
+  bool spec_infer_old_version = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -507,6 +510,8 @@ class RequestManager {
                                    BatchConfig::TokenId token_id);
   void add_tokens_to_spec_token_tree(
       InferenceResult const &ssm_inference_result);
+  void add_tokens_to_spec_token_tree_old_version(
+      InferenceResult const &ssm_inference_result);
   void prune_token_tree();
   void add_tokens_toward_slo(RequestGuid guid, int &budget);
   void add_tokens_toward_memory_occupancy(int budget);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 476da6a44..e4b87b9a8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -296,6 +296,14 @@ void RequestManager::set_slo_violation_early_termination(
   slo_violation_early_termination = slo_violation_early_termination_;
 }
 
+void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) {
+  spec_infer_old_version = spec_infer_old_version_;
+}
+
+bool RequestManager::get_spec_infer_old_version() {
+  return spec_infer_old_version;
+}
+
 double RequestManager::get_request_expected_latency(Request &request) {
   return request.get_slo_ratio() * baseline_latency_ms *
          (request.tokens.size() - request.llm_prefill_len);
@@ -1310,7 +1318,11 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
     }
     profiling.ssm_step_start = Realm::Clock::current_time_in_microseconds();
   }
-  update_token_tree_depth();
+
+  if (!spec_infer_old_version) {
+    // Only dynamically update the tree depth in the new version
+    update_token_tree_depth();
+  }
   if (verbose) {
     std::cout << "prepare_first_spec_batch_config NEW batchconfig:"
               << std::endl;
@@ -1630,7 +1642,11 @@ bool RequestManager::update_ssm_inference_results(
   // Here we assume that the order of the tokens in the last
   // BatchConfig and hence the last InferenceResult is equal to
   // the order of the request in the last BatchConfig
-  add_tokens_to_spec_token_tree(ssm_inference_result);
+  if (!spec_infer_old_version) {
+    add_tokens_to_spec_token_tree(ssm_inference_result);
+  } else {
+    add_tokens_to_spec_token_tree_old_version(ssm_inference_result);
+  }
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -1660,10 +1676,11 @@ bool RequestManager::update_ssm_inference_results(
   }
 
   // Stop conditions
-  //   if (current_ssm_step == get_max_tree_depth()) {
   if (current_ssm_step == ssm_tree_depth) {
     // Prune the token tree at the last step
-    prune_token_tree();
+    if (!spec_infer_old_version) {
+      prune_token_tree();
+    }
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
         (Realm::Clock::current_time_in_microseconds() -
@@ -1811,6 +1828,7 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   // verification.
   return llm_bitmask;
 }
+
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {
@@ -2607,6 +2625,7 @@ void RequestManager::add_tokens_to_spec_token_tree(
     InferenceResult const &ssm_inference_result) {
   // TODO: parameterize MAX_SPECULATIVE_TREE_BRANCHES
   // TODO: support gumbel sampling
+
   int tree_width =
       min(get_max_tokens_per_ssm_batch() / get_num_active_requests(),
           get_max_tree_width());
@@ -2685,6 +2704,83 @@ void RequestManager::add_tokens_to_spec_token_tree(
   }
 }
 
+void RequestManager::add_tokens_to_spec_token_tree_old_version(
+    InferenceResult const &ssm_inference_result) {
+
+  std::vector<int> tree_width_vector = {1, 1, 3, 1, 1, 1, 1, 1};
+
+  int expand_width = tree_width_vector[current_ssm_step - 1];
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+
+    int parent_num = request.num_tokens_in_batch;
+    if (parent_num == 0) {
+      continue;
+    }
+
+    int result_offset = request.first_token_offset_in_batch *
+                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    TokenTree &spec_token_tree = request.speculative_token_trees[0];
+    std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
+        spec_token_tree.tree_layers.back();
+    spec_token_tree.add_layer();
+
+    int parent_pos = 0;
+    for (auto const &parent_ptr : last_layer) {
+      double parent_log_prob = parent_ptr->log_accumulated_prob;
+      int child_start_idx =
+          result_offset +
+          parent_pos * BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+      std::vector<std::pair<double, int>> child_probs_v;
+      child_probs_v.reserve(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES);
+      for (int result_idx = child_start_idx;
+           result_idx <
+           child_start_idx + BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+           result_idx++) {
+        double log_prob = log((double)ssm_inference_result.probs[result_idx]);
+        if (log_prob == -std::numeric_limits<double>::infinity()) {
+          continue;
+        }
+        if (log_prob == 0.0) {
+          // Slightly perturb the log prob to make it strictly less than 0
+          log_prob -= 1e-10;
+        }
+
+        double accumulated_log_prob = log_prob + parent_log_prob;
+
+        child_probs_v.emplace_back(accumulated_log_prob, result_idx);
+      }
+      int actual_width = min(expand_width, (int)child_probs_v.size());
+      if (actual_width == 0) {
+        continue;
+      }
+      std::partial_sort(child_probs_v.begin(),
+                        child_probs_v.begin() + actual_width,
+                        child_probs_v.end(),
+                        std::greater<std::pair<double, int>>());
+      for (int i = 0; i < actual_width; i++) {
+        auto [accumulated_log_prob, result_idx] = child_probs_v[i];
+        std::shared_ptr<TokenTreeNode> node_ptr =
+            std::make_shared<TokenTreeNode>(
+                ssm_inference_result.token_ids[result_idx],
+                accumulated_log_prob,
+                parent_pos);
+        node_ptr->included = true;
+        spec_token_tree.tree_layers.back().push_back(node_ptr);
+      }
+      parent_pos++;
+    }
+  }
+}
+
 void RequestManager::prune_token_tree() {
   // Each reqeust has at least one token
   int budget = get_max_tokens_per_batch() - num_available_requests;

From 79f9130142d0984ee47d571adb549edfb70d484f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 30 Sep 2024 21:47:25 -0700
Subject: [PATCH 500/667] feat: add trace generator

---
 CMakeLists.txt                               |   1 +
 include/flexflow/request_manager.h           |   1 +
 inference/trace_generator/CMakeLists.txt     |  37 ++
 inference/trace_generator/Makefile           |  37 ++
 inference/trace_generator/trace_generator.cc | 410 +++++++++++++++++++
 src/runtime/request_manager.cc               |   4 +
 6 files changed, 490 insertions(+)
 create mode 100644 inference/trace_generator/CMakeLists.txt
 create mode 100644 inference/trace_generator/Makefile
 create mode 100644 inference/trace_generator/trace_generator.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 892b3c114..bbe817c1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -581,6 +581,7 @@ if(NOT BUILD_LEGION_ONLY)
   if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
     add_subdirectory(inference/spec_infer)
     add_subdirectory(inference/incr_decoding)
+    add_subdirectory(inference/trace_generator)
   endif()
 
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4df918562..305cb4f16 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -299,6 +299,7 @@ class RequestManager {
                           int bos_token_id,
                           int eos_token_id,
                           std::string const &path);
+  std::vector<int32_t> tokenize(std::string const &text);
   void register_output_filepath(std::string const &);
 
   FFModel *get_ssm_model(int model_id);
diff --git a/inference/trace_generator/CMakeLists.txt b/inference/trace_generator/CMakeLists.txt
new file mode 100644
index 000000000..f18eb712c
--- /dev/null
+++ b/inference/trace_generator/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(FlexFlow_TraceGenerator)
+set(project_target trace_generator)
+
+
+set(CPU_SRC
+  ${FLEXFLOW_CPP_DRV_SRC}
+  trace_generator.cc
+  ../models/llama.cc
+  ../models/opt.cc
+  ../models/falcon.cc
+  ../models/mpt.cc)
+
+if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+  cuda_add_executable(${project_target} ${CPU_SRC})
+  if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+    target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__)
+  endif()
+elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+  set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP)
+  hip_add_executable(${project_target} ${CPU_SRC})
+  if (FF_HIP_ARCH STREQUAL "")
+    message(FATAL_ERROR "FF_HIP_ARCH is empty!")
+  endif()
+  set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
+  target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__)
+else()
+  message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported")
+endif()
+
+target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
+target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
+target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
+
+set(BIN_DEST "bin")
+install(TARGETS ${project_target} DESTINATION ${BIN_DEST})
diff --git a/inference/trace_generator/Makefile b/inference/trace_generator/Makefile
new file mode 100644
index 000000000..0e4b79f51
--- /dev/null
+++ b/inference/trace_generator/Makefile
@@ -0,0 +1,37 @@
+# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Flags for directing the runtime makefile what to include
+DEBUG           ?= 0		# Include debugging symbols
+MAX_DIM         ?= 4		# Maximum number of dimensions
+OUTPUT_LEVEL    ?= LEVEL_DEBUG	# Compile time logging level
+USE_CUDA        ?= 1		# Include CUDA support (requires CUDA)
+USE_GASNET      ?= 0		# Include GASNet support (requires GASNet)
+USE_HDF         ?= 1		# Include HDF5 support (requires HDF5)
+ALT_MAPPERS     ?= 0		# Include alternative mappers (not recommended)
+
+# Put the binary file name here
+OUTFILE		?= llama_pipeline
+# List all the application source files here
+ifndef CUDA_HOME
+CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1))
+endif
+
+
+ifndef FF_HOME
+$(error FF_HOME variable is not defined, aborting build)
+endif
+
+include $(FF_HOME)/FlexFlow.mk
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
new file mode 100644
index 000000000..322e4e590
--- /dev/null
+++ b/inference/trace_generator/trace_generator.cc
@@ -0,0 +1,410 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/inference.h"
+#include "models/falcon.h"
+#include "models/llama.h"
+#include "models/mpt.h"
+#include "models/opt.h"
+#include <cassert>
+#include <filesystem>
+#include <string>
+#include <vector>
+#include <wordexp.h>
+
+using namespace FlexFlow;
+using namespace Legion;
+using json = nlohmann::json;
+
+struct FilePaths {
+  std::string cache_folder_path;
+  std::string prompt_file_path;
+  std::string log_file_path;
+  std::string emission_file_path;
+};
+
+struct ModelNames {
+  std::string llm_model_name;
+  std::vector<std::string> ssm_model_names;
+};
+
+struct ModelMeta {
+  ModelNames model_names;
+
+  ModelType llm_model_type;
+  std::string llm_tokenizer_path;
+  std::string llm_weights_path;
+  std::string llm_model_config_path;
+
+  int bos_token_id, eos_token_id;
+
+  std::vector<ModelType> ssm_model_types;
+  std::vector<std::string> ssm_model_config_paths;
+  std::vector<std::string> ssm_model_weights_paths;
+};
+
+void parse_input_args(char **argv,
+                      int argc,
+                      FilePaths &paths,
+                      ModelNames &model_names,
+                      bool &use_full_precision,
+                      bool &verbose,
+                      int &max_sequence_length) {
+  for (int i = 1; i < argc; i++) {
+    // llm model name
+    if (!strcmp(argv[i], "-llm-model")) {
+      model_names.llm_model_name = std::string(argv[++i]);
+      for (char &c : model_names.llm_model_name) {
+        c = std::tolower(c);
+      }
+      continue;
+    }
+    // ssm models names
+    if (!strcmp(argv[i], "-ssm-model")) {
+      std::string ssm_model_name = std::string(argv[++i]);
+      for (char &c : ssm_model_name) {
+        c = std::tolower(c);
+      }
+      model_names.ssm_model_names.push_back(ssm_model_name);
+      continue;
+    }
+    // cache folder
+    if (!strcmp(argv[i], "-cache-folder")) {
+      paths.cache_folder_path = std::string(argv[++i]);
+      continue;
+    }
+    // prompts
+    if (!strcmp(argv[i], "-prompt")) {
+      paths.prompt_file_path = std::string(argv[++i]);
+      continue;
+    }
+    // traces
+    if (!strcmp(argv[i], "-log")) {
+      paths.log_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--emission-file-path")) {
+      paths.emission_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--use-full-precision")) {
+      use_full_precision = true;
+      continue;
+    }
+    // verbose logging to stdout
+    if (!strcmp(argv[i], "--verbose")) {
+      verbose = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-sequence-length")) {
+      max_sequence_length = std::stoi(argv[++i]);
+      continue;
+    }
+  }
+  if (paths.cache_folder_path.empty()) {
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
+  }
+  // Expand ~ to the home directory if needed
+  wordexp_t p;
+  wordexp(paths.cache_folder_path.c_str(), &p, 0);
+  paths.cache_folder_path = p.we_wordv[0];
+  wordfree(&p);
+}
+
+void get_model_meta(FilePaths &file_paths,
+                    ModelMeta &model_metadata,
+                    bool use_full_precision) {
+  if (model_metadata.model_names.llm_model_name.empty() ||
+      model_metadata.model_names.ssm_model_names.size() == 0) {
+    assert(false && "SpecInfer needs at least one LLM and one SSM for "
+                    "speculative inference");
+  }
+  model_metadata.llm_model_config_path =
+      join_path({file_paths.cache_folder_path,
+                 "configs",
+                 model_metadata.model_names.llm_model_name,
+                 "config.json"});
+  model_metadata.llm_tokenizer_path =
+      join_path({file_paths.cache_folder_path,
+                 "tokenizers",
+                 model_metadata.model_names.llm_model_name});
+  model_metadata.llm_weights_path =
+      join_path({file_paths.cache_folder_path,
+                 "weights",
+                 model_metadata.model_names.llm_model_name,
+                 use_full_precision ? "full-precision" : "half-precision"});
+
+  std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path);
+  if (!llm_config_file_handle.good()) {
+    std::cout << "LLM Model config file "
+              << model_metadata.llm_model_config_path << " not found."
+              << std::endl;
+    assert(false);
+  }
+  json llm_model_config = json::parse(llm_config_file_handle,
+                                      /*parser_callback_t */ nullptr,
+                                      /*allow_exceptions */ true,
+                                      /*ignore_comments */ true);
+
+  model_metadata.llm_model_type = ModelType::UNKNOWN;
+  auto architectures = llm_model_config["architectures"];
+  for (auto const &str : architectures) {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+      model_metadata.llm_model_type = ModelType::LLAMA;
+      break;
+    } else if (str == "OPTForCausalLM") {
+      model_metadata.llm_model_type = ModelType::OPT;
+      break;
+    } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") {
+      model_metadata.llm_model_type = ModelType::FALCON;
+      break;
+    } else if (str == "MPTForCausalLM") {
+      model_metadata.llm_model_type = ModelType::MPT;
+      break;
+    }
+  }
+  model_metadata.bos_token_id =
+      llm_model_config.find("bos_token_id") == llm_model_config.end()
+          ? -1
+          : (int)llm_model_config.at("bos_token_id");
+  model_metadata.eos_token_id =
+      llm_model_config.find("eos_token_id") == llm_model_config.end()
+          ? -1
+          : (int)llm_model_config.at("eos_token_id");
+
+  for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) {
+    std::string ssm_config_path = join_path({file_paths.cache_folder_path,
+                                             "configs",
+                                             ssm_model_name,
+                                             "config.json"});
+    std::string ssm_tokenizer_path =
+        join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name});
+    std::string ssm_weights_path =
+        join_path({file_paths.cache_folder_path,
+                   "weights",
+                   ssm_model_name,
+                   use_full_precision ? "full-precision" : "half-precision"});
+
+    std::ifstream ssm_config_file_handle(ssm_config_path);
+    if (!ssm_config_file_handle.good()) {
+      std::cout << "SSM Model config file " << ssm_config_path << " not found."
+                << std::endl;
+      assert(false);
+    }
+    json ssm_model_config = json::parse(ssm_config_file_handle,
+                                        /*parser_callback_t */ nullptr,
+                                        /*allow_exceptions */ true,
+                                        /*ignore_comments */ true);
+
+    ModelType ssm_model_type = ModelType::UNKNOWN;
+    auto architectures = ssm_model_config["architectures"];
+    for (auto const &str : architectures) {
+      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+        ssm_model_type = ModelType::LLAMA;
+        break;
+      } else if (str == "OPTForCausalLM") {
+        ssm_model_type = ModelType::OPT;
+        break;
+      } else if (str == "RWForCausalLM") {
+        ssm_model_type = ModelType::FALCON;
+        break;
+      } else if (str == "MPTForCausalLM") {
+        ssm_model_type = ModelType::MPT;
+        break;
+      }
+    }
+    int ssm_bos_id =
+        ssm_model_config.find("bos_token_id") == ssm_model_config.end()
+            ? -1
+            : (int)ssm_model_config.at("bos_token_id");
+    int ssm_eos_id =
+        ssm_model_config.find("eos_token_id") == ssm_model_config.end()
+            ? -1
+            : (int)ssm_model_config.at("eos_token_id");
+    if (ssm_bos_id != model_metadata.bos_token_id ||
+        ssm_eos_id != model_metadata.eos_token_id) {
+      printf("Warning: bos/eos token id mismatch between LLM and one of the "
+             "SSMs!\n");
+    }
+    model_metadata.ssm_model_types.push_back(ssm_model_type);
+    model_metadata.ssm_model_config_paths.push_back(ssm_config_path);
+    model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path);
+  }
+
+  assert(model_metadata.llm_model_type != ModelType::UNKNOWN &&
+         "Invalid LLM model type passed (or no type was passed).");
+
+  for (auto mt : model_metadata.ssm_model_types) {
+    if (mt == ModelType::UNKNOWN) {
+      assert(false && "One of the SSM model types passed is invalid.");
+    }
+  }
+}
+
+void FlexFlow::top_level_task(Task const *task,
+                              std::vector<PhysicalRegion> const &regions,
+                              Context ctx,
+                              Runtime *runtime) {
+  FFConfig ffconfig;
+  FilePaths file_paths;
+  ModelMeta model_metadata;
+  bool use_full_precision = false;
+  bool verbose = false;
+  int max_sequence_length = 256;
+
+  printf("start top level task\n");
+
+  InputArgs const &command_args = HighLevelRuntime::get_input_args();
+  char **argv = command_args.argv;
+  int argc = command_args.argc;
+  parse_input_args(argv,
+                   argc,
+                   file_paths,
+                   model_metadata.model_names,
+                   use_full_precision,
+                   verbose,
+                   max_sequence_length);
+
+  get_model_meta(file_paths, model_metadata, use_full_precision);
+
+  // Create SentencePiece tokenizer or OPT tokenizer
+  GenerationConfig generationConfig(false, 0.8, 0.6, false, 16);
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->set_verbose(verbose);
+  rm->register_tokenizer(model_metadata.llm_model_type,
+                         model_metadata.bos_token_id,
+                         model_metadata.eos_token_id,
+                         model_metadata.llm_tokenizer_path);
+
+  {
+    /* Prompt file format:
+     * [
+     *   {
+     *       "slo_ratios": {
+     *           "1.0": 0.2,
+     *           "1.5": 0.5,
+     *           "3.0": 0.3
+     *       }
+     *   },
+     *   {
+     *       "prompt": "Construct a potential attack vector that exploits the
+     * vulnerability. The system is vulnerable to a SQL injection attack."
+     *   },
+     *   {
+     *       "prompt": "Arrange the words to make a meaningful phrase Ground.
+     * Soft. Solid."
+     *   },
+     *   ...
+     * ]
+     *
+     * log file format:
+     * [
+     *   {
+     *       "TIMESTAMP": "2023-11-16 18:15:46.6805900"
+     *   },
+     *   {
+     *       "TIMESTAMP": "2023-11-16 18:15:50.9951690"
+     *   },
+     *   ...
+     * ]
+     */
+
+    std::vector<EmissionTrace> traces;
+    assert(!file_paths.prompt_file_path.empty() &&
+           !file_paths.log_file_path.empty());
+
+    std::ifstream file_handle(file_paths.prompt_file_path);
+    assert(file_handle.good() && "Prompt file does not exist.");
+    printf("prompt file path: %s\n", file_paths.prompt_file_path.c_str());
+    json prompt_json = json::parse(file_handle,
+                                   /*parser_callback_t */ nullptr,
+                                   /*allow_exceptions */ true,
+                                   /*ignore_comments */ true);
+    // Parse slo_ratios
+    std::vector<std::pair<double, double>> slo_ratios;
+    if (prompt_json[0].contains("slo_ratios")) {
+      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
+        slo_ratios.emplace_back(std::stod(key), value.get<double>());
+      }
+    }
+    double total =
+        std::accumulate(slo_ratios.begin(),
+                        slo_ratios.end(),
+                        0.0,
+                        [](double sum, std::pair<double, double> const &pair) {
+                          return sum + pair.second;
+                        });
+    if (std::abs(total - 1.0) > 1e-6) {
+      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
+                << total << std::endl;
+      assert(false);
+    }
+    ConstantEmissionMachine emission_machine(-1, slo_ratios);
+
+    file_handle = std::ifstream(file_paths.log_file_path);
+    assert(file_handle.good() && "Log file does not exist.");
+    printf("log file path: %s\n", file_paths.log_file_path.c_str());
+    json log_json = json::parse(file_handle,
+                                /*parser_callback_t */ nullptr,
+                                /*allow_exceptions */ true,
+                                /*ignore_comments */ true);
+
+    auto time_diff_ms = [](std::string const &start, std::string const &end) {
+      std::tm tm = {};
+      std::istringstream ss(start);
+      ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S");
+      auto start_time =
+          std::chrono::system_clock::from_time_t(std::mktime(&tm));
+      ss = std::istringstream(end);
+      ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S");
+      auto end_time = std::chrono::system_clock::from_time_t(std::mktime(&tm));
+      return std::chrono::duration_cast<std::chrono::milliseconds>(end_time -
+                                                                   start_time)
+          .count();
+    };
+
+    printf("start trace generation\n");
+    int num_requests = min(prompt_json.size() - 1, log_json.size());
+    std::string start_time = log_json[0]["TIMESTAMP"].get<std::string>();
+    for (int i = 0; i < num_requests; ++i) {
+      std::string prompt = prompt_json[i + 1]["prompt"].get<std::string>();
+      std::vector<int32_t> input_tokens = rm->tokenize(prompt);
+      std::string timestamp = log_json[i]["TIMESTAMP"].get<std::string>();
+      EmissionTrace trace(prompt,
+                          input_tokens.size(),
+                          max_sequence_length,
+                          emission_machine.sample_slo_ratio(),
+                          time_diff_ms(start_time, timestamp));
+      traces.push_back(trace);
+    }
+
+    // output generation results as json
+    assert(!file_paths.emission_file_path.empty());
+    json output_json;
+    for (EmissionTrace const &trace : traces) {
+      output_json.push_back(trace.to_json());
+    }
+    std::ofstream emission_file_handle(file_paths.emission_file_path);
+    emission_file_handle << output_json.dump(2) << std::endl;
+  }
+
+  // float* data
+  std::cout << "----------trace generated--------------" << std::endl;
+}
+
+void FlexFlow::register_custom_tasks() {}
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 476da6a44..292b7215a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -380,6 +380,10 @@ void RequestManager::register_tokenizer(ModelType type,
   }
 }
 
+std::vector<int32_t> RequestManager::tokenize(std::string const &text) {
+  return tokenizer_->Encode(text);
+}
+
 void RequestManager::register_output_filepath(
     std::string const &_output_filepath) {
   this->output_filepath = _output_filepath;

From f224b5e688b1a653b28ddaa6dcda15fcfa668c69 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 30 Sep 2024 21:48:02 -0700
Subject: [PATCH 501/667] fix: initialization issue; read microsecond

---
 inference/trace_generator/trace_generator.cc | 81 ++++++++++++++++++--
 1 file changed, 73 insertions(+), 8 deletions(-)

diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 322e4e590..f9216d46c 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -19,6 +19,7 @@
 #include "models/mpt.h"
 #include "models/opt.h"
 #include <cassert>
+#include <chrono>
 #include <filesystem>
 #include <string>
 #include <vector>
@@ -264,9 +265,25 @@ void FlexFlow::top_level_task(Task const *task,
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
+  int max_requests_per_batch = 8;
+  int max_tokens_per_batch = 128;
+  int max_tokens_per_ssm_batch = -1;
+  int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
-
-  printf("start top level task\n");
+  int expansion_degree = 3;
+  int max_tree_depth = 8;
+  int max_tree_width = 16;
+  RequestManager::DecodingMode decoding_mode =
+      RequestManager::SPECULATIVE_DECODING;
+  bool spec_sampling = false;
+  bool do_sample = false;
+  int sampling_seed = 0;
+  bool streaming_cache = false;
+  bool slo_attainment_early_termination = false;
+  int baseline_latency_ms = 50;
+  int ssm_spec_latency_ms = 20;
+  int llm_verify_latency_ms = 50;
+  double request_per_second = 1.0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -278,18 +295,44 @@ void FlexFlow::top_level_task(Task const *task,
                    use_full_precision,
                    verbose,
                    max_sequence_length);
+  if (max_tokens_per_ssm_batch == -1) {
+    max_tokens_per_ssm_batch = max_tokens_per_batch;
+  }
+  if (max_tokens_per_prefilling_batch == -1) {
+    max_tokens_per_prefilling_batch = max_tokens_per_batch;
+  }
 
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
+  assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
+             ffconfig.pipeline_parallelism_degree ==
+         ffconfig.numNodes * ffconfig.workersPerNode);
+
   // Create SentencePiece tokenizer or OPT tokenizer
-  GenerationConfig generationConfig(false, 0.8, 0.6, false, 16);
+  srand(sampling_seed);
+  GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16);
   InferenceManager *im = InferenceManager::get_inference_manager();
   RequestManager *rm = RequestManager::get_request_manager();
+  // Must init the request manager although we don't use it, as some
+  // initialization tasks execute before the top-level task
+  rm->set_max_requests_per_batch(max_requests_per_batch);
+  rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
+  rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
+  rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_tree_depth(max_tree_depth);
+  rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
+  rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
                          model_metadata.eos_token_id,
                          model_metadata.llm_tokenizer_path);
+  rm->set_decoding_mode(decoding_mode);
+  rm->set_slo_violation_early_termination(slo_attainment_early_termination);
+  rm->set_baseline_latency(baseline_latency_ms);
+  rm->set_ssm_spec_latency(ssm_spec_latency_ms);
+  rm->set_llm_verify_latency(llm_verify_latency_ms);
 
   {
     /* Prompt file format:
@@ -330,7 +373,6 @@ void FlexFlow::top_level_task(Task const *task,
 
     std::ifstream file_handle(file_paths.prompt_file_path);
     assert(file_handle.good() && "Prompt file does not exist.");
-    printf("prompt file path: %s\n", file_paths.prompt_file_path.c_str());
     json prompt_json = json::parse(file_handle,
                                    /*parser_callback_t */ nullptr,
                                    /*allow_exceptions */ true,
@@ -358,7 +400,6 @@ void FlexFlow::top_level_task(Task const *task,
 
     file_handle = std::ifstream(file_paths.log_file_path);
     assert(file_handle.good() && "Log file does not exist.");
-    printf("log file path: %s\n", file_paths.log_file_path.c_str());
     json log_json = json::parse(file_handle,
                                 /*parser_callback_t */ nullptr,
                                 /*allow_exceptions */ true,
@@ -366,19 +407,43 @@ void FlexFlow::top_level_task(Task const *task,
 
     auto time_diff_ms = [](std::string const &start, std::string const &end) {
       std::tm tm = {};
+
       std::istringstream ss(start);
       ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S");
       auto start_time =
           std::chrono::system_clock::from_time_t(std::mktime(&tm));
+      ss.seekg(0);
+      size_t dot_pos = start.find('.');
+      std::string fraction =
+          dot_pos != std::string::npos ? start.substr(dot_pos + 1) : "0";
+      while (fraction.size() < 6) {
+        fraction += "0";
+      }
+      if (!fraction.empty()) {
+        long long microseconds = std::stoll(fraction.substr(0, 6));
+        start_time += std::chrono::microseconds(microseconds);
+      }
+
       ss = std::istringstream(end);
       ss >> std::get_time(&tm, "%Y-%m-%d %H:%M:%S");
       auto end_time = std::chrono::system_clock::from_time_t(std::mktime(&tm));
-      return std::chrono::duration_cast<std::chrono::milliseconds>(end_time -
+      ss.seekg(0);
+      dot_pos = end.find('.');
+      fraction = dot_pos != std::string::npos ? end.substr(dot_pos + 1) : "0";
+      while (fraction.size() < 6) {
+        fraction += "0";
+      }
+      if (!fraction.empty()) {
+        long long microseconds = std::stoll(fraction.substr(0, 6));
+        end_time += std::chrono::microseconds(microseconds);
+      }
+
+      return std::chrono::duration_cast<std::chrono::microseconds>(end_time -
                                                                    start_time)
-          .count();
+                 .count() /
+             1000.0;
     };
 
-    printf("start trace generation\n");
     int num_requests = min(prompt_json.size() - 1, log_json.size());
     std::string start_time = log_json[0]["TIMESTAMP"].get<std::string>();
     for (int i = 0; i < num_requests; ++i) {

From 18a70ffe91f312f759776a42df3ac043585a8b3a Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 21 Sep 2024 12:41:04 -0700
Subject: [PATCH 502/667] Update nccl (#1507)

* update nccl

* fix

* update

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-7-136.us-east-2.compute.internal>
---
 cmake/nccl.cmake         | 200 +++++++++++++++------------------------
 deps/nccl                |   2 +-
 docker/run.sh            |  12 +--
 tests/inference_tests.sh |   3 -
 4 files changed, 81 insertions(+), 136 deletions(-)

diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake
index c140a44ec..82cf3b412 100644
--- a/cmake/nccl.cmake
+++ b/cmake/nccl.cmake
@@ -2,140 +2,88 @@ set(NCCL_NAME nccl)
 # set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}")
 # message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}")
 
-set(NCCL_URL "")
-if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
-  if(LINUX_VERSION MATCHES "20.04")
-    if (CUDA_VERSION VERSION_EQUAL "11.0")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.1")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.2")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.3")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.4")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.5")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.6")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.7")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz")
-    endif()
-  elseif(LINUX_VERSION MATCHES "18.04")
-    if (CUDA_VERSION VERSION_EQUAL "10.1")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz")
-    elseif (CUDA_VERSION VERSION_EQUAL "10.2")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz")
-    elseif (CUDA_VERSION VERSION_EQUAL "11.0")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.1")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.2")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.3")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.4")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.5")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.6")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz")
-    elseif(CUDA_VERSION VERSION_EQUAL "11.7")
-      set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz")
-    endif()
-  endif()
+if(NCCL_PATH)
+  set(NCCL_ROOT ${NCCL_PATH})
+else()
+  # if NCCL_PATH is not set, let's try to find it in the CUDA root
+  set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
 endif()
 
-if(NCCL_URL)
-  # Download and import pre-compiled NCCL library
-  message(STATUS "Using pre-compiled NCCL library")
-  message(STATUS "NCCL_URL: ${NCCL_URL}")
+find_library(NCCL_LIBRARY
+  NAMES libnccl${LIBEXT}
+  PATHS ${NCCL_ROOT} ${CUDA_ROOT}
+  PATH_SUFFIXES lib lib64
+  DOC "NCCL library." )
 
-  include(FetchContent)
-  FetchContent_Declare(${NCCL_NAME}
-    URL ${NCCL_URL}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-  )
-  FetchContent_GetProperties(${NCCL_NAME})
-  if(NOT ${NCCL_NAME}_POPULATED)
-    FetchContent_Populate(${NCCL_NAME})
-  endif()
-  
-  set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME})
-  set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include)
-  set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib)
-  message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}")
-  add_library(nccl SHARED IMPORTED)
-  set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH})
+find_path(NCCL_INCLUDE_DIR
+  NAMES nccl.h
+  HINTS ${NCCL_ROOT}
+  PATH_SUFFIXES include 
+  DOC "NCCL include directory.")
 
-  list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
-  list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT})
-  install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include)
-  install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
-  
-else()
-  if(NCCL_PATH)
-    set(NCCL_ROOT ${NCCL_PATH})
+# find NCCL, set NCCL lib and include    
+if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
+  set(NCCL_FOUND ON)
+  set(NCCL_LIBRARIES ${NCCL_LIBRARY})
+  set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
+
+  # Check NCCL version
+  if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h")
+    file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES
+         REGEX "#define NCCL_MAJOR [0-9]+" )
+    file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2
+         REGEX "#define NCCL_MINOR [0-9]+" )
+    string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
+    string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
+    set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
+    if(NCCL_VERSION VERSION_LESS 2.23)
+      set(NCCL_OLD TRUE)
+    else()
+      set(NCCL_OLD FALSE)
+    endif()
+    message(STATUS "Found NCCL version: ${NCCL_VERSION}")
   else()
-    # if NCCL_PATH is not set, let's try to find it in the CUDA root
-    set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+    message(WARNING "NCCL header not found, unable to determine version")
+    set(NCCL_OLD TRUE)  # Assume old version if we can't determine
   endif()
-  
-  find_library(NCCL_LIBRARY
-    NAMES libnccl${LIBEXT}
-    PATHS ${NCCL_ROOT} ${CUDA_ROOT}
-    PATH_SUFFIXES lib lib64
-    DOC "NCCL library." )
+endif()
 
-  find_path(NCCL_INCLUDE_DIR
-    NAMES nccl.h
-    HINTS ${NCCL_ROOT}
-    PATH_SUFFIXES include 
-    DOC "NCCL include directory.")
-  
-  # find NCCL, set NCCL lib and include    
-  if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
-    set(NCCL_FOUND ON)
-    set(NCCL_LIBRARIES ${NCCL_LIBRARY})
-    set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
-  endif()
-  
-  # find NCCL
-  if(NCCL_FOUND)
-    list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
-    list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
-    message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
-    message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
-    add_library(nccl SHARED IMPORTED)
-  
-  # Build NCCL from source
-  else()
-    message(STATUS "Building NCCL from source")
-    list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)
-  
-    ExternalProject_Add(${NCCL_NAME}
-      SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
-      PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
-      INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
-      BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
-      INSTALL_COMMAND ""
-      CONFIGURE_COMMAND ""
-      BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}"
-      BUILD_IN_SOURCE 1
-    )
+# find NCCL
+if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0))
+  list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
+  list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
+  message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
+  message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
+  add_library(nccl SHARED IMPORTED)
+
+# Build NCCL from source
+else()
+  message(STATUS "Building NCCL from source")
+  list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)
 
-    ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
-    message(STATUS "NCCL install dir: ${INSTALL_DIR}")
-    list(APPEND FLEXFLOW_INCLUDE_DIRS
-      ${INSTALL_DIR}/include)
-    list(APPEND FLEXFLOW_EXT_LIBRARIES
-      ${INSTALL_DIR}/lib/libnccl${LIBEXT})
-    set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")
-    
-    install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
-    install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
+  set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}")
+  if(DEFINED ENV{MAKEFLAGS})
+    set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD})
   endif()
+  ExternalProject_Add(${NCCL_NAME}
+    SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
+    PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
+    INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
+    BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
+    INSTALL_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ${NCCL_BUILD_CMD}
+    BUILD_IN_SOURCE 1
+  )
 
+  ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
+  message(STATUS "NCCL install dir: ${INSTALL_DIR}")
+  list(APPEND FLEXFLOW_INCLUDE_DIRS
+    ${INSTALL_DIR}/include)
+  list(APPEND FLEXFLOW_EXT_LIBRARIES
+    ${INSTALL_DIR}/lib/libnccl${LIBEXT})
+  set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")
+  
+  install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
+  install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
 endif()
diff --git a/deps/nccl b/deps/nccl
index 6e24ef4e1..2ea4ee94b 160000
--- a/deps/nccl
+++ b/deps/nccl
@@ -1 +1 @@
-Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7
+Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2
diff --git a/docker/run.sh b/docker/run.sh
index 666c8e112..2575150ae 100755
--- a/docker/run.sh
+++ b/docker/run.sh
@@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true}
 gpu_arg=""
 if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi
 
-# Whether to attach inference weights / files (make sure to download the weights first)
-ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false}
 
 # Amount of shared memory to give the Docker container access to
 # If you get a Bus Error, increase this value. If you don't have enough memory
@@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat
   exit 1
 fi
 
-inference_volumes=""
-if $ATTACH_INFERENCE_FILES ; then 
-  inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference";
+hf_token_volume=""
+hf_token_path="$HOME/.cache/huggingface/token"
+if [ -f "$hf_token_path" ]; then
+  # If the token exists, add the volume mount to the Docker command
+  hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token"
 fi
 
-eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
+eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh
index 3334939a1..5fb142282 100755
--- a/tests/inference_tests.sh
+++ b/tests/inference_tests.sh
@@ -25,9 +25,6 @@ fi
 # Clean up before test (just in case)
 cleanup
 
-# Make sure supported version of protobuf is installed
-pip3 install protobuf==3.20.3
-
 # Create test prompt file
 mkdir -p ../inference/prompt
 echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json

From ebd45d365916109876d0e874d51ee51fa569a275 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 22 Sep 2024 18:21:22 -0400
Subject: [PATCH 503/667] speedup docker builds

---
 docker/flexflow-environment/Dockerfile | 38 ++++++++++++++++++--------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile
index 6ca337f58..fc894bcb8 100644
--- a/docker/flexflow-environment/Dockerfile
+++ b/docker/flexflow-environment/Dockerfile
@@ -41,17 +41,38 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
         /opt/conda/bin/conda install conda-build conda-verify && \
         /opt/conda/bin/conda clean -ya
 
-# Optionally install HIP dependencies
+# set MAKEFLAGS to speedup any dependency that uses make
+ARG N_BUILD_CORES
+ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}"
+
+# Set env vars
+ENV PATH /opt/conda/bin:$PATH
+ENV CUDNN_DIR /usr/local/cuda
+ENV CUDA_DIR /usr/local/cuda
+
+# GPU-specific dependencies
+ARG FF_GPU_BACKEND "cuda"
+
+# Update NCCL if FF_GPU_BACKEND is cuda
+RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
+        echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
+        ubuntu_version=$(lsb_release -rs); \
+        ubuntu_version=${ubuntu_version//./}; \
+        wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
+        DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
+        DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
+        rm -f cuda-keyring_1.0-1_all.deb; \
+        DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
+    else \
+        echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
+    fi'
+
+# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm
 # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
 # package attempts to re-install cuda even though cuda is already installed
 # in the container. It also attempts to install packages for a graphical install.
 # For our container, we don't need `hip-runtime-nvidia`
-ARG FF_GPU_BACKEND "cuda"
 ARG hip_version "5.6"
-ARG N_BUILD_CORES
-# set MAKEFLAGS to speedup any dependency that uses make
-ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}"
-
 RUN  if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \
         echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \
         # Check that hip_version is one of 5.3,5.4,5.5,5.6
@@ -82,11 +103,6 @@ RUN  if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]
     fi
 RUN rm -rf /var/lib/apt/lists/*
 
-# Set env vars
-ENV PATH /opt/conda/bin:$PATH
-ENV CUDNN_DIR /usr/local/cuda
-ENV CUDA_DIR /usr/local/cuda
-
 # Install python packages and other dependencies
 RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing
 # Install CPU-only Pytorch and related dependencies

From 347d9ada584fe5ac47e78bd9c68b3be7df5e6784 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 22 Sep 2024 18:23:00 -0400
Subject: [PATCH 504/667] update

---
 docker/flexflow/Dockerfile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile
index 60f9d4d65..dff925965 100644
--- a/docker/flexflow/Dockerfile
+++ b/docker/flexflow/Dockerfile
@@ -27,9 +27,7 @@ RUN for pair in $BUILD_CONFIGS; do \
 # Build and install C++ and Python versions of FlexFlow
 RUN mkdir -p build && cd build && \
     eval "$BUILD_CONFIGS" ../config/config.linux && \
-    make -j $N_BUILD_CORES && \
-    eval "$BUILD_CONFIGS" ../config/config.linux && \
-    make install && \
+    make -j $N_BUILD_CORES install && \
     ldconfig
 
 ENTRYPOINT ["/bin/bash"]

From 62925bbc7e0446a359800929097de2508b6e6948 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 1 Oct 2024 17:03:52 -0700
Subject: [PATCH 505/667] fix: emission time

---
 inference/spec_infer/spec_infer.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index aa5e1693b..35a528be4 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -577,6 +577,8 @@ void FlexFlow::top_level_task(Task const *task,
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }
+      timestamps.erase(timestamps.begin());
+      timestamps.push_back(timestamps.back() + 1000.0);
       TraceEmissionMachine emission_machine(timestamps, ratios);
       results = tree_model.generate(requests, emission_machine);
     } else {

From 2e5db3c6617fe18a0e7ca9832355e9d681cec329 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 1 Oct 2024 19:18:07 -0700
Subject: [PATCH 506/667] feat: trace generator add scaling_factor

---
 inference/trace_generator/trace_generator.cc | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index f9216d46c..6f9214ce4 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -62,7 +62,8 @@ void parse_input_args(char **argv,
                       ModelNames &model_names,
                       bool &use_full_precision,
                       bool &verbose,
-                      int &max_sequence_length) {
+                      int &max_sequence_length,
+                      double &scaling_factor) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -113,6 +114,10 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--scaling-factor")) {
+      scaling_factor = std::stod(argv[++i]);
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -265,11 +270,13 @@ void FlexFlow::top_level_task(Task const *task,
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
+  int max_sequence_length = 256;
+  double scaling_factor = 1.0;
+
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
   int max_tokens_per_ssm_batch = -1;
   int max_tokens_per_prefilling_batch = -1;
-  int max_sequence_length = 256;
   int expansion_degree = 3;
   int max_tree_depth = 8;
   int max_tree_width = 16;
@@ -294,7 +301,8 @@ void FlexFlow::top_level_task(Task const *task,
                    model_metadata.model_names,
                    use_full_precision,
                    verbose,
-                   max_sequence_length);
+                   max_sequence_length,
+                   scaling_factor);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -454,7 +462,7 @@ void FlexFlow::top_level_task(Task const *task,
                           input_tokens.size(),
                           max_sequence_length,
                           emission_machine.sample_slo_ratio(),
-                          time_diff_ms(start_time, timestamp));
+                          time_diff_ms(start_time, timestamp) * scaling_factor);
       traces.push_back(trace);
     }
 

From a17ec6e5c77c9a73bae59f77bbfe16b730e796b1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 2 Oct 2024 17:50:49 -0700
Subject: [PATCH 507/667] feat: add old_scheduler option

---
 inference/spec_infer/spec_infer.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 35a528be4..1bdcf72da 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -79,6 +79,7 @@ void parse_input_args(char **argv,
                       int &ssm_spec_latency_ms,
                       int &llm_verify_latency_ms,
                       double &request_per_second,
+                      bool &spec_infer_old_version,
                       std::string &emission_file_path) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -196,6 +197,10 @@ void parse_input_args(char **argv,
       request_per_second = std::stod(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--spec-infer-old-version")) {
+      spec_infer_old_version = true;
+      continue;
+    }
     if (!strcmp(argv[i], "--emission-file-path")) {
       emission_file_path = std::string(argv[++i]);
       continue;
@@ -371,6 +376,7 @@ void FlexFlow::top_level_task(Task const *task,
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
+  bool spec_infer_old_version = false;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -399,6 +405,7 @@ void FlexFlow::top_level_task(Task const *task,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
                    request_per_second,
+                   spec_infer_old_version,
                    emission_file_path);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
@@ -437,6 +444,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_ssm_spec_latency(ssm_spec_latency_ms);
   rm->set_llm_verify_latency(llm_verify_latency_ms);
   rm->register_output_filepath(file_paths.output_file_path);
+  rm->set_spec_infer_old_version(spec_infer_old_version);
 
   // Create LLM model
   FFModel tree_model(ffconfig, ffconfig.cpu_offload);

From efead4fade2d9f02b45935a92101d54993f439b7 Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Mon, 12 Aug 2024 11:02:49 -0700
Subject: [PATCH 508/667] feat: cherry-pick
 https://github.com/flexflow/FlexFlow/commit/9784b5c6516bafe272fc6555daaa9b867a5eacfa

---
 CMakeLists.txt                               | 2 +-
 deps/legion                                  | 2 +-
 examples/cpp/AlexNet/alexnet.cc              | 2 +-
 examples/cpp/DLRM/dlrm.cc                    | 2 +-
 examples/cpp/InceptionV3/inception.cc        | 2 +-
 examples/cpp/ResNet/resnet.cc                | 2 +-
 examples/cpp/Transformer/transformer.cc      | 2 +-
 examples/cpp/XDL/xdl.cc                      | 2 +-
 examples/cpp/candle_uno/candle_uno.cc        | 2 +-
 examples/cpp/mixture_of_experts/moe.cc       | 2 +-
 examples/cpp/resnext50/resnext.cc            | 2 +-
 examples/cpp/split_test/split_test.cc        | 2 +-
 examples/cpp/split_test_2/split_test_2.cc    | 2 +-
 include/flexflow/graph.h                     | 2 +-
 include/flexflow/operator.h                  | 4 +++-
 include/flexflow/utils/recursive_logger.h    | 4 ++--
 inference/incr_decoding/incr_decoding.cc     | 2 +-
 inference/spec_infer/spec_infer.cc           | 2 +-
 src/mapper/mapper.cc                         | 7 ++++++-
 src/ops/inc_multihead_self_attention.cc      | 2 +-
 src/ops/tree_inc_multihead_self_attention.cc | 2 +-
 src/runtime/batch_config.cc                  | 2 +-
 src/runtime/graph.cc                         | 4 ++--
 src/runtime/inference_manager.cc             | 4 ++--
 src/runtime/model.cc                         | 6 ++++--
 src/runtime/optimizer_kernel.cpp             | 4 ++--
 src/runtime/optimizer_kernel.cu              | 2 +-
 src/runtime/request_manager.cc               | 2 +-
 src/runtime/simulator.cc                     | 8 ++++----
 src/runtime/substitution.cc                  | 4 ++--
 tests/ops/batch_matmul_test.cc               | 2 +-
 tests/ops/concat_test.cc                     | 2 +-
 tests/ops/flat_test.cc                       | 2 +-
 tests/ops/linear_test.cc                     | 2 +-
 tests/ops/reshape_test.cc                    | 2 +-
 tests/ops/tanh_test.cc                       | 2 +-
 tests/ops/transpose_test.cc                  | 2 +-
 37 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbe817c1f..577a2215d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,7 +448,7 @@ if(NOT BUILD_LEGION_ONLY)
       # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
       add_custom_command(TARGET flexflow
         POST_BUILD	
-        COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+        COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
       )
       # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
diff --git a/deps/legion b/deps/legion
index 24e8c4523..02eb1010c 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835
+Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0
diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc
index 128496eab..350788232 100644
--- a/examples/cpp/AlexNet/alexnet.cc
+++ b/examples/cpp/AlexNet/alexnet.cc
@@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor;
 using FlexFlow::SGDOptimizer;
 using FlexFlow::Tensor;
 
-LegionRuntime::Logger::Category log_app("AlexNet");
+Legion::Logger log_app("AlexNet");
 
 void parse_input_args(char **argv, int argc, AlexNetConfig &config) {
   for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc
index 7dc49215b..d7dc16755 100644
--- a/examples/cpp/DLRM/dlrm.cc
+++ b/examples/cpp/DLRM/dlrm.cc
@@ -19,7 +19,7 @@
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_app("DLRM");
+Legion::Logger log_app("DLRM");
 
 void parse_input_args(char **argv, int argc, DLRMConfig &apConfig);
 
diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc
index b2070cc52..6d0fa7ee5 100644
--- a/examples/cpp/InceptionV3/inception.cc
+++ b/examples/cpp/InceptionV3/inception.cc
@@ -21,7 +21,7 @@
 using namespace Legion;
 using namespace FlexFlow;
 
-LegionRuntime::Logger::Category log_app("Inceptionv3");
+Legion::Logger log_app("Inceptionv3");
 
 Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) {
   Tensor t1 = input;
diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc
index 455eb743a..49ce934a6 100644
--- a/examples/cpp/ResNet/resnet.cc
+++ b/examples/cpp/ResNet/resnet.cc
@@ -24,7 +24,7 @@ using FlexFlow::Optimizer;
 using FlexFlow::SGDOptimizer;
 using FlexFlow::Tensor;
 
-LegionRuntime::Logger::Category log_app("ResNet");
+Legion::Logger log_app("ResNet");
 
 void parse_input_args(char **argv, int argc, ResNetConfig &config) {
   for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc
index d61a63cd0..b04093b0a 100644
--- a/examples/cpp/Transformer/transformer.cc
+++ b/examples/cpp/Transformer/transformer.cc
@@ -17,7 +17,7 @@
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_app("Transformer");
+Legion::Logger log_app("Transformer");
 
 Tensor create_emb(FFModel *model,
                   Tensor const &input,
diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc
index 2e6c3cec9..a2272f36e 100644
--- a/examples/cpp/XDL/xdl.cc
+++ b/examples/cpp/XDL/xdl.cc
@@ -18,7 +18,7 @@
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_app("XDL");
+Legion::Logger log_app("XDL");
 
 void parse_input_args(char **argv, int argc, XDLConfig &apConfig);
 
diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc
index 779b8e9c1..e9f4bf876 100644
--- a/examples/cpp/candle_uno/candle_uno.cc
+++ b/examples/cpp/candle_uno/candle_uno.cc
@@ -21,7 +21,7 @@
 using namespace Legion;
 using namespace std;
 
-LegionRuntime::Logger::Category log_app("Candle_Uno");
+Legion::Logger log_app("Candle_Uno");
 
 void parse_input_args(char **argv, int argc, CandleConfig &apConfig);
 
diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc
index a70731088..a25f94abd 100644
--- a/examples/cpp/mixture_of_experts/moe.cc
+++ b/examples/cpp/mixture_of_experts/moe.cc
@@ -20,7 +20,7 @@
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_app("MoE");
+Legion::Logger log_app("MoE");
 
 void parse_input_args(char **argv, int argc, MoeConfig &config) {
   for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc
index 3c28ca27b..9b71b37cc 100644
--- a/examples/cpp/resnext50/resnext.cc
+++ b/examples/cpp/resnext50/resnext.cc
@@ -7,7 +7,7 @@ using FlexFlow::Optimizer;
 using FlexFlow::SGDOptimizer;
 using FlexFlow::Tensor;
 
-LegionRuntime::Logger::Category log_app("resnext");
+Legion::Logger log_app("resnext");
 
 Tensor resnext_block(FFModel &ff,
                      Tensor input,
diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc
index 97b98c321..ac9d516a5 100644
--- a/examples/cpp/split_test/split_test.cc
+++ b/examples/cpp/split_test/split_test.cc
@@ -3,7 +3,7 @@
 using namespace Legion;
 using namespace FlexFlow;
 
-LegionRuntime::Logger::Category log_app("split_test");
+Legion::Logger log_app("split_test");
 
 void FlexFlow::top_level_task(Task const *task,
                               std::vector<PhysicalRegion> const &regions,
diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc
index 69385d14c..fef078adb 100644
--- a/examples/cpp/split_test_2/split_test_2.cc
+++ b/examples/cpp/split_test_2/split_test_2.cc
@@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph;
 using FlexFlow::PCG::GraphSearchHelper;
 using FlexFlow::PCG::Node;
 
-LegionRuntime::Logger::Category log_app("split_test_2");
+Legion::Logger log_app("split_test_2");
 
 void top_level_task(Task const *task,
                     std::vector<PhysicalRegion> const &regions,
diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h
index 2e0cf1ca4..9dc657259 100644
--- a/include/flexflow/graph.h
+++ b/include/flexflow/graph.h
@@ -24,7 +24,7 @@
 #include "legion/legion_utilities.h"
 #include <unordered_set>
 
-extern LegionRuntime::Logger::Category log_dp;
+extern Legion::Logger log_dp;
 
 namespace FlexFlow::PCG {
 
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 1b19bdb82..311699d92 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -19,7 +19,7 @@
 
 namespace FlexFlow {
 
-extern LegionRuntime::Logger::Category log_measure;
+extern Legion::Logger log_measure;
 
 class OpMeta;
 class Simulator;
@@ -233,6 +233,8 @@ class Op {
                                       std::vector<ParallelTensor> const &,
                                       MachineView const *mv = nullptr) {
     assert(false);
+    Legion::FutureMap empty_map;
+    return empty_map;
   };
   virtual void print_layer(FFModel const &model) = 0;
   template <typename OpMetaType>
diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h
index 2c43b4230..d073f58f3 100644
--- a/include/flexflow/utils/recursive_logger.h
+++ b/include/flexflow/utils/recursive_logger.h
@@ -26,7 +26,7 @@ class DepthTag {
 
 class RecursiveLogger {
 public:
-  /* RecursiveLogger(LegionRuntime::Logger::Category const &); */
+  /* RecursiveLogger(Legion::Logger const &); */
   RecursiveLogger(std::string const &category_name);
 
   Realm::LoggerMessage info();
@@ -42,7 +42,7 @@ class RecursiveLogger {
 
   void print_prefix(Realm::LoggerMessage &) const;
 
-  LegionRuntime::Logger::Category logger;
+  Legion::Logger logger;
 };
 
 };     // namespace FlexFlow
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index e289ae4ce..0669d2aeb 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -27,7 +27,7 @@ using namespace FlexFlow;
 using namespace Legion;
 using json = nlohmann::json;
 
-LegionRuntime::Logger::Category log_app("llama");
+Legion::Logger log_app("llama");
 
 struct FilePaths {
   std::string cache_folder_path;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 1bdcf72da..f5b9eb6de 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -27,7 +27,7 @@ using namespace FlexFlow;
 using namespace Legion;
 using json = nlohmann::json;
 
-LegionRuntime::Logger::Category log_app("llama");
+Legion::Logger log_app("llama");
 
 struct FilePaths {
   std::string cache_folder_path;
diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index b83907f4d..037ad1819 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -20,7 +20,7 @@ namespace FlexFlow {
 using namespace Legion;
 using namespace Mapping;
 
-LegionRuntime::Logger::Category log_ff_mapper("Mapper");
+Legion::Logger log_ff_mapper("Mapper");
 
 FFShardingFunctor::FFShardingFunctor(int _gpus_per_node,
                                      int _cpus_per_node,
@@ -297,6 +297,7 @@ void FFMapper::select_task_options(MapperContext const ctx,
     // control replicate top level task
     if (enable_control_replication) {
       output.replicate = true;
+      output.map_locally = false;
     }
     return;
   }
@@ -561,6 +562,10 @@ void FFMapper::map_task(MapperContext const ctx,
       assert(output.target_procs[i].address_space() == node_id);
     }
   }
+  if (input.shard_processor.exists()) {
+    output.target_procs = std::vector<Processor>{input.shard_processor};
+  }
+
   // Find instances that still need to be mapped
   std::vector<std::set<FieldID>> missing_fields(task.regions.size());
   runtime->filter_instances(ctx,
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 54d71ea0b..d55473231 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -46,7 +46,7 @@ using Legion::TaskArgument;
 using Legion::TaskLauncher;
 using PCG::Node;
 
-LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA");
+Legion::Logger log_inc_mha("IncrementalMHA");
 
 bool IncMultiHeadSelfAttentionParams::is_valid(
     ParallelTensorShape const &input) const {
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 4e00bf0ef..331b2faf6 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -46,7 +46,7 @@ using Legion::TaskArgument;
 using Legion::TaskLauncher;
 using PCG::Node;
 
-LegionRuntime::Logger::Category log_tree_verify("BatchConfig");
+Legion::Logger log_tree_verify("TreeVerifyIncMHA");
 
 bool TreeIncMultiHeadSelfAttentionParams::is_valid(
     ParallelTensorShape const &input) const {
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 5981c6ce9..5cd4135d2 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -22,7 +22,7 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_bc("BatchConfig");
+Legion::Logger log_bc("BatchConfig");
 using Legion::Future;
 using Legion::Memory;
 
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index ca8e51d40..299330c9e 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -67,8 +67,8 @@ namespace FlexFlow::PCG {
 using namespace Legion;
 using FlexFlow::MachineView;
 
-LegionRuntime::Logger::Category log_graph("graph");
-LegionRuntime::Logger::Category log_simplify("graph_simplify");
+Legion::Logger log_graph("graph");
+Legion::Logger log_simplify("graph_simplify");
 
 Node const Node::INVALID_NODE = Node();
 
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 28ec523b9..da650c00c 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -29,8 +29,8 @@ namespace FlexFlow {
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_inf_mgr("InferenceManager");
-LegionRuntime::Logger::Category log_offload("Offloading");
+Legion::Logger log_inf_mgr("InferenceManager");
+Legion::Logger log_offload("Offloading");
 
 InferenceManager::InferenceManager() {}
 
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index fe4a8d4af..33cd44cd9 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -83,8 +83,8 @@ namespace FlexFlow {
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_model("Model");
-LegionRuntime::Logger::Category log_measure("measure");
+Legion::Logger log_model("Model");
+Legion::Logger log_measure("measure");
 
 Op::Op(FFModel &model,
        OperatorType otype,
@@ -6779,6 +6779,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<SGDOptimizer::nccl_update_task>(
           registrar, "SGD NCCL Update Task");
@@ -6929,6 +6930,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
                                    "NCCL Init Communicators");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<ncclComm_t, Op::init_nccl_comms_task>(
           registrar, "NCCL Init Communicators Task");
diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp
index e71adc87a..59efaf525 100644
--- a/src/runtime/optimizer_kernel.cpp
+++ b/src/runtime/optimizer_kernel.cpp
@@ -21,7 +21,7 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_optimizer("optimizer");
+Legion::Logger log_optimizer("optimizer");
 
 __global__ void sgd_update(size_t count,
                            float lr,
@@ -247,4 +247,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
 }
 #endif
 
-}; // namespace FlexFlow
\ No newline at end of file
+}; // namespace FlexFlow
diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu
index 5f654fbb5..df37e3b13 100644
--- a/src/runtime/optimizer_kernel.cu
+++ b/src/runtime/optimizer_kernel.cu
@@ -20,7 +20,7 @@
 
 namespace FlexFlow {
 
-LegionRuntime::Logger::Category log_optimizer("optimizer");
+Legion::Logger log_optimizer("optimizer");
 
 __global__ void sgd_update(size_t count,
                            float lr,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7d8a84596..6484b6855 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -35,7 +35,7 @@ namespace FlexFlow {
 using namespace Legion;
 using tokenizers::Tokenizer;
 
-LegionRuntime::Logger::Category log_req_mgr("RequestManager");
+Legion::Logger log_req_mgr("RequestManager");
 
 bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
                std::shared_ptr<TokenTreeNode> const &rhs) {
diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc
index d94337641..b71af0d47 100644
--- a/src/runtime/simulator.cc
+++ b/src/runtime/simulator.cc
@@ -31,10 +31,10 @@ namespace FlexFlow {
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_sim("sim");
-LegionRuntime::Logger::Category log_ps_sim("ps_sim");
-LegionRuntime::Logger::Category log_xfer_sim("xfer_sim");
-LegionRuntime::Logger::Category log_xfer_est("xfer_est");
+Legion::Logger log_sim("sim");
+Legion::Logger log_ps_sim("ps_sim");
+Legion::Logger log_xfer_sim("xfer_sim");
+Legion::Logger log_xfer_est("xfer_est");
 
 // template class std::map<const Op*, ParallelConfig>; // for debugging in gdb
 // template class std::map<const Op*, MachineView>; // for debugging in gdb
diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc
index c0804d6e1..b86964049 100644
--- a/src/runtime/substitution.cc
+++ b/src/runtime/substitution.cc
@@ -54,8 +54,8 @@ namespace FlexFlow::PCG {
 
 using namespace Legion;
 
-LegionRuntime::Logger::Category log_xfers("xfers");
-LegionRuntime::Logger::Category log_xfer_matches("xfer_matches");
+Legion::Logger log_xfers("xfers");
+Legion::Logger log_xfer_matches("xfer_matches");
 
 const TensorX TensorX::NO_TX = TensorX();
 
diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc
index 7931f4412..f61048feb 100644
--- a/tests/ops/batch_matmul_test.cc
+++ b/tests/ops/batch_matmul_test.cc
@@ -5,7 +5,7 @@
 #include <iostream>
 #include <sstream>
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("bmm_test");
+Legion::Logger log_app("bmm_test");
 
 struct BMMTestMeta {
   int m, k, n, d;
diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc
index c67b718e0..b0489d1ad 100644
--- a/tests/ops/concat_test.cc
+++ b/tests/ops/concat_test.cc
@@ -5,7 +5,7 @@
 #include <iostream>
 #include <sstream>
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("concat_test");
+Legion::Logger log_app("concat_test");
 
 struct ConcatTestMeta {
   int batch_size, i_dim, num_channels, projected_num_channels,
diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc
index 428893a0d..61de83b6b 100644
--- a/tests/ops/flat_test.cc
+++ b/tests/ops/flat_test.cc
@@ -7,7 +7,7 @@
 #include <sstream>
 
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("Flat_test");
+Legion::Logger log_app("Flat_test");
 
 struct FlatTestMeta {
   int i_dim, o_dim;
diff --git a/tests/ops/linear_test.cc b/tests/ops/linear_test.cc
index 5b65de3a5..7c84ad107 100644
--- a/tests/ops/linear_test.cc
+++ b/tests/ops/linear_test.cc
@@ -5,7 +5,7 @@
 #include <iostream>
 #include <sstream>
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("linear_test");
+Legion::Logger log_app("linear_test");
 
 struct LinearTestMeta {
   int batch_size, i_dim, num_channels, dense_projection_o_dim,
diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc
index e8f4586b2..a8aa046a6 100644
--- a/tests/ops/reshape_test.cc
+++ b/tests/ops/reshape_test.cc
@@ -6,7 +6,7 @@
 #include <sstream>
 #define PRECISION 16
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("Reshape_test");
+Legion::Logger log_app("Reshape_test");
 
 struct ReshapeTestMeta {
   int i_dim, o_dim;
diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc
index 1c24d96aa..1e86934f8 100644
--- a/tests/ops/tanh_test.cc
+++ b/tests/ops/tanh_test.cc
@@ -6,7 +6,7 @@
 #include <sstream>
 #define PRECISION 16
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("Tanh_test");
+Legion::Logger log_app("Tanh_test");
 
 struct TanhTestMeta {
   int i_dim, o_dim;
diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc
index 10481aa14..045f28479 100644
--- a/tests/ops/transpose_test.cc
+++ b/tests/ops/transpose_test.cc
@@ -5,7 +5,7 @@
 #include <iostream>
 #include <sstream>
 using namespace Legion;
-LegionRuntime::Logger::Category log_app("transpose_test");
+Legion::Logger log_app("transpose_test");
 
 struct TransposeTestMeta {
   int m, k, d;

From 285696e75f2023203a861b71ea131f1b70d6db08 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Thu, 29 Aug 2024 00:04:28 +0200
Subject: [PATCH 509/667] update legion version

---
 deps/legion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/legion b/deps/legion
index 02eb1010c..0d32b3554 160000
--- a/deps/legion
+++ b/deps/legion
@@ -1 +1 @@
-Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0
+Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b

From de55a2eb5e4be3895d2ea4166fabdd1049c09150 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 31 Aug 2024 06:00:57 -0700
Subject: [PATCH 510/667] Fix nccl-induced segfault (#1481)

---
 include/flexflow/model.h       |  1 +
 src/runtime/model.cc           | 68 ++++++++++++++++++----------------
 src/runtime/request_manager.cc |  3 ++
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 854d27ffc..226105a12 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -1097,6 +1097,7 @@ class FFModel {
                      bool use_propagation) const;
 #ifdef FF_USE_NCCL
   ncclComm_t *find_nccl_comms(MachineView const &view) const;
+  void finish_nccl_comms();
 #endif
 #ifdef FF_USE_PROPAGATE
   void propagate(std::map<Op *, ParallelConfig> const &current,
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 33cd44cd9..7e0fb2512 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -1616,41 +1616,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload)
   model_id = model_counter++;
 }
 
+#ifdef FF_USE_NCCL
+void FFModel::finish_nccl_comms() {
+  Context ctx = config.lg_ctx;
+  Runtime *runtime = config.lg_hlr;
+  for (auto const &comm : view_hash_to_nccl_comms) {
+    // Find the machine view that has the hash
+    MachineView view;
+    for (size_t l = 0; l < operators.size(); l++) {
+      view = operators[l]->outputs[0]->machine_view;
+      if (view.hash() == comm.first) {
+        break;
+      }
+    }
+    assert(view.hash() == comm.first && "Cannot find the machine view");
+    IndexSpace task_is = get_or_create_task_is(view);
+    Domain domain = runtime->get_index_space_domain(ctx, task_is);
+    ArgumentMap argmap;
+    int idx = 0;
+    for (Domain::DomainPointIterator it(domain); it; it++, idx++) {
+      argmap.set_point(*it,
+                       TaskArgument(&comm.second[idx], sizeof(ncclComm_t)));
+    }
+    IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID,
+                                 task_is,
+                                 TaskArgument(nullptr, 0),
+                                 argmap,
+                                 Predicate::TRUE_PRED,
+                                 false /*must*/,
+                                 0 /*mapper_id*/,
+                                 comm.first);
+    FutureMap fm = runtime->execute_index_space(ctx, index_launcher);
+    fm.wait_all_results();
+  }
+}
+#endif
+
 FFModel::~FFModel() {
   // Destroy nccl communication groups
 #ifdef FF_USE_NCCL
   if (config.computationMode == COMP_MODE_TRAINING) {
-    Context ctx = config.lg_ctx;
-    Runtime *runtime = config.lg_hlr;
-    for (auto const &comm : view_hash_to_nccl_comms) {
-      // Find the machine view that has the hash
-      MachineView view;
-      for (size_t l = 0; l < operators.size(); l++) {
-        view = operators[l]->outputs[0]->machine_view;
-        if (view.hash() == comm.first) {
-          break;
-        }
-      }
-      assert(view.hash() == comm.first && "Cannot find the machine view");
-      IndexSpace task_is = get_or_create_task_is(view);
-      Domain domain = runtime->get_index_space_domain(ctx, task_is);
-      ArgumentMap argmap;
-      int idx = 0;
-      for (Domain::DomainPointIterator it(domain); it; it++, idx++) {
-        argmap.set_point(*it,
-                         TaskArgument(&comm.second[idx], sizeof(ncclComm_t)));
-      }
-      IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID,
-                                   task_is,
-                                   TaskArgument(nullptr, 0),
-                                   argmap,
-                                   Predicate::TRUE_PRED,
-                                   false /*must*/,
-                                   0 /*mapper_id*/,
-                                   comm.first);
-      FutureMap fm = runtime->execute_index_space(ctx, index_launcher);
-      fm.wait_all_results();
-    }
+    finish_nccl_comms();
   }
 #endif
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6484b6855..18919ae67 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2258,6 +2258,9 @@ void RequestManager::background_serving_task(
     // Registered SSMs: perform speculative inference
     rm->serve_spec_infer(llm);
   }
+#ifdef FF_USE_NCCL
+  llm->finish_nccl_comms();
+#endif
 }
 
 /*static*/

From 71d8a7b4bd3b6cec497b8e26be1509158e044d67 Mon Sep 17 00:00:00 2001
From: Qinghan Chen <qinghanc@andrew.cmu.edu>
Date: Thu, 3 Oct 2024 15:51:50 -0400
Subject: [PATCH 511/667] add page_manager and request_manager functions

---
 include/page_manager.h         | 141 ++++++++++++++++++++
 src/runtime/page_manager.cc    | 227 +++++++++++++++++++++++++++++++++
 src/runtime/request_manager.cc |  42 ++++++
 3 files changed, 410 insertions(+)
 create mode 100644 include/page_manager.h
 create mode 100644 src/runtime/page_manager.cc

diff --git a/include/page_manager.h b/include/page_manager.h
new file mode 100644
index 000000000..6266923dc
--- /dev/null
+++ b/include/page_manager.h
@@ -0,0 +1,141 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "flexflow/batch_config.h"
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/config.h"
+#include "flexflow/utils/file_loader.h"
+#include <future>
+#include <mutex>
+#include <tokenizers_cpp.h>
+#include <deque>
+
+namespace FlexFlow {
+
+using TokenId = BatchConfig::TokenId;
+
+/**
+ * @class LogicalTokenBlock
+ * @brief A class to represent a logical block of tokens similar to virtual memory address
+ */
+class LogicalTokenBlock {
+public:
+    using TokenId = BatchConfig::TokenId;
+    // Constructor
+    LogicalTokenBlock(int block_number, uint32_t block_size);
+
+    // Method to check if the block is empty
+    bool is_empty() const;
+
+    // Method to get the number of empty slots
+    int get_num_empty_slots() const;
+
+    // Method to get the number of allocated slots
+    int get_num_alloc_slots();
+
+    // Method to check if the block is full
+    bool is_full() const;
+
+    // Method to append tokens
+    void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
+
+    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
+    void reset_num_spec_tokens();
+
+    std::vector<TokenId> get_token_ids() const;
+
+    int block_number; // the index of the logical token block
+    uint32_t block_size; // the size of the block
+    int num_tokens; // the number of tokens currently stored in the block
+    int num_commit_tokens; // the number of tokens inside this block that are already committed
+    int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
+
+    std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
+};
+
+/**
+ * @class PhysicalTokenBlock
+ * @brief A class to represent a physical block of tokens similar to physical memory address
+ * It keeps track of the location of the tokens stored on GPU memory
+ */
+class PhysicalTokenBlock {
+public:
+    // Constructor
+    PhysicalTokenBlock(int block_number, uint32_t block_size);
+
+    int ref_count; // reference count
+    int block_number; // the index of the physical token block
+    uint32_t block_size; // the size of the block
+};
+
+/**
+ * @class BlockAllocator
+ * @brief A Block Manager that is reponsible for maintaining a pool of free blocks
+ */
+class BlockAllocator {
+public:
+    // Constructor
+    BlockAllocator(uint32_t block_size, int num_blocks);
+
+    // Allocate a block
+    PhysicalTokenBlock allocate();
+
+    // Free a block
+    void free(PhysicalTokenBlock& block);
+
+    // Get the number of free blocks
+    size_t get_num_free_blocks() const;
+
+private:
+    uint32_t block_size;
+    int num_blocks;
+    std::deque<PhysicalTokenBlock> free_blocks;
+};
+
+/*
+* @class PageManager
+* @brief A wrapper class that manages the kv cache allocation status
+* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
+*/
+class PageManager {
+public:
+    // Get the singleton instance of the PageManager as it will be shared in multiple places
+    static PageManager *get_page_manager();
+    using BlockTable = std::vector<PhysicalTokenBlock>;
+    using RequestGuid = BatchConfig::RequestGuid;
+    PageManager(uint32_t block_size, int num_total_blocks);
+
+    // Prefill the block with the given token ids at the llm prefilling stage
+    bool prefill(const RequestGuid& request_guid, const std::vector<int>& token_ids);
+    bool allocate(const RequestGuid& request_guid);
+    void free(const RequestGuid& request_guid);
+
+    size_t get_num_free_blocks() const;
+    std::vector<int32_t> get_block_table_indices(const RequestGuid& request_guid) const;
+    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
+
+    void erase_last_pages(const RequestGuid& request_guid, int num_pages);
+
+private:
+    uint32_t block_size; // the size of the block
+    int num_total_blocks; // the total number of blocks
+    BlockAllocator block_allocator;
+    std::unordered_map<int, BlockTable> block_tables;
+};
+
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
new file mode 100644
index 000000000..08819b6f0
--- /dev/null
+++ b/src/runtime/page_manager.cc
@@ -0,0 +1,227 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/page_manager.h"
+
+namespace FlexFlow {
+
+// For all runtime functions, they share a single page manager for pages information
+PageManager *page_manager_singleton = nullptr;
+
+LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size)
+    : block_number(block_number), block_size(block_size), num_tokens(0), num_commit_tokens(0), num_spec_tokens(0) {
+    }
+
+bool LogicalTokenBlock::is_empty() const {
+    assert(num_spec_tokens == 0 && num_commit_tokens == 0);
+    assert(num_tokens <= block_size);
+    return num_tokens == 0;
+}
+
+int LogicalTokenBlock::get_num_empty_slots() const {
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+    return block_size - num_tokens;
+}
+
+int LogicalTokenBlock::get_num_alloc_slots() {
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+    return num_tokens;
+}
+
+bool LogicalTokenBlock::is_full() const {
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+    return num_tokens == block_size;
+}
+
+void LogicalTokenBlock::reset_num_spec_tokens(){
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+
+    num_tokens -= num_spec_tokens;
+    num_spec_tokens = 0;
+
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+}
+
+void LogicalTokenBlock::append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed) {
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+    if (num_tokens + token_ids_to_append.size() > block_size) {
+        throw std::runtime_error("Block is full! Cannot append more tokens.");
+    }
+    token_ids.insert(token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end());
+    num_tokens += token_ids_to_append.size();
+    if (committed) {
+        num_commit_tokens += token_ids_to_append.size();
+    }else{
+        num_spec_tokens += token_ids_to_append.size();
+    }
+    assert(num_spec_tokens + num_commit_tokens == num_tokens);
+    assert(num_tokens <= block_size);
+}
+
+std::vector<TokenId> LogicalTokenBlock::get_token_ids() const {
+    return token_ids;
+}
+
+PhysicalTokenBlock::PhysicalTokenBlock(int block_number, uint32_t block_size)
+    : block_number(block_number), block_size(block_size), ref_count(0) {}
+
+BlockAllocator::BlockAllocator(uint32_t block_size, int num_total_blocks) {
+    for (int block_number = 0; block_number < num_total_blocks; ++block_number) {
+        free_blocks.push_back(PhysicalTokenBlock(block_number, block_size));
+    }
+    num_blocks = num_total_blocks;
+}
+
+// Allocate a block
+PhysicalTokenBlock BlockAllocator::allocate() {
+    if (free_blocks.empty()) {
+        throw std::runtime_error("Out of memory! No free blocks are available.");
+    }
+    PhysicalTokenBlock block = free_blocks.front();
+    free_blocks.pop_front();
+    block.ref_count = 1;
+    num_blocks -= 1;
+    return block;
+}
+
+// Free a block
+void BlockAllocator::free(PhysicalTokenBlock& block) {
+    if (block.ref_count == 0) {
+        throw std::runtime_error("Double free! Block is already freed.");
+    }
+    block.ref_count -= 1;
+    if (block.ref_count == 0) {
+        free_blocks.push_back(block);
+        num_blocks += 1;
+    }
+}
+
+size_t BlockAllocator::get_num_free_blocks() const {
+    assert(free_blocks.size() <= static_cast<size_t>(num_blocks));
+    if (free_blocks.size() > static_cast<size_t>(num_blocks)) {
+        std::cerr << "num free blocks: " << free_blocks.size() << std::endl;
+        std::cerr << "num total blocks: " << num_blocks << std::endl;
+        throw std::runtime_error("Number of free blocks exceeds the total number of blocks.");
+    }
+    return free_blocks.size();
+}
+
+PageManager::PageManager(uint32_t block_size, int num_total_blocks)
+    : block_size(block_size), num_total_blocks(num_total_blocks),
+      block_allocator(block_size, num_total_blocks) {}
+
+bool PageManager::prefill(const RequestGuid& request_guid, const std::vector<TokenId>& token_ids) {
+    BlockTable block_table;
+    for (size_t logical_idx = 0; logical_idx < token_ids.size(); logical_idx++) {
+        PhysicalTokenBlock block = block_allocator.allocate();
+        block_table.push_back(block);
+    }
+
+    block_tables[request_guid] = block_table;
+    return true;
+}
+
+bool PageManager::can_allocate(const RequestGuid& request_guid) const {
+    int num_free_gpu_blocks = block_allocator.get_num_free_blocks();
+    return num_free_gpu_blocks > 0;
+}
+
+bool PageManager::allocate(const RequestGuid& request_guid) {
+    // This is the prefilling for a request
+    if (!can_allocate(request_guid)) {
+        assert(false);
+    }
+    BlockTable& block_table = block_tables[request_guid];
+
+    PhysicalTokenBlock block = block_allocator.allocate();
+    block_table.push_back(block);;
+    return true;
+}
+
+void PageManager::_free_block_table(BlockTable& block_table) {
+    for (auto& block : block_table) {
+            block_allocator.free(block);
+    } 
+}
+
+void PageManager::free(const RequestGuid& request_guid) {
+    assert(block_tables.find(request_guid) != block_tables.end());
+    auto& block_table = block_tables[request_guid];
+    _free_block_table(block_table);
+}
+
+size_t PageManager::get_num_free_blocks() const {
+    return block_allocator.get_num_free_blocks();
+}
+
+std::vector<int32_t> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
+    std::vector<int32_t> indices;
+    try {
+    const auto& block_table = block_tables.at(request_guid);
+    for (const auto& block : block_table) {
+        // printf("get block indice block number is: %d\n", block.block_number);
+        indices.push_back(block.block_number);
+    }
+    } catch (const std::out_of_range& e) {
+        std::cerr << "Request GUID not found in block tables: " << e.what() << std::endl;
+        // Handle error appropriately
+        std::cout << "request ID is: " << request_guid << std::endl;
+        exit(1);
+    }
+    return indices;
+}
+
+int PageManager::get_num_allocated_blocks(const RequestGuid& request_guid) const {
+    auto it = block_tables.find(request_guid);
+    if (it == block_tables.end()) {
+        return 0;
+    }else{
+        return it->second.size();
+    }
+}
+
+void PageManager::erase_last_pages(const RequestGuid& request_guid, int last_commit_page){
+    assert(block_tables.find(request_guid) != block_tables.end());
+    auto& block_table = block_tables[request_guid];
+    assert(last_commit_page < block_table.size());
+    // free the blocks that are used for spec tokens and put them back to the queue
+    for (int i = last_commit_page + 1; i < block_table.size(); i++) {
+        block_allocator.free(block_table[i]);
+    }
+    // erase the blocks that are used for spec tokens in the block table of given request
+    block_table = std::vector<PhysicalTokenBlock>(block_table.begin(), block_table.begin() + last_commit_page + 1);
+    // need to put the last blocks back to the free list
+    block_tables[request_guid] = block_table;
+    assert(block_tables[request_guid].size() == last_commit_page + 1);
+}
+
+PageManager *PageManager::get_page_manager() {
+  if (page_manager_singleton == nullptr) {
+    int num_total_blocks = (BatchConfig::max_spec_tree_token_num() +
+        BatchConfig::max_sequence_length() + kPagesize - 1) /
+        kPagesize * BatchConfig::max_requests_per_batch();
+    page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
+  }
+  return page_manager_singleton;
+}
+
+
+}; //FlexFlow
\ No newline at end of file
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 18919ae67..d4e31da6b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1833,6 +1833,48 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   return llm_bitmask;
 }
 
+/* --------- Page Attention Related Functions --------- */
+void RequestManager::_append_logical_block_to_request(
+    Request &request, bool is_commit) {
+  // Append the logical block to the request
+  // page attention: in this function we need to remember the last logical block number that still contains committed tokens
+  LogicalTokenBlock block(request.blocks.size(),
+                                  kPagesize);
+  request.blocks.push_back(block);
+  PageManager *page_manager = PageManager::get_page_manager();
+  page_manager->allocate(request.guid);
+  // update page_id_commit
+  if (is_commit) {
+    request.page_id_commit++;
+    assert(request.page_id_commit < request.blocks.size());
+  }
+}
+
+void RequestManager::_append_tokens_to_blocks(Request &request, std::vector<TokenId> const &tokens, bool is_commit, int start, int end) {
+  assert(start >= 0 && start < tokens.size());
+  int cursor = start;
+  int marker = 0;
+  if (end == -1) {
+    marker = tokens.size();
+  } else {
+    marker = end;
+  }
+  while (cursor < marker) {
+    if (request.blocks.empty() ||
+      request.blocks.back().is_full()) {
+      // Append a new logical block
+      _append_logical_block_to_request(request, is_commit);
+    }
+    int num_empty_slots = request.blocks.back().get_num_empty_slots();
+    int num_tokens_to_append = std::min(num_empty_slots, marker - cursor);
+    // vector to be appeneded will be [cursor, cursor + num_tokens_to_append)]
+    std::vector<TokenId> tokens_to_append(tokens.begin() + cursor, tokens.begin() + cursor + num_tokens_to_append);
+    request.blocks.back().append_tokens(tokens_to_append, is_commit);
+    cursor += num_tokens_to_append;
+  }
+  assert(request.blocks.back().num_tokens <= kPagesize);
+}
+
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {

From 0eaca3934376fbf44a5c991066c093b98373f271 Mon Sep 17 00:00:00 2001
From: Qinghan Chen <qinghanc@andrew.cmu.edu>
Date: Thu, 3 Oct 2024 15:55:27 -0400
Subject: [PATCH 512/667] add batch_config

---
 include/flexflow/batch_config.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index ff48bb17f..aa9d03290 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -111,6 +111,9 @@ class BatchConfig {
     int first_token_offset_in_batch = -1;
     int num_tokens_in_batch = 0;
     int padding = 0; // Padding for memory pointer alignment
+    int num_kv_pages; //number of kv pages used
+    int kv_last_page_len; //last page length of kv
+    RequestGuid request_guid;
   };
 
   struct PerTokenInfo {

From b5fbc8b8d53b26a2a257d1b4e75d9b9d06a3fda6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 3 Oct 2024 19:26:32 -0700
Subject: [PATCH 513/667] Add option to enable old scheduler.

---
 inference/spec_infer/spec_infer.cc | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index aa5e1693b..b500a41b7 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -79,6 +79,7 @@ void parse_input_args(char **argv,
                       int &ssm_spec_latency_ms,
                       int &llm_verify_latency_ms,
                       double &request_per_second,
+                      bool &spec_infer_old_version,
                       std::string &emission_file_path) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -200,6 +201,10 @@ void parse_input_args(char **argv,
       emission_file_path = std::string(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--spec-infer-old-version")) {
+      spec_infer_old_version = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -372,6 +377,7 @@ void FlexFlow::top_level_task(Task const *task,
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
   std::string emission_file_path;
+  bool spec_infer_old_version = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -399,6 +405,7 @@ void FlexFlow::top_level_task(Task const *task,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
                    request_per_second,
+                   spec_infer_old_version,
                    emission_file_path);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
@@ -413,6 +420,14 @@ void FlexFlow::top_level_task(Task const *task,
              ffconfig.pipeline_parallelism_degree ==
          ffconfig.numNodes * ffconfig.workersPerNode);
 
+  // Sanity check for SpecInfer old version
+  if (spec_infer_old_version) {
+    assert(max_tree_depth = 8);
+    assert(max_tree_width >= 3);
+    // Total verified tokens
+    assert(max_tokens_per_batch >= max_requests_per_batch * 21);
+  }
+
   // Create SentencePiece tokenizer or OPT tokenizer
   srand(sampling_seed);
   GenerationConfig generationConfig(do_sample, 0.8, 0.6, spec_sampling, 16);
@@ -436,6 +451,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_baseline_latency(baseline_latency_ms);
   rm->set_ssm_spec_latency(ssm_spec_latency_ms);
   rm->set_llm_verify_latency(llm_verify_latency_ms);
+  rm->set_spec_infer_old_version(spec_infer_old_version);
   rm->register_output_filepath(file_paths.output_file_path);
 
   // Create LLM model
@@ -561,7 +577,7 @@ void FlexFlow::top_level_task(Task const *task,
             prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
       }
       PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
-      // ConstantEmissionMachine emission_machine(-1, slo_ratios);
+      //   ConstantEmissionMachine emission_machine(-1, slo_ratios);
       results = tree_model.generate(requests, emission_machine);
     } else if (!file_paths.trace_file_path.empty()) {
       std::ifstream file_handle(file_paths.trace_file_path);

From 03eb516b11de3a7f9f36c422b4c0de4f0862d06c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 3 Oct 2024 19:28:51 -0700
Subject: [PATCH 514/667] Merge.

---
 inference/spec_infer/spec_infer.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index a024d6d6c..a990dfe77 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -205,10 +205,6 @@ void parse_input_args(char **argv,
       emission_file_path = std::string(argv[++i]);
       continue;
     }
-    if (!strcmp(argv[i], "--spec-infer-old-version")) {
-      spec_infer_old_version = true;
-      continue;
-    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -382,7 +378,6 @@ void FlexFlow::top_level_task(Task const *task,
   double request_per_second = 1.0;
   bool spec_infer_old_version = false;
   std::string emission_file_path;
-  bool spec_infer_old_version = false;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
@@ -458,7 +453,6 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_llm_verify_latency(llm_verify_latency_ms);
   rm->set_spec_infer_old_version(spec_infer_old_version);
   rm->register_output_filepath(file_paths.output_file_path);
-  rm->set_spec_infer_old_version(spec_infer_old_version);
 
   // Create LLM model
   FFModel tree_model(ffconfig, ffconfig.cpu_offload);

From 3fbb36491525b59bfb715bc1599bcf34db4637f0 Mon Sep 17 00:00:00 2001
From: Zhihao Jia <zhihao@cmu.edu>
Date: Thu, 3 Oct 2024 17:10:19 -0400
Subject: [PATCH 515/667] feat: cherry-pick from
 https://github.com/flexflow/FlexFlow/pull/1517/commits/93a405a00a79cf4e3d2e276518dd8e487cfad123

---
 src/parallel_ops/allreduce.cc | 2 ++
 src/runtime/model.cc          | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc
index f3b1a7eed..6f3148ce4 100644
--- a/src/parallel_ops/allreduce.cc
+++ b/src/parallel_ops/allreduce.cc
@@ -245,6 +245,7 @@ void AllReduce::forward(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -274,6 +275,7 @@ void AllReduce::backward(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          inputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
                                                     0 /*projection id*/,
                                                     READ_WRITE,
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 7e0fb2512..fa4ccd0c5 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -6694,6 +6694,9 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    // AllReduce forward and backward must run concurrentluy since they
+    // use ncclAllReduce internally
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::forward_task>(
           registrar, "AllReduce Forward Task");
@@ -6708,6 +6711,9 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    // AllReduce forward and backward must run concurrentluy since they
+    // use ncclAllReduce internally
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::backward_task>(
           registrar, "AllReduce Backward Task");

From 6482d76786025e061095fedb56c93e6eb7a7006c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 4 Oct 2024 16:06:02 -0700
Subject: [PATCH 516/667] fix: long request support

---
 src/runtime/request_manager.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 18919ae67..a7d5e3cc8 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -423,14 +423,6 @@ RequestManager::RequestGuid
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
-  if (tokens.size() >= get_max_sequence_length()) {
-    std::cout << "Warning: too many tokens in prompt, only load up to "
-              << get_max_sequence_length() << " tokens, but got "
-              << tokens.size() << ".\n";
-
-    printf("tokens size: %zu\n", tokens.size());
-    return INVALID_GUID;
-  }
   for (int i = 0; i < tokens.size(); i++) {
     std::cout << "[" << i << "]" << tokens.at(i) << "\n";
   }

From a23cddbb9037d093a97d5fc947b1d193cc8dcd91 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 4 Oct 2024 19:24:31 -0700
Subject: [PATCH 517/667] fix: memory leakage in file_loader

---
 src/runtime/file_loader.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 8588b8934..fb6d3b4dc 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -124,6 +124,7 @@ void load_attention_weights_multi_query(DT *ptr,
       ptr[data_index++] = host_array.at(i);
     }
     file_index++;
+    in.close();
   }
 }
 
@@ -302,6 +303,7 @@ void load_attention_weights_v2(DT *ptr,
     }
 
     file_index++;
+    in.close();
   }
 }
 
@@ -357,6 +359,7 @@ void FileDataLoader::load_positions(FFModel *ff,
 
   // ff->get_parallel_tensor_from_tensor(pt, position_pt);
   position_pt->set_tensor<int>(ff, dims_vec, data);
+  free(data);
 }
 
 //--------------------- quantization functions ----------------------
@@ -669,7 +672,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
   ff->get_parallel_tensor_from_tensor(weight, weight_pt);
   weight_pt->set_tensor<char>(ff, dims_vec, data);
 
-  delete data;
+  free(data);
 }
 
 template <typename DT>
@@ -761,7 +764,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
   weight_pt->set_tensor<DT>(ff, dims_vec, data);
 
   // Free buffer memory
-  delete data;
+  free(data);
 }
 
 void FileDataLoader::load_weights(FFModel *ff) {

From 3574e510c490fe45785936eb9d2a1e8986a4cbc0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 4 Oct 2024 19:49:08 -0700
Subject: [PATCH 518/667] feat: support inf slo ratio

---
 src/runtime/request_manager.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a7d5e3cc8..2964c794d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2795,6 +2795,9 @@ void RequestManager::prune_token_tree() {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
+    if (request.get_slo_ratio() < 0) {
+      continue;
+    }
     double spare_latency =
         get_request_expected_latency(request) - request.decode_latency_ms;
     spare_latency_2_request_index.push_back(

From 272a2e9a3454164a926bab5ed3bde490fb9cdcc5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 5 Oct 2024 12:07:54 -0700
Subject: [PATCH 519/667] chore: minor

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2964c794d..9751689a1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2795,7 +2795,7 @@ void RequestManager::prune_token_tree() {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    if (request.get_slo_ratio() < 0) {
+    if (request.get_slo_ratio() > 999) {
       continue;
     }
     double spare_latency =

From 29f5c69f444a804c0334d678e52a1e1a8fe72d1a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 15:08:44 -0700
Subject: [PATCH 520/667] fix: add logic of batch prefilling, request should be
 taken back and forth on batch

---
 include/flexflow/request_manager.h |  9 +++-
 src/runtime/request_manager.cc     | 67 ++++++++++++++++++++++--------
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b85aa3664..923eebe56 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -414,7 +414,8 @@ class RequestManager {
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
   RequestGuid next_available_guid;
-  std::vector<Request *> prefill_requests;
+  std::queue<Request *> prefilled_requests;
+  std::vector<Request *> prefilling_requests;
 
   // Added to make the request manager stateful. During the processing of the
   // first small model inference results, the step equals to 1. That is, every
@@ -422,7 +423,11 @@ class RequestManager {
   // by 1.
   int current_ssm_step = 0;
   // Maps the index of the request in the batch config to the request guid.
+  // Note that we may have some prefilled requests not in the batch config,
+  // but should be re-considered in the decoding phase.
   int guid_of_requests[BatchConfig::MAX_NUM_REQUESTS];
+  int num_running_requests = 0;
+  // Available requests in the batch config
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
   int num_available_requests = 0;
   int ssm_completed = true;
@@ -470,6 +475,8 @@ class RequestManager {
   bool load_pending_request_to_batch();
   void request_update_attainment(int index, bool attained);
   void request_complete_clean_up(int batch_index);
+  void request_offload_from_batch(int batch_index);
+  void request_load_onto_batch(int batch_index);
   /* ---------- Incremental Decoding Helper Functions ---------- */
   bool update_llm_prefill_results(InferenceResult const &result);
   bool update_llm_decode_results(InferenceResult const &result);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9751689a1..f69fb4df1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
-#include "flexflow/request_manager.h"
 #include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
+#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -564,8 +564,8 @@ BatchConfig
 // Return value: true if load a pending request to the batch
 bool RequestManager::load_pending_request_to_batch() {
   if (pending_request_queue.empty()) {
-    if (num_available_requests > 0) {
-      // No pending request to process, but there are available requests
+    if (num_running_requests > 0) {
+      // No pending request to process, but there are running requests
       // in the batch, do nothing
       return false;
     }
@@ -580,7 +580,7 @@ bool RequestManager::load_pending_request_to_batch() {
   }
   std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
   assert(!pending_request_queue.empty() && "No pending request to process.");
-  while (num_available_requests < get_max_requests_per_batch() &&
+  while (num_running_requests < get_max_requests_per_batch() &&
          !pending_request_queue.empty()) {
     RequestGuid guid = pending_request_queue.front().guid;
     pending_request_queue.pop();
@@ -593,12 +593,13 @@ bool RequestManager::load_pending_request_to_batch() {
     // Load request into batch
     request->batch_index = request_index;
     guid_of_requests[request_index] = guid;
+    num_running_requests++;
     request_available[request_index] = true;
     num_available_requests++;
     // Initialize the bitmask for the new request with its prompt length
     init_bitmask_prompt(guid, request->tokens.size());
 
-    prefill_requests.push_back(request);
+    prefilling_requests.push_back(request);
 
     profiling_requests[guid] = RequestProfileInfo();
     profiling_requests[guid].start_time =
@@ -618,6 +619,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       Realm::Clock::current_time_in_microseconds();
   Request &request = all_requests[guid];
   guid_of_requests[batch_index] = INVALID_GUID;
+  num_running_requests--;
   request_available[batch_index] = false;
   num_available_requests--;
   request.status = Request::COMPLETED;
@@ -709,6 +711,21 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   // write_to_output_file("", str);
 }
 
+void RequestManager::request_offload_from_batch(int batch_index) {
+  RequestGuid guid = guid_of_requests[batch_index];
+  Request &request = all_requests[guid];
+  // Still keep the request in `guid_of_requests` where can be retrieved later
+  request_available[batch_index] = false;
+  num_available_requests--;
+}
+
+void RequestManager:: request_load_onto_batch(int batch_index) {
+  RequestGuid guid = guid_of_requests[batch_index];
+  Request &request = all_requests[guid];
+  request_available[batch_index] = true;
+  num_available_requests++;
+}
+
 void RequestManager::update_token_tree_depth() {
   ssm_tree_depth = min(get_max_tokens_per_batch() / get_num_active_requests(),
                        get_max_tree_depth());
@@ -716,7 +733,7 @@ void RequestManager::update_token_tree_depth() {
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
   // Update the inference results
-  if (num_available_requests == 0) {
+  if (num_running_requests == 0) {
     // Update nothing
     // Load the pending request to the batch
     load_pending_request_to_batch();
@@ -736,12 +753,17 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
           // finishes
 
           // Check if there are more empty slots
-          if (num_available_requests < get_max_requests_per_batch() &&
+          if (num_running_requests < get_max_requests_per_batch() &&
               load_pending_request_to_batch()) {
             // Load the pending request to the batch
             request_manager_status = PREFILLING;
           } else {
             // No more empty slots, start the decoding
+            while (!prefilled_requests.empty()) {
+              Request *request = prefilled_requests.front();
+              request_load_onto_batch(request->batch_index);
+              prefilled_requests.pop();
+            }
             request_manager_status = DECODING;
           }
         }
@@ -765,13 +787,18 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             // This indicates that the prefilling phase finishes
 
             // Check if there are more empty slots
-            if (num_available_requests < get_max_requests_per_batch() &&
+            if (num_running_requests < get_max_requests_per_batch() &&
                 load_pending_request_to_batch()) {
               // Load the pending request to the batch
               prefill_model = SSM;
               current_ssm_step = 0;
             } else {
               // No more empty slots, start the speculation
+              while (!prefilled_requests.empty()) {
+                Request *request = prefilled_requests.front();
+                request_load_onto_batch(request->batch_index);
+                prefilled_requests.pop();
+              }
               request_manager_status = SSM_SPEC;
               // Reset the prefill_request
               current_ssm_step = 0;
@@ -841,8 +868,8 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
 bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
   int num_tokens = 0;
   std::vector<Request *> incomplete_requests;
-  incomplete_requests.reserve(prefill_requests.size());
-  for (Request *request : prefill_requests) {
+  incomplete_requests.reserve(prefilling_requests.size());
+  for (Request *request : prefilling_requests) {
     if (request->num_tokens_in_batch > 0) {
       if (decoding_mode == INCREMENTAL_DECODING && streaming_cache) {
         request->streaming_cache_info.commit_cache(
@@ -854,12 +881,16 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
       request->llm_prefill_len += request->num_tokens_in_batch;
 
       if (request->llm_prefill_len == request->tokens.size()) {
-        // Indicates that the LLM prefilling phase finishes
+        // Indicates that this request's prefilling phase finishes
         request->tokens.push_back(
             result.token_ids[num_tokens + request->num_tokens_in_batch - 1]);
 
         if (request->tokens.back() == eos_token_id) {
           request_complete_clean_up(request->batch_index);
+        } else {
+          // Temporarily offload request from the batch
+          request_offload_from_batch(request->batch_index);
+          prefilled_requests.push(request);
         }
 
         if (decoding_mode == SPECULATIVE_DECODING) {
@@ -884,8 +915,8 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
     }
   }
 
-  prefill_requests.swap(incomplete_requests);
-  return prefill_requests.empty();
+  prefilling_requests.swap(incomplete_requests);
+  return prefilling_requests.empty();
 }
 
 bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
@@ -947,7 +978,7 @@ void RequestManager::update_ssm_prefill_results(
   // This function is called by update_inference_results when the
   // request_manager_status is PREFILLING and the prefill_model is SSM.
   // There's no results to update, but we should update ssm_cache_size.
-  for (Request *request : prefill_requests) {
+  for (Request *request : prefilling_requests) {
     if (request->num_tokens_in_batch > 0) {
       if (streaming_cache) {
         request->streaming_cache_info.commit_cache(
@@ -1017,7 +1048,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     std::cout << "\n############### prepare_llm_prefilling_batch "
                  "##############\n";
   }
-  assert(prefill_requests.size() > 0 &&
+  assert(prefilling_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
@@ -1029,7 +1060,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   bc.prompt_phase = true;
   bc.num_available_requests = 0;
   int num_tokens = 0;
-  for (Request *request : prefill_requests) {
+  for (Request *request : prefilling_requests) {
     int request_index = request->batch_index;
     bc.request_available[request_index] = true;
 
@@ -1086,7 +1117,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     std::cout << "\n############### prepare_ssm_prefilling_batch "
                  "##############\n";
   }
-  assert(prefill_requests.size() > 0 &&
+  assert(prefilling_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
   BatchConfig bc;
@@ -1094,7 +1125,7 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   bc.prompt_phase = true;
   bc.num_available_requests = 0;
   int num_tokens = 0;
-  for (Request *request : prefill_requests) {
+  for (Request *request : prefilling_requests) {
     int request_index = request->batch_index;
     // Only set the prefilling request to be available
     bc.request_available[request_index] = true;

From dcb61c796533da48c2dd0789db14b6ad6e12b8ec Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 15:46:43 -0700
Subject: [PATCH 521/667] style: minor format

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f69fb4df1..f1b075496 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -13,9 +13,9 @@
  * limitations under the License.
  */
 
+#include "flexflow/request_manager.h"
 #include "flexflow/inference.h"
 #include "flexflow/parallel_ops/parallel_op.h"
-#include "flexflow/request_manager.h"
 // #include "flexflow/tokenizers.h"
 #include <bitset>
 #include <cmath>
@@ -719,7 +719,7 @@ void RequestManager::request_offload_from_batch(int batch_index) {
   num_available_requests--;
 }
 
-void RequestManager:: request_load_onto_batch(int batch_index) {
+void RequestManager::request_load_onto_batch(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
   Request &request = all_requests[guid];
   request_available[batch_index] = true;

From 1659fdebb90cfcd14e93db4293d7134f7326f33f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 16:08:36 -0700
Subject: [PATCH 522/667] chore: minor info output

---
 src/runtime/request_manager.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f1b075496..4c819a72a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2496,12 +2496,21 @@ void RequestManager::terminate_background_server() {
 
     long long total_time = Realm::Clock::current_time_in_microseconds() -
                            profiling.server_start_time;
-    int total_requests = profiling_requests.size();
+    int total_requests = 0;
+    for (auto const &profiling_info : profiling_requests) {
+      int request_id = profiling_info.first;
+      Request &request = all_requests[request_id];
+      if (request.status == Request::COMPLETED) {
+        total_requests++;
+      }
+    }
     int total_tokens = 0;
     for (int num_tokens : profiling.generated_tokens_per_step) {
       total_tokens += num_tokens;
     }
     str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
+    str += "\n total_requests(" + std::to_string(total_requests) + "/" +
+           std::to_string(profiling_requests.size()) + ")";
     str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
     // throughput
     str += "\n throughput_requests_per_sec(" +

From a2a5174b704329775ca42a7b80a7c70e1654e914 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 17:03:40 -0700
Subject: [PATCH 523/667] chore: use unordered_map in argtopk

---
 include/flexflow/ops/arg_topk.h |  7 +++++--
 src/ops/arg_topk.cu             | 15 ++++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 7ba5ed945..86d0bb239 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -6,6 +6,8 @@
 #include "flexflow/node.h"
 #include "flexflow/ops/arg_topk_params.h"
 #include "flexflow/utils/memory_allocator.h"
+#include "raft/core/device_resources.hpp"
+#include <unordered_map>
 
 namespace FlexFlow {
 
@@ -17,6 +19,7 @@ class ArgTopKMeta : public OpMeta {
   Realm::RegionInstance reserveInst;
   void *half_precision_output;
   int max_input_size;
+  std::unordered_map<cudaStream_t, raft::device_resources *> device_resources;
   ArgTopKMeta(FFHandler handle,
               Op const *op,
               MemoryAllocator &gpu_mem_allocator);
@@ -88,7 +91,7 @@ class ArgTopK : public Op {
                              MachineView const &pc,
                              CostMetrics &cost_metrics) const override;
   template <typename DT>
-  static void forward_kernel(ArgTopKMeta const *m,
+  static void forward_kernel(ArgTopKMeta *m,
                              DT const *input_ptr,
                              DT *output_ptr,
                              int *indices_ptr,
@@ -99,7 +102,7 @@ class ArgTopK : public Op {
                              bool renormalize,
                              BatchConfig const *bc,
                              ffStream_t stream);
-  static void forward_kernel_wrapper(ArgTopKMeta const *m,
+  static void forward_kernel_wrapper(ArgTopKMeta *m,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &prob,
                                      GenericTensorAccessorW const &indices,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index da09f30f9..fbeb5497c 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -15,7 +15,6 @@
 
 #include "flexflow/ops/arg_topk.h"
 #include "flexflow/utils/cuda_helper.h"
-#include "raft/core/device_resources.hpp"
 #include "raft/matrix/detail/select_k.cuh"
 
 namespace FlexFlow {
@@ -85,7 +84,7 @@ __global__ void renormalize_kernel(DT *topk_values,
 /*static*/
 template <typename DT>
 void ArgTopK::forward_kernel(
-    ArgTopKMeta const *m,
+    ArgTopKMeta *m,
     DT const *input_ptr,
     DT *output_ptr,
     int *indices_ptr,
@@ -97,8 +96,11 @@ void ArgTopK::forward_kernel(
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
   assert(bc->num_active_requests() >= 0);
-  raft::device_resources handle(stream);
-  raft::matrix::detail::select_k(handle,
+  if (m->device_resources.find(stream) == m->device_resources.end()) {
+    m->device_resources[stream] = new raft::device_resources(stream);
+  }
+  raft::device_resources *handle = m->device_resources[stream];
+  raft::matrix::detail::select_k(*handle,
                                  input_ptr,
                                  (int *)nullptr,
                                  batch_size,
@@ -126,7 +128,7 @@ void ArgTopK::forward_kernel(
 }
 
 /*static*/
-void ArgTopK::forward_kernel_wrapper(ArgTopKMeta const *m,
+void ArgTopK::forward_kernel_wrapper(ArgTopKMeta *m,
                                      GenericTensorAccessorR const &input,
                                      // float *output_ptr,
                                      GenericTensorAccessorW const &probs,
@@ -236,5 +238,8 @@ ArgTopKMeta::~ArgTopKMeta() {
   if (reserveInst != Realm::RegionInstance::NO_INST) {
     reserveInst.destroy();
   }
+  for (auto &kv : device_resources) {
+    delete kv.second;
+  }
 }
 }; // namespace FlexFlow

From 00a98eb5f68cbe6033a487003f71f3df8aa2c78a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 17:22:52 -0700
Subject: [PATCH 524/667] chore: minor

---
 src/ops/arg_topk.cc    | 8 ++++----
 src/ops/gumbel_topk.cc | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index fdf10a370..6a5248712 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -381,8 +381,8 @@ InferenceResult ArgTopK::inference_speculative_task(
     Runtime *runtime) {
   assert(regions.size() == 3);
   assert(task->regions.size() == 3);
-  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
-  if (bc.num_active_tokens() == 0) {
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  if (bc->num_active_tokens() == 0) {
     // Directly return for empty batch config
     InferenceResult ir;
     return ir;
@@ -396,8 +396,8 @@ InferenceResult ArgTopK::inference_speculative_task(
   GenericTensorAccessorW probs = helperGetGenericTensorAccessorWO(
       DT_FLOAT, regions[2], task->regions[2], FID_DATA, ctx, runtime);
 
-  int batch_size = bc.num_active_tokens();
-  ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc);
+  int batch_size = bc->num_active_tokens();
+  ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, bc);
 
   InferenceResult ir;
   ir.num_token_ids = batch_size * m->k;
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index a57d26e10..99244efb2 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -430,8 +430,8 @@ InferenceResult GumbelTopK::inference_speculative_task(
     Runtime *runtime) {
   assert(regions.size() == 4);
   assert(task->regions.size() == 4);
-  BatchConfig const &bc = Future(task->futures[0]).get_result<BatchConfig>();
-  if (bc.num_active_tokens() == 0) {
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
+  if (bc->num_active_tokens() == 0) {
     // Directly return for empty batch config
     InferenceResult ir;
     return ir;
@@ -447,9 +447,9 @@ InferenceResult GumbelTopK::inference_speculative_task(
   GenericTensorAccessorW perturbed_log_probs = helperGetGenericTensorAccessorWO(
       DT_FLOAT, regions[3], task->regions[3], FID_DATA, ctx, runtime);
 
-  int batch_size = bc.num_active_tokens();
+  int batch_size = bc->num_active_tokens();
   GumbelTopK::forward_kernel_wrapper(
-      m, input, log_probs, perturbed_log_probs, indices, batch_size, &bc);
+      m, input, log_probs, perturbed_log_probs, indices, batch_size, bc);
 
   InferenceResult ir;
   ir.num_token_ids = batch_size * m->k;

From 8a28da5147bbd17c37f65f7298c8a4b45802d1bd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 21:43:17 -0700
Subject: [PATCH 525/667] chore: add goodput report

---
 src/runtime/request_manager.cc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4c819a72a..2fa506990 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2592,19 +2592,27 @@ void RequestManager::terminate_background_server() {
     mean_generated_tokens_per_step += ")";
     str += mean_generated_tokens_per_step;
 
-    std::string slo_attainment = "\n slo_attainment( ";
-    double attainment = 0;
+    double attainment = 0, goodput = 0;
     for (auto request_pair : all_requests) {
       Request &request = request_pair.second;
       if (request.attained) {
         attainment += 1;
+        goodput += request.tokens.size() - request.llm_prefill_len;
       }
     }
     attainment /= total_requests;
+    goodput /= total_time / 1e6;
+
+    std::string slo_attainment = "\n slo_attainment( ";
     slo_attainment += std::to_string(attainment);
     slo_attainment += ")";
     str += slo_attainment;
 
+    std::string goodput_str = "\n goodput( ";
+    goodput_str += std::to_string(goodput);
+    goodput_str += ")";
+    str += goodput_str;
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     // Wait for the background server to terminate

From 1e68324b735503b909d9dfebce3c9f618825a9aa Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 6 Oct 2024 22:59:35 -0700
Subject: [PATCH 526/667] chore: minor

---
 src/runtime/request_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2fa506990..42b26f3f1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -580,6 +580,9 @@ bool RequestManager::load_pending_request_to_batch() {
   }
   std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
   assert(!pending_request_queue.empty() && "No pending request to process.");
+  if (profiling.server_start_time == 0) {
+    reset_profiling_statistics();
+  }
   while (num_running_requests < get_max_requests_per_batch() &&
          !pending_request_queue.empty()) {
     RequestGuid guid = pending_request_queue.front().guid;
@@ -2310,7 +2313,7 @@ void RequestManager::serve_decoding(FFModel *llm) {
   std::queue<InferenceResultFuture> batch_pipeline;
   { batch_pipeline.push(last_irf); }
 
-  reset_profiling_statistics();
+  // reset_profiling_statistics();
   background_server_status = SERVING;
   while (!is_background_server_terminated()) {
 
@@ -2380,7 +2383,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
   std::queue<InferenceResultFuture> infer_result_future_pipeline;
   infer_result_future_pipeline.push(irf_0);
 
-  reset_profiling_statistics();
+  // reset_profiling_statistics();
   background_server_status = SERVING;
   while (!is_background_server_terminated()) {
     if (infer_result_future_pipeline.size() >= 4) {

From 239fe176bbf2993102c29b8f33f7da8e55ff9ef6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 7 Oct 2024 00:23:01 -0700
Subject: [PATCH 527/667] chore: replace busy_waiting to condition_variable

---
 include/flexflow/request_manager.h |  3 ++-
 src/runtime/request_manager.cc     | 28 ++++++++++++----------------
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 923eebe56..42b62fe3f 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -19,6 +19,7 @@
 #include "flexflow/inference.h"
 #include "flexflow/model.h"
 #include "flexflow/utils/file_loader.h"
+#include <condition_variable>
 #include <future>
 #include <mutex>
 #include <tokenizers_cpp.h>
@@ -318,7 +319,6 @@ class RequestManager {
   void terminate_background_server();
   static void terminate_background_server_at_exit();
   // Methods to check and mark request completion
-  bool is_request_completed(RequestGuid const &guid);
   void trigger_request_completion_future(RequestGuid const &guid);
   static void background_serving_task(
       Legion::Task const *task,
@@ -410,6 +410,7 @@ class RequestManager {
   std::unordered_map<RequestGuid, Request> all_requests;
   std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
   std::mutex request_queue_mutex;
+  std::condition_variable request_queue_cv;
   std::mutex request_result_mutex;
   std::unordered_map<RequestGuid, std::promise<void> *> request_to_promise;
   std::mutex request_to_promise_mutex;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 42b26f3f1..d87f437ab 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -459,6 +459,7 @@ RequestManager::RequestGuid
     pending_request_queue.push(request);
     all_requests[request.guid] = request;
   }
+  request_queue_cv.notify_all();
   {
     std::lock_guard<std::mutex> const lock(request_to_promise_mutex);
     request_to_promise[request.guid] = new std::promise<void>();
@@ -481,14 +482,6 @@ RequestManager::RequestGuid
   return request.guid;
 }
 
-bool RequestManager::is_request_completed(RequestGuid const &guid) {
-  std::lock_guard<std::mutex> const lock(request_queue_mutex);
-  assert(all_requests.find(guid) != all_requests.end());
-  Request const &request = all_requests[guid];
-  // return request.tokens.size() >= request.max_sequence_length;
-  return request.status == Request::COMPLETED;
-}
-
 GenerationResult
     RequestManager::get_generation_result(RequestGuid const &guid) {
   // First get the future of the request
@@ -563,22 +556,24 @@ BatchConfig
 
 // Return value: true if load a pending request to the batch
 bool RequestManager::load_pending_request_to_batch() {
+  std::unique_lock<std::mutex> lock(request_queue_mutex);
   if (pending_request_queue.empty()) {
     if (num_running_requests > 0) {
-      // No pending request to process, but there are running requests
-      // in the batch, do nothing
+      // No pending request to process, but there are running requests in the
+      // batch. Do nothing and return
       return false;
     }
-    // Wait until there is a pending request
-    while (pending_request_queue.empty() &&
-           !is_background_server_terminated()) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(1));
-    }
+    // Wait until there is a pending request or the background server is
+    // terminated
+    request_queue_cv.wait(lock, [&] {
+      return !pending_request_queue.empty() ||
+             is_background_server_terminated();
+    });
+    // If the background server has been terminated, exit
     if (is_background_server_terminated()) {
       return false;
     }
   }
-  std::lock_guard<std::mutex> const request_queue_lock(request_queue_mutex);
   assert(!pending_request_queue.empty() && "No pending request to process.");
   if (profiling.server_start_time == 0) {
     reset_profiling_statistics();
@@ -2618,6 +2613,7 @@ void RequestManager::terminate_background_server() {
 
     write_to_output_file("", str);
     background_server_status = TERMINATED;
+    request_queue_cv.notify_all();
     // Wait for the background server to terminate
     Runtime *runtime = Runtime::get_runtime();
     Context ctx = Runtime::get_context();

From 381a8084b5c6203e31e12c0b9e95e666ea60a1d6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 7 Oct 2024 18:45:19 -0700
Subject: [PATCH 528/667] feat: make some tasks concurrent

---
 src/ops/fused.cc                 | 5 +++++
 src/parallel_ops/allreduce.cc    | 3 +++
 src/runtime/inference_manager.cc | 1 +
 src/runtime/model.cc             | 6 ++++++
 4 files changed, 15 insertions(+)

diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index 2ba98bc09..6307362ea 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -355,6 +355,7 @@ void FusedOp::init(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   switch (domain.get_dim()) {
@@ -445,6 +446,7 @@ void FusedOp::init_inference(FFModel const &ff,
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
+  launcher.concurrent = true;
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   switch (domain.get_dim()) {
@@ -479,6 +481,7 @@ void FusedOp::forward(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   int offset = 0;
   for (int i = 0; i < numInputs; i++) {
     assert(inputs[i]->part != LogicalPartition::NO_PART);
@@ -539,6 +542,7 @@ FutureMap FusedOp::inference(
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
+  launcher.concurrent = true;
   launcher.add_future(bc);
   int offset = 0;
   for (int i = 0; i < numInputs; i++) {
@@ -590,6 +594,7 @@ void FusedOp::backward(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   int idx = 0;
   for (int i = 0; i < numInputs; i++) {
     launcher.add_region_requirement(RegionRequirement(inputs[i]->part,
diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc
index 6f3148ce4..7ff0bb2b0 100644
--- a/src/parallel_ops/allreduce.cc
+++ b/src/parallel_ops/allreduce.cc
@@ -134,6 +134,7 @@ void AllReduce::init(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
+  launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -172,6 +173,7 @@ void AllReduce::init_inference(FFModel const &ff,
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
+  launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -213,6 +215,7 @@ FutureMap AllReduce::inference(FFModel const &ff,
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
+  launcher.concurrent = true;
   launcher.add_future(bc);
   launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
                                                     0 /*projection id*/,
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index da650c00c..dd13bb2e0 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -646,6 +646,7 @@ void FFModel::compile_inference() {
             false /*must*/,
             0 /*mapper_id*/,
             view.hash() /*MappingTagID*/);
+        index_launcher.concurrent = true;
         FutureMap fm = runtime->execute_index_space(ctx, index_launcher);
         fm.wait_all_results();
         int idx = 0;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index fa4ccd0c5..25296ddcd 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -6431,6 +6431,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<OpMeta *, FusedOp::init_task>(
           registrar, "FusedOp Init Task");
@@ -6445,6 +6446,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<FusedOp::forward_task>(
           registrar, "FusedOp Forward Task");
@@ -6459,6 +6461,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<FusedOp::inference_task>(
           registrar, "FusedOp Inference Task");
@@ -6473,6 +6476,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<FusedOp::backward_task>(
           registrar, "FusedOp Backward Task");
@@ -6665,6 +6669,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<OpMeta *, AllReduce::init_task>(
           registrar, "AllReduce init Task");
@@ -6680,6 +6685,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
                                    "AllReduce Inference");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::inference_task>(
           registrar, "AllReduce Inference Task");

From 151872f2c7dab3245e94c11743461e209e1bb212 Mon Sep 17 00:00:00 2001
From: Qinghan Chen <qinghanc@andrew.cmu.edu>
Date: Mon, 7 Oct 2024 23:12:57 -0400
Subject: [PATCH 529/667] request manager h and request manger cc to be
 continued

---
 include/flexflow/request_manager.h |  4 +++
 src/runtime/request_manager.cc     | 57 ++++++++++++++++++++++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b85aa3664..c0b677c67 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -146,6 +146,10 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
+  //page attention
+  int page_last_commited = 0;
+  std::vector<LogicalTokenBlock> blocks;
+
   // TokenTree speculative_token_tree;
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d4e31da6b..9fffbc6cc 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1028,6 +1028,9 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   assert(prefill_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
+  // get page manager
+  PageManager *page_manager = get_page_manager();
+
   BatchConfig bc;
   if (decoding_mode == INCREMENTAL_DECODING) {
     bc.inference_mode = InferenceMode::INC_DECODING_MODE;
@@ -1070,11 +1073,15 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       assert(request->llm_prefill_len + idx < request->tokens.size());
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
+      _append_tokens_to_blocks(request, {bc.tokensInfo[token_idx].token_id}, true);
     }
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
+    //update related page info in batch config
+    bc.requestsInfo[request_index].num_kv_pages = page_manager->get_num_allocated_blocks(request->guid);
+    bc.requestsInfo[request_index].kv_last_page_len = request->tokens.back().size(); //last block length
   }
   bc.num_tokens = num_tokens;
 
@@ -1464,6 +1471,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
+    //page attention: before commit token, reset the pages assigned by cleaning all the tokens
+    std::vector<int> block_table_before_commit = _reset_block_table(request);
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size() - 1; // Exclude the last token
@@ -1482,14 +1491,15 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
          committed_token_index++) {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
+      _append_tokens_to_blocks(request, {committed_token.token_id}, true);
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
-          committed_token.from_index;
+          block_table_before_commit[committed_token.from_index / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-          committed_token.to_index;
       new_bc.num_tokens_to_commit++;
-    }
+      // also append the token to the block
+      }
 
     // Load the tokens on the token tree that are not yet pruned to
     // BatchConfig.tokensInfo.
@@ -1864,6 +1874,9 @@ void RequestManager::_append_tokens_to_blocks(Request &request, std::vector<Toke
       request.blocks.back().is_full()) {
       // Append a new logical block
       _append_logical_block_to_request(request, is_commit);
+      // also allocate one physical page
+      PageManager *page_manager = PageManager::get_page_manager();
+      page_manager->allocate(request.guid);
     }
     int num_empty_slots = request.blocks.back().get_num_empty_slots();
     int num_tokens_to_append = std::min(num_empty_slots, marker - cursor);
@@ -1875,6 +1888,44 @@ void RequestManager::_append_tokens_to_blocks(Request &request, std::vector<Toke
   assert(request.blocks.back().num_tokens <= kPagesize);
 }
 
+int RequestManager::_append_token_to_block(Request &request, TokenId token, bool is_commit) {
+  if (request.blocks.empty() ||
+      request.blocks.back().is_full()) {
+    // Append a new logical block
+    _append_logical_block_to_request(request, is_commit);
+    // also allocate one physical page
+    PageManager *page_manager = PageManager::get_page_manager();
+    page_manager->allocate(request.guid);
+  }
+  request.blocks.back().append_tokens({token}, is_commit);
+  assert(request.blocks.back().num_tokens <= kPagesize);
+  // return the index of the token in the gpu kv cache
+  std::vector<int> block_table_indices = PageManager::get_page_manager()->get_block_table_indices(request.guid);
+  // WARNING: need to check!!!
+  return (block_table_indices[request.blocks.size() - 1] - 1) * kPagesize + request.blocks.back().num_tokens - 1;
+}
+
+std::vector<int> RequestManager::_reset_block_table(Request &request){
+  // get the indices of original physical block table for request
+  PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  // reset the block table according to the request's page_last_commit
+  page_manager->erase_last_pages(request.guid, request.page_id_commit);
+  // reset this request's logical block table
+  request.blocks.erase(request.blocks.begin() + request.page_id_commit + 1, request.blocks.end());
+  return block_table_indices;
+}
+
+int idx_logical_to_physical(Request &request, int idx_logical) {
+  // get physical indices
+  PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  assert(request.blocks.size() == block_table_indices.size());
+  return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
+}
+
+int 
+
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {

From e3abef8250bd0d52d5c2ed8e1c9a55f689cc58bc Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 7 Oct 2024 21:16:22 -0700
Subject: [PATCH 530/667] refactored the interface of block manager but may not
 be bug free

---
 include/page_manager.h      |  53 +++++++++--------
 src/runtime/page_manager.cc | 111 +++++++++++++-----------------------
 2 files changed, 69 insertions(+), 95 deletions(-)

diff --git a/include/page_manager.h b/include/page_manager.h
index 6266923dc..a40b8141e 100644
--- a/include/page_manager.h
+++ b/include/page_manager.h
@@ -36,31 +36,33 @@ using TokenId = BatchConfig::TokenId;
 class LogicalTokenBlock {
 public:
     using TokenId = BatchConfig::TokenId;
+
     // Constructor
     LogicalTokenBlock(int block_number, uint32_t block_size);
 
     // Method to check if the block is empty
     bool is_empty() const;
 
+    // Method to check if the block is full
+    bool is_full() const;
+
     // Method to get the number of empty slots
     int get_num_empty_slots() const;
 
     // Method to get the number of allocated slots
-    int get_num_alloc_slots();
+    int get_num_alloc_slots() const;
 
-    // Method to check if the block is full
-    bool is_full() const;
+    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
+    void reset_num_spec_tokens();
 
     // Method to append tokens
     void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
 
-    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
-    void reset_num_spec_tokens();
-
     std::vector<TokenId> get_token_ids() const;
 
+private:
     int block_number; // the index of the logical token block
-    uint32_t block_size; // the size of the block
+    int block_size; // the size of the block
     int num_tokens; // the number of tokens currently stored in the block
     int num_commit_tokens; // the number of tokens inside this block that are already committed
     int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
@@ -78,9 +80,13 @@ class PhysicalTokenBlock {
     // Constructor
     PhysicalTokenBlock(int block_number, uint32_t block_size);
 
+    // Method to get the block number
+    int get_block_number() const { return block_number; }
+
+private:
     int ref_count; // reference count
     int block_number; // the index of the physical token block
-    uint32_t block_size; // the size of the block
+    int block_size; // the size of the block
 };
 
 /**
@@ -90,7 +96,7 @@ class PhysicalTokenBlock {
 class BlockAllocator {
 public:
     // Constructor
-    BlockAllocator(uint32_t block_size, int num_blocks);
+    BlockAllocator(int block_size, int num_total_blocks);
 
     // Allocate a block
     PhysicalTokenBlock allocate();
@@ -99,11 +105,11 @@ class BlockAllocator {
     void free(PhysicalTokenBlock& block);
 
     // Get the number of free blocks
-    size_t get_num_free_blocks() const;
+    int get_num_free_blocks() const;
 
 private:
-    uint32_t block_size;
-    int num_blocks;
+    int block_size;
+    int num_total_blocks;
     std::deque<PhysicalTokenBlock> free_blocks;
 };
 
@@ -118,24 +124,21 @@ class PageManager {
     static PageManager *get_page_manager();
     using BlockTable = std::vector<PhysicalTokenBlock>;
     using RequestGuid = BatchConfig::RequestGuid;
-    PageManager(uint32_t block_size, int num_total_blocks);
+    PageManager(int block_size, int num_total_blocks);
 
-    // Prefill the block with the given token ids at the llm prefilling stage
-    bool prefill(const RequestGuid& request_guid, const std::vector<int>& token_ids);
-    bool allocate(const RequestGuid& request_guid);
-    void free(const RequestGuid& request_guid);
-
-    size_t get_num_free_blocks() const;
-    std::vector<int32_t> get_block_table_indices(const RequestGuid& request_guid) const;
-    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
-
-    void erase_last_pages(const RequestGuid& request_guid, int num_pages);
+    int allocate_one_block(const RequestGuid& request_guid);
+    void free_request(const RequestGuid& request_guid);
+    void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
 
+    std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
 private:
-    uint32_t block_size; // the size of the block
+    int block_size; // the size of the block
     int num_total_blocks; // the total number of blocks
     BlockAllocator block_allocator;
-    std::unordered_map<int, BlockTable> block_tables;
+    std::unordered_map<RequestGuid, BlockTable> block_tables;
+
+    int get_num_total_free_blocks() const;
+    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
 };
 
 }; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 08819b6f0..98dac322b 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -20,6 +20,7 @@ namespace FlexFlow {
 // For all runtime functions, they share a single page manager for pages information
 PageManager *page_manager_singleton = nullptr;
 
+// the interface of logicaltokenblock
 LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size)
     : block_number(block_number), block_size(block_size), num_tokens(0), num_commit_tokens(0), num_spec_tokens(0) {
     }
@@ -30,22 +31,22 @@ bool LogicalTokenBlock::is_empty() const {
     return num_tokens == 0;
 }
 
-int LogicalTokenBlock::get_num_empty_slots() const {
+bool LogicalTokenBlock::is_full() const {
     assert(num_spec_tokens + num_commit_tokens == num_tokens);
     assert(num_tokens <= block_size);
-    return block_size - num_tokens;
+    return num_tokens == block_size;
 }
 
-int LogicalTokenBlock::get_num_alloc_slots() {
+int LogicalTokenBlock::get_num_empty_slots() const {
     assert(num_spec_tokens + num_commit_tokens == num_tokens);
     assert(num_tokens <= block_size);
-    return num_tokens;
+    return block_size - num_tokens;
 }
 
-bool LogicalTokenBlock::is_full() const {
+int LogicalTokenBlock::get_num_alloc_slots() const {
     assert(num_spec_tokens + num_commit_tokens == num_tokens);
     assert(num_tokens <= block_size);
-    return num_tokens == block_size;
+    return num_tokens;
 }
 
 void LogicalTokenBlock::reset_num_spec_tokens(){
@@ -80,14 +81,14 @@ std::vector<TokenId> LogicalTokenBlock::get_token_ids() const {
     return token_ids;
 }
 
-PhysicalTokenBlock::PhysicalTokenBlock(int block_number, uint32_t block_size)
+PhysicalTokenBlock::PhysicalTokenBlock(int block_number, int block_size)
     : block_number(block_number), block_size(block_size), ref_count(0) {}
 
-BlockAllocator::BlockAllocator(uint32_t block_size, int num_total_blocks) {
+BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) {
     for (int block_number = 0; block_number < num_total_blocks; ++block_number) {
         free_blocks.push_back(PhysicalTokenBlock(block_number, block_size));
     }
-    num_blocks = num_total_blocks;
+    num_total_blocks = num_total_blocks;
 }
 
 // Allocate a block
@@ -98,7 +99,6 @@ PhysicalTokenBlock BlockAllocator::allocate() {
     PhysicalTokenBlock block = free_blocks.front();
     free_blocks.pop_front();
     block.ref_count = 1;
-    num_blocks -= 1;
     return block;
 }
 
@@ -110,50 +110,28 @@ void BlockAllocator::free(PhysicalTokenBlock& block) {
     block.ref_count -= 1;
     if (block.ref_count == 0) {
         free_blocks.push_back(block);
-        num_blocks += 1;
+    }else{
+        // in current implementation this should not be the case
+        throw std::runtime_error("Block is not freed. Ref count: " + std::to_string(block.ref_count));
     }
 }
 
-size_t BlockAllocator::get_num_free_blocks() const {
-    assert(free_blocks.size() <= static_cast<size_t>(num_blocks));
-    if (free_blocks.size() > static_cast<size_t>(num_blocks)) {
-        std::cerr << "num free blocks: " << free_blocks.size() << std::endl;
-        std::cerr << "num total blocks: " << num_blocks << std::endl;
-        throw std::runtime_error("Number of free blocks exceeds the total number of blocks.");
-    }
+int BlockAllocator::get_num_free_blocks() const {
     return free_blocks.size();
 }
 
-PageManager::PageManager(uint32_t block_size, int num_total_blocks)
+PageManager::PageManager(int block_size, int num_total_blocks)
     : block_size(block_size), num_total_blocks(num_total_blocks),
       block_allocator(block_size, num_total_blocks) {}
 
-bool PageManager::prefill(const RequestGuid& request_guid, const std::vector<TokenId>& token_ids) {
-    BlockTable block_table;
-    for (size_t logical_idx = 0; logical_idx < token_ids.size(); logical_idx++) {
-        PhysicalTokenBlock block = block_allocator.allocate();
-        block_table.push_back(block);
-    }
-
-    block_tables[request_guid] = block_table;
-    return true;
-}
-
-bool PageManager::can_allocate(const RequestGuid& request_guid) const {
-    int num_free_gpu_blocks = block_allocator.get_num_free_blocks();
-    return num_free_gpu_blocks > 0;
-}
-
-bool PageManager::allocate(const RequestGuid& request_guid) {
-    // This is the prefilling for a request
-    if (!can_allocate(request_guid)) {
-        assert(false);
-    }
+//return the physical number of this block
+int PageManager::allocate_one_block(const RequestGuid& request_guid) {
     BlockTable& block_table = block_tables[request_guid];
 
     PhysicalTokenBlock block = block_allocator.allocate();
-    block_table.push_back(block);;
-    return true;
+    block_table.push_back(block);
+    block_tables[request_guid] = block_table;
+    return block.get_block_number();
 }
 
 void PageManager::_free_block_table(BlockTable& block_table) {
@@ -162,33 +140,41 @@ void PageManager::_free_block_table(BlockTable& block_table) {
     } 
 }
 
-void PageManager::free(const RequestGuid& request_guid) {
+void PageManager::free_request(const RequestGuid& request_guid) {
+    //we only free the blocks that are already used
     assert(block_tables.find(request_guid) != block_tables.end());
     auto& block_table = block_tables[request_guid];
     _free_block_table(block_table);
+    return;
 }
 
-size_t PageManager::get_num_free_blocks() const {
-    return block_allocator.get_num_free_blocks();
-}
+void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_blocks) {
+    assert(block_tables.find(request_guid) != block_tables.end());
+    auto& block_table = block_tables[request_guid];
+    assert(num_blocks <= block_table.size());
+    int num_blocks_allocated = block_table.size();
+    for (int i = 0; i < num_blocks; i++) {
+        block_allocator.free(block_table[num_blocks_allocated - i - 1]);
+    }
+    block_table = std::vector<PhysicalTokenBlock>(block_table.begin() + num_blocks, block_table.end());
+    block_tables[request_guid] = block_table;
+    return;
 
-std::vector<int32_t> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
-    std::vector<int32_t> indices;
+std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
+    std::vector<int> indices;
     try {
     const auto& block_table = block_tables.at(request_guid);
     for (const auto& block : block_table) {
         // printf("get block indice block number is: %d\n", block.block_number);
-        indices.push_back(block.block_number);
-    }
-    } catch (const std::out_of_range& e) {
-        std::cerr << "Request GUID not found in block tables: " << e.what() << std::endl;
-        // Handle error appropriately
-        std::cout << "request ID is: " << request_guid << std::endl;
-        exit(1);
+        indices.push_back(block.get_block_number());
     }
     return indices;
 }
 
+int PageManager::get_num_total_free_blocks() const {
+    return block_allocator.get_num_free_blocks();
+}
+
 int PageManager::get_num_allocated_blocks(const RequestGuid& request_guid) const {
     auto it = block_tables.find(request_guid);
     if (it == block_tables.end()) {
@@ -198,21 +184,6 @@ int PageManager::get_num_allocated_blocks(const RequestGuid& request_guid) const
     }
 }
 
-void PageManager::erase_last_pages(const RequestGuid& request_guid, int last_commit_page){
-    assert(block_tables.find(request_guid) != block_tables.end());
-    auto& block_table = block_tables[request_guid];
-    assert(last_commit_page < block_table.size());
-    // free the blocks that are used for spec tokens and put them back to the queue
-    for (int i = last_commit_page + 1; i < block_table.size(); i++) {
-        block_allocator.free(block_table[i]);
-    }
-    // erase the blocks that are used for spec tokens in the block table of given request
-    block_table = std::vector<PhysicalTokenBlock>(block_table.begin(), block_table.begin() + last_commit_page + 1);
-    // need to put the last blocks back to the free list
-    block_tables[request_guid] = block_table;
-    assert(block_tables[request_guid].size() == last_commit_page + 1);
-}
-
 PageManager *PageManager::get_page_manager() {
   if (page_manager_singleton == nullptr) {
     int num_total_blocks = (BatchConfig::max_spec_tree_token_num() +

From d9ff5eef737d8954a2bbc9b2dbf10c505f1f25e3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 9 Oct 2024 12:03:30 -0700
Subject: [PATCH 531/667] chore: add more profiling

---
 src/runtime/request_manager.cc | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d87f437ab..cfecfb4e5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2530,6 +2530,39 @@ void RequestManager::terminate_background_server() {
     }
     latency_per_request_ms += ")";
     str += latency_per_request_ms;
+
+    std::string ttft_per_request_ms = "\n ttft_per_request_ms( ";
+    for (auto const &profiling_info : profiling_requests) {
+      double prefilling_time_ms = 0;
+      auto const &profiling = profiling_info.second;
+      if (profiling.start_decoding_time != 0) {
+        prefilling_time_ms =
+            (profiling.start_decoding_time - profiling.start_time) / 1000.0;
+      } else {
+        prefilling_time_ms =
+            (profiling.finish_time - profiling.start_time) / 1000.0;
+      }
+      ttft_per_request_ms += std::to_string(prefilling_time_ms) + " ";
+    }
+    ttft_per_request_ms += ")";
+    str += ttft_per_request_ms;
+
+    std::string per_token_time_per_request_ms =
+        "\n per_token_time_per_request_ms( ";
+    for (auto const &profiling_info : profiling_requests) {
+      double per_token_time_ms = 0;
+      auto const &request = all_requests[profiling_info.first];
+      auto const &profiling = profiling_info.second;
+      if (profiling.start_decoding_time != 0) {
+        per_token_time_ms =
+            (profiling.finish_time - profiling.start_decoding_time) / 1000.0 /
+            (request.tokens.size() - request.llm_prefill_len);
+      }
+      per_token_time_per_request_ms += std::to_string(per_token_time_ms) + " ";
+    }
+    per_token_time_per_request_ms += ")";
+    str += per_token_time_per_request_ms;
+
     average_latency_per_request /= total_requests;
     str += "\n average_latency_per_request_ms(" +
            std::to_string(average_latency_per_request) + ")";

From 73dc69965fa16f2ce090deafb0d71727c1195fc8 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 9 Oct 2024 22:21:52 -0700
Subject: [PATCH 532/667] ckpt before build

---
 .../inc_multihead_self_attention_kernels.h    |  46 ++++--
 include/flexflow/request_manager.h            |   1 +
 include/page_manager.h                        | 144 -----------------
 .../inc_multihead_self_attention_kernels.cu   |   8 +-
 src/runtime/batch_config.cc                   |   2 +
 src/runtime/page_manager.cc                   |  10 ++
 src/runtime/request_manager.cc                | 120 +++++++-------
 src/runtime/request_manager.cu                | 150 ++++++++++++++----
 8 files changed, 232 insertions(+), 249 deletions(-)
 delete mode 100644 include/page_manager.h

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 8f69ad380..51f228569 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,29 +15,47 @@ namespace Kernels {
 namespace IncMultiHeadAttention {
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
+__device__ __forceinline__ size_t get_k_entry_offset(int const token_idx,
+                                                     int const page_idx,
                                                      int const num_heads,
                                                      int const head_dim) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          token_idx % kPagesize) * /* page slot index */
-         num_heads *
-         head_dim;
+  size_t index = ((page_idx) * kPagesize * 2 + (token_idx % kPagesize)) * head_dim * num_heads;
+  return index;
 }
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-                                                     int const token_idx,
-                                                     int const max_num_pages,
+__device__ __forceinline__ size_t get_v_entry_offset(int const token_idx,
+                                                     int const page_idx,
                                                      int const num_heads,
                                                      int const head_dim) {
-  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-          kPagesize + token_idx % kPagesize) * /* page slot index */
-         num_heads *
-         head_dim;
+  size_t index = ((page_idx) * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * head_dim * num_heads;
+  return index;
 }
 
+// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
+// __device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+//                                                      int const token_idx,
+//                                                      int const max_num_pages,
+//                                                      int const num_heads,
+//                                                      int const head_dim) {
+//   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+//           token_idx % kPagesize) * /* page slot index */
+//          num_heads *
+//          head_dim;
+// }
+
+// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
+// __device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+//                                                      int const token_idx,
+//                                                      int const max_num_pages,
+//                                                      int const num_heads,
+//                                                      int const head_dim) {
+//   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+//           kPagesize + token_idx % kPagesize) * /* page slot index */
+//          num_heads *
+//          head_dim;
+// }
+
 template <typename DT>
 void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
                       GenericTensorAccessorR const weight,
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 4ee288b9c..753f6c983 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -19,6 +19,7 @@
 #include "flexflow/inference.h"
 #include "flexflow/model.h"
 #include "flexflow/utils/file_loader.h"
+#include "flexflow/page_manager.h"
 #include <condition_variable>
 #include <future>
 #include <mutex>
diff --git a/include/page_manager.h b/include/page_manager.h
deleted file mode 100644
index a40b8141e..000000000
--- a/include/page_manager.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2023 CMU, Stanford, Facebook, LANL
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "flexflow/batch_config.h"
-#include "flexflow/inference.h"
-#include "flexflow/model.h"
-#include "flexflow/config.h"
-#include "flexflow/utils/file_loader.h"
-#include <future>
-#include <mutex>
-#include <tokenizers_cpp.h>
-#include <deque>
-
-namespace FlexFlow {
-
-using TokenId = BatchConfig::TokenId;
-
-/**
- * @class LogicalTokenBlock
- * @brief A class to represent a logical block of tokens similar to virtual memory address
- */
-class LogicalTokenBlock {
-public:
-    using TokenId = BatchConfig::TokenId;
-
-    // Constructor
-    LogicalTokenBlock(int block_number, uint32_t block_size);
-
-    // Method to check if the block is empty
-    bool is_empty() const;
-
-    // Method to check if the block is full
-    bool is_full() const;
-
-    // Method to get the number of empty slots
-    int get_num_empty_slots() const;
-
-    // Method to get the number of allocated slots
-    int get_num_alloc_slots() const;
-
-    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
-    void reset_num_spec_tokens();
-
-    // Method to append tokens
-    void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
-
-    std::vector<TokenId> get_token_ids() const;
-
-private:
-    int block_number; // the index of the logical token block
-    int block_size; // the size of the block
-    int num_tokens; // the number of tokens currently stored in the block
-    int num_commit_tokens; // the number of tokens inside this block that are already committed
-    int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
-
-    std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
-};
-
-/**
- * @class PhysicalTokenBlock
- * @brief A class to represent a physical block of tokens similar to physical memory address
- * It keeps track of the location of the tokens stored on GPU memory
- */
-class PhysicalTokenBlock {
-public:
-    // Constructor
-    PhysicalTokenBlock(int block_number, uint32_t block_size);
-
-    // Method to get the block number
-    int get_block_number() const { return block_number; }
-
-private:
-    int ref_count; // reference count
-    int block_number; // the index of the physical token block
-    int block_size; // the size of the block
-};
-
-/**
- * @class BlockAllocator
- * @brief A Block Manager that is reponsible for maintaining a pool of free blocks
- */
-class BlockAllocator {
-public:
-    // Constructor
-    BlockAllocator(int block_size, int num_total_blocks);
-
-    // Allocate a block
-    PhysicalTokenBlock allocate();
-
-    // Free a block
-    void free(PhysicalTokenBlock& block);
-
-    // Get the number of free blocks
-    int get_num_free_blocks() const;
-
-private:
-    int block_size;
-    int num_total_blocks;
-    std::deque<PhysicalTokenBlock> free_blocks;
-};
-
-/*
-* @class PageManager
-* @brief A wrapper class that manages the kv cache allocation status
-* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
-*/
-class PageManager {
-public:
-    // Get the singleton instance of the PageManager as it will be shared in multiple places
-    static PageManager *get_page_manager();
-    using BlockTable = std::vector<PhysicalTokenBlock>;
-    using RequestGuid = BatchConfig::RequestGuid;
-    PageManager(int block_size, int num_total_blocks);
-
-    int allocate_one_block(const RequestGuid& request_guid);
-    void free_request(const RequestGuid& request_guid);
-    void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
-
-    std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
-private:
-    int block_size; // the size of the block
-    int num_total_blocks; // the total number of blocks
-    BlockAllocator block_allocator;
-    std::unordered_map<RequestGuid, BlockTable> block_tables;
-
-    int get_num_total_free_blocks() const;
-    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
-};
-
-}; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index e65f2c060..e6b32df3a 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -459,6 +459,8 @@ __global__ void
     update_qkv_in_batch_kernel(DT *qkv_proj_array,
                                half *qTmp_ptr,
                                half *kvCache_ptr,
+                               int32_t *kv_indptr,
+                               int32_t *kv_page_indices,
                                BatchConfig::PerTokenInfo const *tokenInfos,
                                int const max_num_pages,
                                int num_q_heads,
@@ -483,10 +485,12 @@ __global__ void
       static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
+    int start = kv_indptr[req_idx];
+    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
     size_t to_k_idx = get_k_entry_offset(
-               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
+           token_abs_idx, page_idx, num_kv_heads, head_dim);
            to_v_idx = get_v_entry_offset(
-               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
+           token_abs_idx, page_idx, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 5cd4135d2..1536cf06c 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -298,6 +298,8 @@ void StreamingCacheInfo::reset_cache() {
   commit_len = 0;
 }
 
+//page attention: TODO: I think we just need to change the index
+
 int StreamingCacheInfo::global_2_cache_index(int global_index) {
   if (global_index < sink_cache_size) {
     return global_index;
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 98dac322b..f323f4bbb 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -159,6 +159,16 @@ void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_
     block_table = std::vector<PhysicalTokenBlock>(block_table.begin() + num_blocks, block_table.end());
     block_tables[request_guid] = block_table;
     return;
+}
+
+std::vector<int> PageManager::copy_block_table(const RequestGuid& request_guid) const {
+    return block_tables.at(request_guid);
+}
+
+int PageManager::get_index_last_block(const RequestGuid& request_guid) const {
+    const auto& block_table = block_tables.at(request_guid);
+    return block_table.back.get_block_number();
+}
 
 std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
     std::vector<int> indices;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7aad8b54c..c5a54df52 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1094,15 +1094,16 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       assert(request->llm_prefill_len + idx < request->tokens.size());
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
-      _append_tokens_to_blocks(request, {bc.tokensInfo[token_idx].token_id}, true);
+
+      append_token_to_block(request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
     //update related page info in batch config
-    bc.requestsInfo[request_index].num_kv_pages = page_manager->get_num_allocated_blocks(request->guid);
-    bc.requestsInfo[request_index].kv_last_page_len = request->tokens.back().size(); //last block length
+    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
+    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
   }
   bc.num_tokens = num_tokens;
 
@@ -1493,7 +1494,12 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     assert(request.status == Request::RUNNING);
 
     //page attention: before commit token, reset the pages assigned by cleaning all the tokens
-    std::vector<int> block_table_before_commit = _reset_block_table(request);
+    std::vector<int> block_table_before_commit = get_block_table_indices(guid);
+    // also need to reset the pages
+
+
+    int token_offset = request.first_token_offset_in_batch;
+
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size() - 1; // Exclude the last token
@@ -1512,12 +1518,17 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
          committed_token_index++) {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
-      _append_tokens_to_blocks(request, {committed_token.token_id}, true);
+      
+      int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
+      int idx_from_logical = committed_token.from_index - first_token_offset_in_batch;
+      int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
+      reset_block_table(request);
+
+
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
-          block_table_before_commit[committed_token.from_index / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache = idx_from_physical;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = idx_to_physical;
       new_bc.num_tokens_to_commit++;
       // also append the token to the block
       }
@@ -1538,6 +1549,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
           new_bc.num_tokens++;
           token_tree_index++;
+
+          // Append the token to the block
+          append_token_to_block(request, tree_node->id, false);
         }
       }
       layer_index++;
@@ -1865,6 +1879,31 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 
 /* --------- Page Attention Related Functions --------- */
+int get_num_blocks_allocated(Request &request) const {
+  return request.blocks.size();
+}
+
+int get_len_last_block(Request &request) const {
+  return request.blocks.back().num_tokens;
+}
+
+int get_idx_last_logical_token(Request &request) const {
+  if (request.blocks.empty()) {
+    printf("Error: request.blocks is empty\n");
+    return -1;
+  }else{
+    return (request.blocks.size() - 1) * kPagesize + request.blocks.back().num_tokens - 1;
+  }
+}
+
+int idx_logical_to_physical(Request &request, int idx_logical) {
+  // get physical indices
+  PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  assert(request.blocks.size() == block_table_indices.size());
+  return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
+}
+
 void RequestManager::_append_logical_block_to_request(
     Request &request, bool is_commit) {
   // Append the logical block to the request
@@ -1881,72 +1920,39 @@ void RequestManager::_append_logical_block_to_request(
   }
 }
 
-void RequestManager::_append_tokens_to_blocks(Request &request, std::vector<TokenId> const &tokens, bool is_commit, int start, int end) {
-  assert(start >= 0 && start < tokens.size());
-  int cursor = start;
-  int marker = 0;
-  if (end == -1) {
-    marker = tokens.size();
-  } else {
-    marker = end;
-  }
-  while (cursor < marker) {
-    if (request.blocks.empty() ||
-      request.blocks.back().is_full()) {
-      // Append a new logical block
-      _append_logical_block_to_request(request, is_commit);
-      // also allocate one physical page
-      PageManager *page_manager = PageManager::get_page_manager();
-      page_manager->allocate(request.guid);
-    }
-    int num_empty_slots = request.blocks.back().get_num_empty_slots();
-    int num_tokens_to_append = std::min(num_empty_slots, marker - cursor);
-    // vector to be appeneded will be [cursor, cursor + num_tokens_to_append)]
-    std::vector<TokenId> tokens_to_append(tokens.begin() + cursor, tokens.begin() + cursor + num_tokens_to_append);
-    request.blocks.back().append_tokens(tokens_to_append, is_commit);
-    cursor += num_tokens_to_append;
-  }
-  assert(request.blocks.back().num_tokens <= kPagesize);
-}
-
-int RequestManager::_append_token_to_block(Request &request, TokenId token, bool is_commit) {
+//this function is used for appending a token to the last logical block and also the last physical block
+//it will return the physical position of this token
+int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
+  int page_cur_physical = PageManager::get_page_manager()->get_index_last_block(request.guid);
   if (request.blocks.empty() ||
       request.blocks.back().is_full()) {
+    PageManager *page_manager = PageManager::get_page_manager();
     // Append a new logical block
     _append_logical_block_to_request(request, is_commit);
     // also allocate one physical page
-    PageManager *page_manager = PageManager::get_page_manager();
-    page_manager->allocate(request.guid);
+    page_cur_physical = page_manager->allocate_one_block(request.guid);
   }
-  request.blocks.back().append_tokens({token}, is_commit);
-  assert(request.blocks.back().num_tokens <= kPagesize);
-  // return the index of the token in the gpu kv cache
-  std::vector<int> block_table_indices = PageManager::get_page_manager()->get_block_table_indices(request.guid);
-  // WARNING: need to check!!!
-  return (block_table_indices[request.blocks.size() - 1] - 1) * kPagesize + request.blocks.back().num_tokens - 1;
+  // insert token to both logical block and physical block
+  request.blocks.back().append_token(token, is_commit);
+  int idx_logical = get_idx_last_logical_token(request);
+  int idx_physical = idx_logical_to_physical(request, idx_logical);
+  return idx_physical;
 }
 
-std::vector<int> RequestManager::_reset_block_table(Request &request){
+void RequestManager::reset_block_table(Request &request){
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
-  page_manager->erase_last_pages(request.guid, request.page_id_commit);
+  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_id_commit);
   // reset this request's logical block table
   request.blocks.erase(request.blocks.begin() + request.page_id_commit + 1, request.blocks.end());
-  return block_table_indices;
-}
 
-int idx_logical_to_physical(Request &request, int idx_logical) {
-  // get physical indices
-  PageManager *page_manager = PageManager::get_page_manager();
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+
+  BlockTable block_table = page_manager->get_block_table(request.guid);
   assert(request.blocks.size() == block_table_indices.size());
-  return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
+  return;
 }
 
-int 
-
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 733cca745..1c9d659e6 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -76,6 +76,76 @@ void RequestManager::load_tokens_task(
   }
 }
 
+void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
+                                       PageManager *pm,
+                                       FFHandler handle,
+                                       cudaStream_t stream,
+                                       uint32_t const max_num_pages,
+                                       int32_t *q_indptr_h,
+                                       int32_t *kv_indptr_h,
+                                       int32_t *kv_indices_h,
+                                       int32_t *kv_last_page_len_h,
+                                       int32_t *qk_indptr_h) {
+  int batch_size = batch_config->num_active_requests();
+  // we just search for the page number for each request
+  q_indptr_h[0] = 0;
+  kv_indptr_h[0] = 0;
+  qk_indptr_h[0] = 0;
+  int cnt_1 = 0, q_lens = 0, qk_lens = 0;
+  int indices_offset = 0, indices_lens = 0, kv_len = 0;
+  for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+    if (batch_config->request_available[req_idx]) {
+      int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+      int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                  batch_config->requestsInfo[req_idx].first_token_index_in_request;
+      q_lens += q_len;
+      qk_lens += (q_len * kv_len + 7) / 8;
+      indices_offset = indices_lens;
+      indices_lens += (kv_len + kPagesize - 1) / kPagesize;
+      q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+      kv_indptr_h[indptr_idx + 1] = batch_config->requestsInfo[req_idx].num_kv_pages + kv_indptr_h[indptr_idx];
+
+      assert(batch_config->requestsInfo[req_idx].num_kv_pages == (kv_len + kPagesize - 1) / kPagesize);
+      assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= 64);
+      std::vector<int32_t> kv_indices = pm -> get_block_table_indices(batch_config->requestsInfo[req_idx].request_guid);
+      assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
+      for (int i = indices_offset; i < indices_lens; i++) {
+        kv_indices_h[i] = kv_indices[i - indices_offset];
+      }
+      qk_indptr_h[indptr_idx + 1] = qk_lens;
+      kv_last_page_len_h[indptr_idx] = batch_config->requestsInfo[req_idx].kv_last_page_len;
+      indptr_idx++;
+    }
+  }
+
+  // do the copy
+  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_indices,
+                            kv_indices_h,
+                            sizeof(int32_t) * batch_size * max_num_pages,
+                            cudaMemcpyHostToDevice,
+                            stream));
+  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_last_page_len,
+                            kv_last_page_len_h,
+                            sizeof(int32_t) * batch_size,
+                            cudaMemcpyHostToDevice,
+                            stream));
+  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->q_indptr,
+                            q_indptr_h,
+                            sizeof(int32_t) * (batch_size + 1),
+                            cudaMemcpyHostToDevice,
+                            stream));
+  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_indptr,
+                            kv_indptr_h,
+                            sizeof(int32_t) * (batch_size + 1),
+                            cudaMemcpyHostToDevice,
+                            stream));
+  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->qk_indptr,
+                            qk_indptr_h,
+                            sizeof(int32_t) * (batch_size + 1),
+                            cudaMemcpyHostToDevice,
+                            stream));
+}
+
 // q_indptr: the start offset of q in the batch for each request,
 //           the length is `num_requests + 1`: [0, num_q_0, num_q_0 + num_q_1,
 //           ..., num_q_0 + num_q_1 + ... + num_q_{num_requests - 1}]
@@ -546,6 +616,12 @@ void RequestManager::load_batch_config_task(
       }
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
+    static PageManager *pm = PageManager::get_page_manager();
+    static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+    static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS];
+    static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+    static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
+
     if (handle.tree_verify_attention_metadata->enabled()) {
       // calculate the attention meta data
       {
@@ -569,19 +645,29 @@ void RequestManager::load_batch_config_task(
                            BatchConfig::max_spec_tree_token_num());
 
         int parallelism = batch_size;
-        prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
-                                          min(CUDA_NUM_THREADS, parallelism),
-                                          0,
-                                          stream>>>(
-            batch_size,
-            request_infos,
-            request_available,
-            max_num_pages,
-            handle.tree_verify_attention_metadata->q_indptr,
-            handle.tree_verify_attention_metadata->kv_indptr,
-            handle.tree_verify_attention_metadata->kv_indices,
-            handle.tree_verify_attention_metadata->kv_last_page_len,
-            handle.tree_verify_attention_metadata->qk_indptr);
+        // prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
+        //                                   min(CUDA_NUM_THREADS, parallelism),
+        //                                   0,
+        //                                   stream>>>(
+        //     batch_size,
+        //     request_infos,
+        //     request_available,
+        //     max_num_pages,
+        //     handle.tree_verify_attention_metadata->q_indptr,
+        //     handle.tree_verify_attention_metadata->kv_indptr,
+        //     handle.tree_verify_attention_metadata->kv_indices,
+        //     handle.tree_verify_attention_metadata->kv_last_page_len,
+        //     handle.tree_verify_attention_metadata->qk_indptr);
+        prepare_inference_params_kernel_h(batch_config,
+                                          pm,
+                                          handle,
+                                          stream,
+                                          max_num_pages,
+                                          q_indptr_h,
+                                          kv_indptr_h,
+                                          kv_indices_h,
+                                          kv_last_page_len_h,
+                                          qk_indptr_h);
 
         // Update gpu-side custom mask referring from CaualMask
         if (!batch_config->prompt_phase) {
@@ -639,25 +725,25 @@ void RequestManager::load_batch_config_task(
                   ->prompt_handler_collections[batch_size]);
         }
 
-        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-            kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
-        q_indptr_h[0] = 0;
-        kv_indptr_h[0] = 0;
-        for (int req_idx = 0, indptr_idx = 0;
-             req_idx < batch_config->max_requests_per_batch();
-             req_idx++) {
-          if (batch_config->request_available[req_idx]) {
-            int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-            int kv_len =
-                batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                batch_config->requestsInfo[req_idx]
-                    .first_token_index_in_request;
-            q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-            kv_indptr_h[indptr_idx + 1] =
-                kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
-            indptr_idx++;
-          }
-        }
+        // static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+        //     kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+        // q_indptr_h[0] = 0;
+        // kv_indptr_h[0] = 0;
+        // for (int req_idx = 0, indptr_idx = 0;
+        //      req_idx < batch_config->max_requests_per_batch();
+        //      req_idx++) {
+        //   if (batch_config->request_available[req_idx]) {
+        //     int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+        //     int kv_len =
+        //         batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+        //         batch_config->requestsInfo[req_idx]
+        //             .first_token_index_in_request;
+        //     q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+        //     kv_indptr_h[indptr_idx + 1] =
+        //         kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
+        //     indptr_idx++;
+        //   }
+        // }
 
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(

From de0b803587fc28c3a1ea0c7e13374068441f3e1a Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Wed, 9 Oct 2024 23:42:02 -0700
Subject: [PATCH 533/667] some fix

---
 .../inc_multihead_self_attention_kernels.h    |  50 ++++----
 .../inc_multihead_self_attention_kernels.cu   |   8 +-
 src/ops/tree_inc_multihead_self_attention.cu  | 114 +++++++++++++++---
 src/runtime/page_manager.cc                   |  13 +-
 src/runtime/request_manager.cc                |  10 +-
 5 files changed, 138 insertions(+), 57 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 51f228569..fe8d32387 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,7 +15,7 @@ namespace Kernels {
 namespace IncMultiHeadAttention {
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_k_entry_offset(int const token_idx,
+__device__ __forceinline__ size_t get_k_entry_offset_verify(int const token_idx,
                                                      int const page_idx,
                                                      int const num_heads,
                                                      int const head_dim) {
@@ -24,7 +24,7 @@ __device__ __forceinline__ size_t get_k_entry_offset(int const token_idx,
 }
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_v_entry_offset(int const token_idx,
+__device__ __forceinline__ size_t get_v_entry_offset_verify(int const token_idx,
                                                      int const page_idx,
                                                      int const num_heads,
                                                      int const head_dim) {
@@ -33,28 +33,28 @@ __device__ __forceinline__ size_t get_v_entry_offset(int const token_idx,
 }
 
 // // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-// __device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
-//                                                      int const token_idx,
-//                                                      int const max_num_pages,
-//                                                      int const num_heads,
-//                                                      int const head_dim) {
-//   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-//           token_idx % kPagesize) * /* page slot index */
-//          num_heads *
-//          head_dim;
-// }
+__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const num_heads,
+                                                     int const head_dim) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          token_idx % kPagesize) * /* page slot index */
+         num_heads *
+         head_dim;
+}
 
-// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-// __device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
-//                                                      int const token_idx,
-//                                                      int const max_num_pages,
-//                                                      int const num_heads,
-//                                                      int const head_dim) {
-//   return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
-//           kPagesize + token_idx % kPagesize) * /* page slot index */
-//          num_heads *
-//          head_dim;
-// }
+// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
+__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx,
+                                                     int const token_idx,
+                                                     int const max_num_pages,
+                                                     int const num_heads,
+                                                     int const head_dim) {
+  return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 +
+          kPagesize + token_idx % kPagesize) * /* page slot index */
+         num_heads *
+         head_dim;
+}
 
 template <typename DT>
 void pre_build_weight(IncMultiHeadSelfAttentionMeta const *m,
@@ -107,6 +107,10 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream);
 
+void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
+                                BatchConfig const *bc,
+                                cudaStream_t stream);
+
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
 // Source: pre-pos-encoding kv values in the streaming cache.
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index e6b32df3a..e65f2c060 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -459,8 +459,6 @@ __global__ void
     update_qkv_in_batch_kernel(DT *qkv_proj_array,
                                half *qTmp_ptr,
                                half *kvCache_ptr,
-                               int32_t *kv_indptr,
-                               int32_t *kv_page_indices,
                                BatchConfig::PerTokenInfo const *tokenInfos,
                                int const max_num_pages,
                                int num_q_heads,
@@ -485,12 +483,10 @@ __global__ void
       static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
-    int start = kv_indptr[req_idx];
-    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
     size_t to_k_idx = get_k_entry_offset(
-           token_abs_idx, page_idx, num_kv_heads, head_dim);
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
            to_v_idx = get_v_entry_offset(
-           token_abs_idx, page_idx, num_kv_heads, head_dim);
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a2272e5f2..9103bc814 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -47,8 +47,87 @@ using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
+template <typename DT>
+__global__ void
+    update_qkv_in_batch_verify_kernel(DT *qkv_proj_array,
+                               half *qTmp_ptr,
+                               half *kvCache_ptr,
+                               int32_t *kv_indptr,
+                               int32_t *kv_page_indices,
+                               BatchConfig::PerTokenInfo const *tokenInfos,
+                               int const max_num_pages,
+                               int num_q_heads,
+                               int num_kv_heads,
+                               int head_dim,
+                               int num_new_tokens) {
+  int const q_hidden_size = num_q_heads * head_dim;
+  int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
+  int const kv_hidden_size = num_kv_heads * head_dim;
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / q_hidden_size;
+  int const offset = thread_idx % q_hidden_size;
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
+  qTmp_ptr[token_idx * q_hidden_size + offset] =
+      static_cast<half>(qkv_proj_array[from_idx + offset]);
+
+  if (offset < kv_hidden_size) {
+    int start = kv_indptr[req_idx];
+    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
+    size_t to_k_idx = get_k_entry_offset_verify(
+           token_abs_idx, page_idx, num_kv_heads, head_dim),
+           to_v_idx = get_v_entry_offset_verify(
+           token_abs_idx, page_idx, num_kv_heads, head_dim);
+    // key and value cache should be stored interleaved
+    int const stride = num_q_heads / num_kv_heads;
+    int const kv_offset =
+        offset / head_dim * stride * head_dim + offset % head_dim;
+    kvCache_ptr[to_k_idx + offset] =
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
+    kvCache_ptr[to_v_idx + offset] =
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
+                                         temp_kv_hidden_size + kv_offset]);
+  }
+}
+
+template <typename DT>
+void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
+                         BatchConfig const *bc,
+                         cudaStream_t stream) {
+  int num_new_tokens = bc->num_active_tokens();
+  if (num_new_tokens == 0) {
+    return;
+  }
+  int parallelism = m->local_hidden_size * num_new_tokens;
+  int const max_num_pages =
+      round_up_pages(BatchConfig::max_sequence_length() +
+                     BatchConfig::max_spec_tree_token_num());
+  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
+                               min(CUDA_NUM_THREADS, parallelism),
+                               0,
+                               stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                         static_cast<half *>(m->queryTmp),
+                                         static_cast<half *>(m->kvCache),
+                                         m->handle.tree_verify_attention_metadata->kv_indptr,
+                                         m->handle.tree_verify_attention_metadata->kv_indices,
+                                         m->token_infos,
+                                         max_num_pages,
+                                         m->num_q_heads,
+                                         m->num_kv_heads,
+                                         m->qk_dim,
+                                         num_new_tokens);
+}
+
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
+    int32_t *kv_indptr,
+    int32_t *kv_page_indices,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     bool const *request_available,
     int num_requests,
@@ -70,6 +149,9 @@ __global__ void commit_tokens_kernel(
     }
   }
 
+  int start = kv_indptr[requext_idx_in_batch];
+  int end = kv_indptr[requext_idx_in_batch + 1] - 1;
+
   for (int i = 0; i < *num_committed_tokens; i++) {
     if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
       int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache;
@@ -77,23 +159,19 @@ __global__ void commit_tokens_kernel(
         continue;
       }
 
-      int const req_id = committedTokenInfos[i].request_index;
+      // int const req_id = committedTokenInfos[i].request_index;
       int const tok_id = committedTokenInfos[i].token_depth;
-
-      size_t from_k_idx = get_k_entry_offset(req_id,
-                                             index_in_kv_cache,
-                                             max_num_pages,
-                                             num_kv_heads,
-                                             head_dim),
-             from_v_idx = get_v_entry_offset(req_id,
-                                             index_in_kv_cache,
-                                             max_num_pages,
-                                             num_kv_heads,
-                                             head_dim);
-      size_t to_k_idx = get_k_entry_offset(
-                 req_id, tok_id, max_num_pages, num_kv_heads, head_dim),
-             to_v_idx = get_v_entry_offset(
-                 req_id, tok_id, max_num_pages, num_kv_heads, head_dim);
+      int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
+      int const page_from_idx = committedTokenInfos[i].index_in_kv_cache / kPagesize;
+
+      size_t from_k_idx = get_k_entry_offset_verify(
+                  committedTokenInfos[i].index_in_kv_cache, page_from_idx, num_kv_heads, head_dim),
+             from_v_idx = get_v_entry_offset_verify(
+                  committedTokenInfos[i].index_in_kv_cache, page_from_idx, num_kv_heads, head_dim);
+      size_t to_k_idx = get_k_entry_offset_verify(
+                 committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim),
+             to_v_idx = get_v_entry_offset_verify(
+                 committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim);
       assert(to_k_idx <= from_k_idx);
 
       kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
@@ -119,6 +197,8 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
                          stream>>>(static_cast<half *>(m->kvCache),
+                                   m->handle.tree_verify_attention_metadata->kv_indptr,
+                                   m->handle.tree_verify_attention_metadata->kv_indices,
                                    m->committed_token_infos,
                                    m->request_available,
                                    num_requests,
@@ -418,7 +498,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventRecord(t_start, stream);
 
   // Update key-val cache, compact q array
-  update_qkv_in_batch<DT>(m, bc, stream);
+  update_qkv_in_batch_verify<DT>(m, bc, stream);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index f323f4bbb..2e83e64fb 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -98,7 +98,7 @@ PhysicalTokenBlock BlockAllocator::allocate() {
     }
     PhysicalTokenBlock block = free_blocks.front();
     free_blocks.pop_front();
-    block.ref_count = 1;
+    block.incr_ref_count();
     return block;
 }
 
@@ -107,7 +107,7 @@ void BlockAllocator::free(PhysicalTokenBlock& block) {
     if (block.ref_count == 0) {
         throw std::runtime_error("Double free! Block is already freed.");
     }
-    block.ref_count -= 1;
+    block.decr_ref_count();
     if (block.ref_count == 0) {
         free_blocks.push_back(block);
     }else{
@@ -134,17 +134,19 @@ int PageManager::allocate_one_block(const RequestGuid& request_guid) {
     return block.get_block_number();
 }
 
-void PageManager::_free_block_table(BlockTable& block_table) {
+void PageManager::free_block_table(BlockTable& block_table) {
     for (auto& block : block_table) {
             block_allocator.free(block);
     } 
+    return;
 }
 
 void PageManager::free_request(const RequestGuid& request_guid) {
     //we only free the blocks that are already used
     assert(block_tables.find(request_guid) != block_tables.end());
-    auto& block_table = block_tables[request_guid];
-    _free_block_table(block_table);
+    BlockTable block_table = block_tables[request_guid];
+    free_block_table(block_table);
+    block_tables.erase(request_guid);
     return;
 }
 
@@ -172,7 +174,6 @@ int PageManager::get_index_last_block(const RequestGuid& request_guid) const {
 
 std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
     std::vector<int> indices;
-    try {
     const auto& block_table = block_tables.at(request_guid);
     for (const auto& block : block_table) {
         // printf("get block indice block number is: %d\n", block.block_number);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index c5a54df52..1cda70e84 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1050,7 +1050,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
          "No prefilling request to process in the prefilling phase.");
 
   // get page manager
-  PageManager *page_manager = get_page_manager();
+  PageManager *page_manager = PageManager::get_page_manager();
 
   BatchConfig bc;
   if (decoding_mode == INCREMENTAL_DECODING) {
@@ -1879,15 +1879,15 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 }
 
 /* --------- Page Attention Related Functions --------- */
-int get_num_blocks_allocated(Request &request) const {
+int RequestManager::get_num_blocks_allocated(Request &request) const {
   return request.blocks.size();
 }
 
-int get_len_last_block(Request &request) const {
+int RequestManager::get_len_last_block(Request &request) const {
   return request.blocks.back().num_tokens;
 }
 
-int get_idx_last_logical_token(Request &request) const {
+int RequestManager::get_idx_last_logical_token(Request &request) const {
   if (request.blocks.empty()) {
     printf("Error: request.blocks is empty\n");
     return -1;
@@ -1896,7 +1896,7 @@ int get_idx_last_logical_token(Request &request) const {
   }
 }
 
-int idx_logical_to_physical(Request &request, int idx_logical) {
+int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   // get physical indices
   PageManager *page_manager = PageManager::get_page_manager();
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);

From 0e405c1770f5a1d0be07dbbaebda6ac02e225219 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 00:02:30 -0700
Subject: [PATCH 534/667] ready for sanity check

---
 include/flexflow/page_manager.h    | 151 +++++++++++++++++++++++++++++
 include/flexflow/request_manager.h |  12 ++-
 src/runtime/page_manager.cc        |  12 +--
 src/runtime/request_manager.cc     |  34 +++----
 4 files changed, 183 insertions(+), 26 deletions(-)
 create mode 100644 include/flexflow/page_manager.h

diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
new file mode 100644
index 000000000..d8c41c0f5
--- /dev/null
+++ b/include/flexflow/page_manager.h
@@ -0,0 +1,151 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "flexflow/batch_config.h"
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/config.h"
+#include "flexflow/utils/file_loader.h"
+#include <future>
+#include <mutex>
+#include <tokenizers_cpp.h>
+#include <deque>
+
+namespace FlexFlow {
+
+using TokenId = BatchConfig::TokenId;
+
+/**
+ * @class LogicalTokenBlock
+ * @brief A class to represent a logical block of tokens similar to virtual memory address
+ */
+class LogicalTokenBlock {
+public:
+    using TokenId = BatchConfig::TokenId;
+
+    // Constructor
+    LogicalTokenBlock(int block_number, uint32_t block_size);
+
+    // Method to check if the block is empty
+    bool is_empty() const;
+
+    // Method to check if the block is full
+    bool is_full() const;
+
+    // Method to get the number of empty slots
+    int get_num_empty_slots() const;
+
+    // Method to get the number of allocated slots
+    int get_num_alloc_slots() const;
+
+    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
+    void reset_num_spec_tokens();
+
+    // Method to append tokens
+    void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
+
+    int get_num_tokens() const { return num_tokens; }
+
+    std::vector<TokenId> get_token_ids() const;
+
+private:
+    int block_number; // the index of the logical token block
+    int block_size; // the size of the block
+    int num_tokens; // the number of tokens currently stored in the block
+    int num_commit_tokens; // the number of tokens inside this block that are already committed
+    int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
+
+    std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
+};
+
+/**
+ * @class PhysicalTokenBlock
+ * @brief A class to represent a physical block of tokens similar to physical memory address
+ * It keeps track of the location of the tokens stored on GPU memory
+ */
+class PhysicalTokenBlock {
+public:
+    // Constructor
+    PhysicalTokenBlock(int block_number, int block_size);
+
+    // Method to get the block number
+    int get_block_number() const { return block_number; }
+    void incr_ref_count() { ref_count++; }
+    void decr_ref_count() { ref_count--; }
+    int ref_count; // reference count, TODO: move to private
+
+private:
+    int block_number; // the index of the physical token block
+    int block_size; // the size of the block
+};
+
+/**
+ * @class BlockAllocator
+ * @brief A Block Manager that is reponsible for maintaining a pool of free blocks
+ */
+class BlockAllocator {
+public:
+    // Constructor
+    BlockAllocator(int block_size, int num_total_blocks);
+
+    // Allocate a block
+    PhysicalTokenBlock allocate();
+
+    // Free a block
+    void free(PhysicalTokenBlock& block);
+
+    // Get the number of free blocks
+    int get_num_free_blocks() const;
+
+private:
+    int block_size;
+    int num_total_blocks;
+    std::deque<PhysicalTokenBlock> free_blocks;
+};
+
+/*
+* @class PageManager
+* @brief A wrapper class that manages the kv cache allocation status
+* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
+*/
+class PageManager {
+public:
+    // Get the singleton instance of the PageManager as it will be shared in multiple places
+    static PageManager *get_page_manager();
+    using BlockTable = std::vector<PhysicalTokenBlock>;
+    using RequestGuid = BatchConfig::RequestGuid;
+    PageManager(int block_size, int num_total_blocks);
+
+
+    int allocate_one_block(const RequestGuid& request_guid);
+    void free_request(const RequestGuid& request_guid);
+    void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
+    std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
+
+    
+    void free_block_table(BlockTable& block_table);
+private:
+    int block_size; // the size of the block
+    int num_total_blocks; // the total number of blocks
+    BlockAllocator block_allocator;
+    std::unordered_map<RequestGuid, BlockTable> block_tables;
+
+    int get_num_total_free_blocks() const;
+    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
+};
+
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 753f6c983..dbed4560e 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -149,7 +149,7 @@ struct Request {
   std::vector<BatchConfig::TokenId> tokens;
 
   //page attention
-  int page_last_commited = 0;
+  int page_last_committed = 0;
   std::vector<LogicalTokenBlock> blocks;
 
   // TokenTree speculative_token_tree;
@@ -518,6 +518,16 @@ class RequestManager {
   void init_bitmask_spec(RequestGuid guid);
   BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
+  // Page Attention related
+  int get_num_blocks_allocated(Request &request) const;
+  int get_len_last_block(Request &request) const;
+  int get_idx_last_logical_token(Request &request) const;
+  int idx_logical_to_physical(Request &request, int idx_logical);
+  void _append_logical_block_to_request(
+    Request &request, bool is_commit);
+  int append_token_to_block(Request &request, TokenId token, bool is_commit);
+  void reset_block_table(Request &request);
+
   // Token tree related
   void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 2e83e64fb..f633b1b52 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -163,14 +163,10 @@ void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_
     return;
 }
 
-std::vector<int> PageManager::copy_block_table(const RequestGuid& request_guid) const {
-    return block_tables.at(request_guid);
-}
-
-int PageManager::get_index_last_block(const RequestGuid& request_guid) const {
-    const auto& block_table = block_tables.at(request_guid);
-    return block_table.back.get_block_number();
-}
+// int PageManager::get_index_last_block(const RequestGuid& request_guid) const {
+//     const auto& block_table = block_tables.at(request_guid);
+//     return block_table.back.get_block_number();
+// }
 
 std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
     std::vector<int> indices;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1cda70e84..049c8f1ed 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1095,15 +1095,15 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
 
-      append_token_to_block(request, request->tokens[request->llm_prefill_len + idx], true);
+      append_token_to_block(*request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
     //update related page info in batch config
-    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
-    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
+    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
   }
   bc.num_tokens = num_tokens;
 
@@ -1484,6 +1484,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
             std::begin(new_bc.request_available));
   new_bc.num_available_requests = num_available_requests;
 
+  // get page manager
+  PageManager *page_manager = PageManager::get_page_manager();
+
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1494,7 +1497,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     assert(request.status == Request::RUNNING);
 
     //page attention: before commit token, reset the pages assigned by cleaning all the tokens
-    std::vector<int> block_table_before_commit = get_block_table_indices(guid);
+    std::vector<int> block_table_before_commit = page_manager->get_block_table_indices(guid);
     // also need to reset the pages
 
 
@@ -1520,7 +1523,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           committed_tokens.at(committed_token_index);
       
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
-      int idx_from_logical = committed_token.from_index - first_token_offset_in_batch;
+      int idx_from_logical = committed_token.from_index - request.first_token_offset_in_batch;
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
       reset_block_table(request);
 
@@ -1884,7 +1887,7 @@ int RequestManager::get_num_blocks_allocated(Request &request) const {
 }
 
 int RequestManager::get_len_last_block(Request &request) const {
-  return request.blocks.back().num_tokens;
+  return request.blocks.back().get_num_tokens();
 }
 
 int RequestManager::get_idx_last_logical_token(Request &request) const {
@@ -1892,7 +1895,7 @@ int RequestManager::get_idx_last_logical_token(Request &request) const {
     printf("Error: request.blocks is empty\n");
     return -1;
   }else{
-    return (request.blocks.size() - 1) * kPagesize + request.blocks.back().num_tokens - 1;
+    return (request.blocks.size() - 1) * kPagesize + request.blocks.back().get_num_tokens() - 1;
   }
 }
 
@@ -1912,28 +1915,26 @@ void RequestManager::_append_logical_block_to_request(
                                   kPagesize);
   request.blocks.push_back(block);
   PageManager *page_manager = PageManager::get_page_manager();
-  page_manager->allocate(request.guid);
+  page_manager->allocate_one_block(request.guid);
   // update page_id_commit
   if (is_commit) {
-    request.page_id_commit++;
-    assert(request.page_id_commit < request.blocks.size());
+    request.page_last_committed++;
+    assert(request.page_last_committed < request.blocks.size());
   }
 }
 
 //this function is used for appending a token to the last logical block and also the last physical block
 //it will return the physical position of this token
 int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
-  int page_cur_physical = PageManager::get_page_manager()->get_index_last_block(request.guid);
   if (request.blocks.empty() ||
       request.blocks.back().is_full()) {
     PageManager *page_manager = PageManager::get_page_manager();
     // Append a new logical block
     _append_logical_block_to_request(request, is_commit);
     // also allocate one physical page
-    page_cur_physical = page_manager->allocate_one_block(request.guid);
   }
   // insert token to both logical block and physical block
-  request.blocks.back().append_token(token, is_commit);
+  request.blocks.back().append_tokens({token}, is_commit);
   int idx_logical = get_idx_last_logical_token(request);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
   return idx_physical;
@@ -1942,13 +1943,12 @@ int RequestManager::append_token_to_block(Request &request, TokenId token, bool
 void RequestManager::reset_block_table(Request &request){
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
-  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_id_commit);
+  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed);
   // reset this request's logical block table
-  request.blocks.erase(request.blocks.begin() + request.page_id_commit + 1, request.blocks.end());
-
+  request.blocks.erase(request.blocks.begin() + request.page_last_committed + 1, request.blocks.end());
 
-  BlockTable block_table = page_manager->get_block_table(request.guid);
   assert(request.blocks.size() == block_table_indices.size());
   return;
 }

From 6b4777ec9853be2e68ccf7ca8649b55a82967230 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 15:14:04 -0700
Subject: [PATCH 535/667] fix last commit index

---
 include/flexflow/request_manager.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index dbed4560e..37b03f0d6 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -148,8 +148,8 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  //page attention
-  int page_last_committed = 0;
+  //page attention, page_last_committed should be -1 because there are no blocks at the beginning
+  int page_last_committed = -1;
   std::vector<LogicalTokenBlock> blocks;
 
   // TokenTree speculative_token_tree;
@@ -523,7 +523,7 @@ class RequestManager {
   int get_len_last_block(Request &request) const;
   int get_idx_last_logical_token(Request &request) const;
   int idx_logical_to_physical(Request &request, int idx_logical);
-  void _append_logical_block_to_request(
+  void _append_block_to_request(
     Request &request, bool is_commit);
   int append_token_to_block(Request &request, TokenId token, bool is_commit);
   void reset_block_table(Request &request);

From b02d7633ed413dbbba29fc3326dac3d276eceac3 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 17:38:00 -0700
Subject: [PATCH 536/667] chore: minor

---
 include/flexflow/batch_config.h | 2 +-
 src/ops/arg_topk.cu             | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index ff48bb17f..053fc002d 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -86,7 +86,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
-  inline static int const MAX_NUM_TOKENS = 1024;
+  inline static int const MAX_NUM_TOKENS = 2048;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
   inline static int const MAX_TREE_DEPTH = 8;
   inline static int const MAX_TREE_WIDTH = 16;
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index fbeb5497c..2d36aee80 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -95,6 +95,9 @@ void ArgTopK::forward_kernel(
     bool renormalize,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
+  if (bc->prompt_phase) {
+    return;
+  }
   assert(bc->num_active_requests() >= 0);
   if (m->device_resources.find(stream) == m->device_resources.end()) {
     m->device_resources[stream] = new raft::device_resources(stream);

From 8394f15800b26d2035f33a054678c1fea4f66c15 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 17:38:45 -0700
Subject: [PATCH 537/667] fix request id error

---
 src/runtime/request_manager.cc | 37 +++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 605c6ff1c..19217e18a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1095,6 +1095,8 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
 
+      // printf("in prefilling: page_last_committed: %d, request->blocks.size(): %d\n", request->page_last_committed, request->blocks.size());
+      // assert(request->page_last_committed < static_cast<int>(request->blocks.size()));
       append_token_to_block(*request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
@@ -1103,7 +1105,10 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     }
     //update related page info in batch config
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
+    printf("num kv pages: %d\n", bc.requestsInfo[request_index].num_kv_pages);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
+    printf("kv last page len: %d\n", bc.requestsInfo[request_index].kv_last_page_len);
+    bc.requestsInfo[request_index].request_guid = request->guid;
   }
   bc.num_tokens = num_tokens;
 
@@ -1499,6 +1504,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     //page attention: before commit token, reset the pages assigned by cleaning all the tokens
     std::vector<int> block_table_before_commit = page_manager->get_block_table_indices(guid);
     // also need to reset the pages
+    reset_block_table(request);
 
 
     int token_offset = request.first_token_offset_in_batch;
@@ -1522,10 +1528,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
       
+      // assert(request.page_last_committed < request.blocks.size());
+      printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index - request.first_token_offset_in_batch;
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
-      reset_block_table(request);
 
 
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
@@ -1554,6 +1561,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           token_tree_index++;
 
           // Append the token to the block
+          printf("in verify spec tree: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
+          // assert(request.page_last_committed < request.blocks.size());
           append_token_to_block(request, tree_node->id, false);
         }
       }
@@ -1570,6 +1579,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
 
     // Copy the streaming cache info
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+
+    // page attention information
+    new_bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
+    new_bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
   if (verbose) {
@@ -1883,6 +1897,8 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 
 /* --------- Page Attention Related Functions --------- */
 int RequestManager::get_num_blocks_allocated(Request &request) const {
+  // needs some assertion
+  assert(request.blocks.size() == PageManager::get_page_manager()->get_block_table_indices(request.guid).size());
   return request.blocks.size();
 }
 
@@ -1907,30 +1923,39 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
 }
 
-void RequestManager::_append_logical_block_to_request(
+// this will allocate one logical block and one physical block to the request
+void RequestManager::_append_block_to_request(
     Request &request, bool is_commit) {
+  PageManager *page_manager = PageManager::get_page_manager();
+  assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   // Append the logical block to the request
   // page attention: in this function we need to remember the last logical block number that still contains committed tokens
   LogicalTokenBlock block(request.blocks.size(),
                                   kPagesize);
   request.blocks.push_back(block);
-  PageManager *page_manager = PageManager::get_page_manager();
   page_manager->allocate_one_block(request.guid);
+  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
     request.page_last_committed++;
-    assert(request.page_last_committed < request.blocks.size());
+    int size_blocks = request.blocks.size();
+    if (request.page_last_committed >= size_blocks) {
+      printf("request page_last_committed: %d, size_blocks) {: %d\n", request.page_last_committed, size_blocks);
+      assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+    }
   }
 }
 
 //this function is used for appending a token to the last logical block and also the last physical block
 //it will return the physical position of this token
 int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
+  // assert(request.page_last_committed < request.blocks.size());
   if (request.blocks.empty() ||
       request.blocks.back().is_full()) {
     PageManager *page_manager = PageManager::get_page_manager();
     // Append a new logical block
-    _append_logical_block_to_request(request, is_commit);
+    _append_block_to_request(request, is_commit);
     // also allocate one physical page
   }
   // insert token to both logical block and physical block
@@ -1943,6 +1968,8 @@ int RequestManager::append_token_to_block(Request &request, TokenId token, bool
 void RequestManager::reset_block_table(Request &request){
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
+  assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
   page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed);

From 34b3f373034d643458cbef60080ed45bc70b287a Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 18:00:08 -0700
Subject: [PATCH 538/667] fix: allreduce should handle `elts==0`

---
 deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
index 959f52d3d..619eb8987 100644
--- a/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
+++ b/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu
@@ -437,6 +437,10 @@ void customAllReduce(AllReduceParams &params,
   params.local_output_buffer_ptr = data;
   params.elts_total = elts;
 
+  if (elts == 0) {
+    return;
+  }
+
   if (dataType == DT_FLOAT) {
     invokeOneOrTwoShotAllReduceKernel<float>(params, strat, stream);
   } else if (dataType == DT_HALF) {

From 2ec8b5b1922d52ecbe885e689df0b2a93aa4a6be Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 10 Oct 2024 19:18:16 -0700
Subject: [PATCH 539/667] fix spec token num

---
 src/runtime/request_manager.cc | 42 ++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 19217e18a..066609aa7 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1529,7 +1529,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           committed_tokens.at(committed_token_index);
       
       // assert(request.page_last_committed < request.blocks.size());
-      printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
+      // printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index - request.first_token_offset_in_batch;
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
@@ -1561,13 +1561,15 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           token_tree_index++;
 
           // Append the token to the block
-          printf("in verify spec tree: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
+          // printf("in verify spec tree: page_last_committed: %d, request->blocks.size(): %d ", request.page_last_committed, request.blocks.size());
+          // printf("in verify spec tree: last page len: %d\n", get_len_last_block(request));
           // assert(request.page_last_committed < request.blocks.size());
           append_token_to_block(request, tree_node->id, false);
         }
       }
       layer_index++;
     }
+    printf("there are %d tokens in the token tree\n", token_tree_index);
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
@@ -1919,7 +1921,11 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   // get physical indices
   PageManager *page_manager = PageManager::get_page_manager();
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
-  assert(request.blocks.size() == block_table_indices.size());
+  if (request.blocks.size() != block_table_indices.size()) {
+    printf("page manager get block table indices: %d, request.blocks.size(): %d\n", page_manager->get_block_table_indices(request.guid).size(), request.blocks.size());
+    printf("request.blocks.size(): %d, block_table_indices.size(): %d\n", request.blocks.size(), block_table_indices.size());
+    assert(request.blocks.size() == block_table_indices.size());
+  }
   return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
 }
 
@@ -1951,15 +1957,17 @@ void RequestManager::_append_block_to_request(
 //it will return the physical position of this token
 int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
   // assert(request.page_last_committed < request.blocks.size());
+  PageManager *page_manager = PageManager::get_page_manager();
   if (request.blocks.empty() ||
       request.blocks.back().is_full()) {
-    PageManager *page_manager = PageManager::get_page_manager();
     // Append a new logical block
     _append_block_to_request(request, is_commit);
+    assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
     // also allocate one physical page
   }
   // insert token to both logical block and physical block
   request.blocks.back().append_tokens({token}, is_commit);
+  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
   return idx_physical;
@@ -1972,14 +1980,34 @@ void RequestManager::reset_block_table(Request &request){
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
-  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed);
+  assert(block_table_indices.size() > request.page_last_committed);
+  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed - 1);
   // reset this request's logical block table
-  request.blocks.erase(request.blocks.begin() + request.page_last_committed + 1, request.blocks.end());
+  if (request.page_last_committed < static_cast<int>(request.blocks.size())) {
+    request.blocks.erase(request.blocks.begin() + request.page_last_committed + 1, request.blocks.end());
+  }
+  request.blocks.back().reset_num_spec_tokens();
+  printf("after reset, block now has %d tokens\n", request.blocks.back().get_num_tokens());
+  printf("number of pages allocated: %d\n", page_manager->get_block_table_indices(request.guid).size());
+  printf("number of blocks: %d\n", request.blocks.size()); 
+  printf("num spec tokens: %d\n", request.blocks.back().get_num_spec_tokens());
+  printf("num committed tokens: %d\n", request.blocks.back().get_num_commit_tokens());
 
-  assert(request.blocks.size() == block_table_indices.size());
+  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   return;
 }
 
+// debug function
+void RequestManager::print_num_tokens(Request &request) {
+  PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  printf("number of blocks: %d", request.blocks.size());
+  printf(" number of pages allocated: %d", block_table_indices.size());
+  printf(" last page length: %d", request.blocks.back().get_num_tokens());
+  printf(" last page spec tokens: %d", request.blocks.back().get_num_spec_tokens());
+  printf(" last page commit tokens: %d\n", request.blocks.back().get_num_commit_tokens());
+}
+
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {

From b12df8c81874e0065e2d2b6617ebbdef67586473 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 10 Oct 2024 21:59:20 -0700
Subject: [PATCH 540/667] fix small error in free_multiple_blocks

---
 src/runtime/page_manager.cc | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index f633b1b52..fa94cf8c5 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -109,7 +109,8 @@ void BlockAllocator::free(PhysicalTokenBlock& block) {
     }
     block.decr_ref_count();
     if (block.ref_count == 0) {
-        free_blocks.push_back(block);
+        printf("put block number: %d back to free_blocks\n", block.get_block_number());
+        free_blocks.push_front(block);
     }else{
         // in current implementation this should not be the case
         throw std::runtime_error("Block is not freed. Ref count: " + std::to_string(block.ref_count));
@@ -131,6 +132,7 @@ int PageManager::allocate_one_block(const RequestGuid& request_guid) {
     PhysicalTokenBlock block = block_allocator.allocate();
     block_table.push_back(block);
     block_tables[request_guid] = block_table;
+    printf("request_guid: %d, block_number: %d\n", request_guid, block.get_block_number());
     return block.get_block_number();
 }
 
@@ -150,6 +152,7 @@ void PageManager::free_request(const RequestGuid& request_guid) {
     return;
 }
 
+// delete the last num_blocks in the request_guid
 void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_blocks) {
     assert(block_tables.find(request_guid) != block_tables.end());
     auto& block_table = block_tables[request_guid];
@@ -158,7 +161,8 @@ void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_
     for (int i = 0; i < num_blocks; i++) {
         block_allocator.free(block_table[num_blocks_allocated - i - 1]);
     }
-    block_table = std::vector<PhysicalTokenBlock>(block_table.begin() + num_blocks, block_table.end());
+    // only keep the first num_blocks_allocated - num_blocks blocks
+    block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks, block_table.end());
     block_tables[request_guid] = block_table;
     return;
 }
@@ -170,7 +174,12 @@ void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_
 
 std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
     std::vector<int> indices;
-    const auto& block_table = block_tables.at(request_guid);
+    const auto& it = block_tables.find(request_guid);
+    if (it == block_tables.end()) {
+        printf("not found request_guid: %d\n", request_guid);
+        return indices;
+    }
+    const auto& block_table = it->second;
     for (const auto& block : block_table) {
         // printf("get block indice block number is: %d\n", block.block_number);
         indices.push_back(block.get_block_number());

From 6298f2aacebf86d548000518f321382197aad175 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 23:18:44 -0700
Subject: [PATCH 541/667] ckpt single request

---
 include/flexflow/page_manager.h              |  2 ++
 include/flexflow/request_manager.h           |  1 +
 src/ops/tree_inc_multihead_self_attention.cu |  4 ++--
 src/runtime/request_manager.cc               | 14 +++++++++++++-
 src/runtime/request_manager.cu               | 17 +++++++++++++++--
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
index d8c41c0f5..fe6792574 100644
--- a/include/flexflow/page_manager.h
+++ b/include/flexflow/page_manager.h
@@ -59,6 +59,8 @@ class LogicalTokenBlock {
     void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
 
     int get_num_tokens() const { return num_tokens; }
+    int get_num_commit_tokens() const { return num_commit_tokens; }
+    int get_num_spec_tokens() const { return num_spec_tokens; }
 
     std::vector<TokenId> get_token_ids() const;
 
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 37b03f0d6..ae819d0fe 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -527,6 +527,7 @@ class RequestManager {
     Request &request, bool is_commit);
   int append_token_to_block(Request &request, TokenId token, bool is_commit);
   void reset_block_table(Request &request);
+  void print_num_tokens(Request &request);
 
   // Token tree related
   void init_token_tree(RequestGuid guid);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 9103bc814..829d61f96 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -149,8 +149,8 @@ __global__ void commit_tokens_kernel(
     }
   }
 
-  int start = kv_indptr[requext_idx_in_batch];
-  int end = kv_indptr[requext_idx_in_batch + 1] - 1;
+  // int start = kv_indptr[requext_idx_in_batch];
+  // int end = kv_indptr[requext_idx_in_batch + 1] - 1;
 
   for (int i = 0; i < *num_committed_tokens; i++) {
     if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 066609aa7..bd01876f7 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1529,10 +1529,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           committed_tokens.at(committed_token_index);
       
       // assert(request.page_last_committed < request.blocks.size());
-      // printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
+      printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index - request.first_token_offset_in_batch;
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
+      printf("id to physical: %d, from physical: %d\n", idx_to_physical, idx_from_physical);
 
 
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
@@ -1941,6 +1942,10 @@ void RequestManager::_append_block_to_request(
                                   kPagesize);
   request.blocks.push_back(block);
   page_manager->allocate_one_block(request.guid);
+  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  for (int i = 0; i < block_table_indices.size(); i++) {
+    printf("block table indices: %d\n", block_table_indices[i]);
+  }
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
@@ -1969,7 +1974,9 @@ int RequestManager::append_token_to_block(Request &request, TokenId token, bool
   request.blocks.back().append_tokens({token}, is_commit);
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
+  // printf("idx_logical: %d\n", idx_logical);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
+  // printf("idx_physical: %d\n", idx_physical);
   return idx_physical;
 }
 
@@ -1992,6 +1999,11 @@ void RequestManager::reset_block_table(Request &request){
   printf("number of blocks: %d\n", request.blocks.size()); 
   printf("num spec tokens: %d\n", request.blocks.back().get_num_spec_tokens());
   printf("num committed tokens: %d\n", request.blocks.back().get_num_commit_tokens());
+  // the indices of block table should be the same as the number of blocks
+  std::vector<int> block_table = page_manager->get_block_table_indices(request.guid);
+  for (int i = 0; i < request.blocks.size(); i++) {
+    printf("block table indices: %d\n", block_table[i]);
+  }
 
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   return;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 1c9d659e6..6dd82761c 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -98,6 +98,7 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
       int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
                   batch_config->requestsInfo[req_idx].first_token_index_in_request;
+      
       q_lens += q_len;
       qk_lens += (q_len * kv_len + 7) / 8;
       indices_offset = indices_lens;
@@ -106,8 +107,12 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       kv_indptr_h[indptr_idx + 1] = batch_config->requestsInfo[req_idx].num_kv_pages + kv_indptr_h[indptr_idx];
 
       assert(batch_config->requestsInfo[req_idx].num_kv_pages == (kv_len + kPagesize - 1) / kPagesize);
-      assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= 64);
+      assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= kPagesize);
       std::vector<int32_t> kv_indices = pm -> get_block_table_indices(batch_config->requestsInfo[req_idx].request_guid);
+      printf("request_guid: %d\n", batch_config->requestsInfo[req_idx].request_guid);
+      printf("kv_indices.size() = %d, kv_len = %d\n", kv_indices.size(), kv_len);
+      printf("kv last page len = %d\n", batch_config->requestsInfo[req_idx].kv_last_page_len);
+      printf("num_kv_pages = %d\n", batch_config->requestsInfo[req_idx].num_kv_pages);
       assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
       for (int i = indices_offset; i < indices_lens; i++) {
         kv_indices_h[i] = kv_indices[i - indices_offset];
@@ -616,7 +621,9 @@ void RequestManager::load_batch_config_task(
       }
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
-    static PageManager *pm = PageManager::get_page_manager();
+    PageManager *pm = PageManager::get_page_manager();
+    // hardcode request 
+    // printf("request has allocated %d pages\n", pm -> get_block_table_indices(1000000).size());
     static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS];
     static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
@@ -760,6 +767,12 @@ void RequestManager::load_batch_config_task(
             handle.tree_verify_attention_metadata->num_kv_heads(),
             handle.tree_verify_attention_metadata->head_dim(),
             kPagesize);
+
+            cudaError_t syncErr = cudaDeviceSynchronize();
+            if (syncErr != cudaSuccess) {
+              printf("Kernel execution error: %s\n", cudaGetErrorString(syncErr));
+              assert(false);
+            }
       }
     }
   }

From c00ddecf4662858d52f9d106d83076f5e7b45617 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 10 Oct 2024 23:26:28 -0700
Subject: [PATCH 542/667] add cleanup

---
 src/runtime/request_manager.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index bd01876f7..51a0a7885 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -622,6 +622,10 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   num_available_requests--;
   request.status = Request::COMPLETED;
 
+  // page attention: free the pages
+  PageManager *page_manager = PageManager::get_page_manager();
+  page_manager->free_request(guid);
+
   // Find the sos and eos in the sequence
   auto bos_it = std::find(
       request.tokens.begin(), request.tokens.end(), this->bos_token_id);

From b1ff32338025ef4c510456f257d384c97db7adec Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Fri, 11 Oct 2024 09:36:03 -0700
Subject: [PATCH 543/667] ckpt before index error in prepare_parameters

---
 src/ops/tree_inc_multihead_self_attention.cu | 10 ++++++++--
 src/runtime/page_manager.cc                  | 11 ++++++-----
 src/runtime/request_manager.cc               | 19 ++++++++++++++-----
 src/runtime/request_manager.cu               |  9 +++++----
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 829d61f96..a51b89a96 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -79,6 +79,13 @@ __global__ void
 
   if (offset < kv_hidden_size) {
     int start = kv_indptr[req_idx];
+    int end = kv_indptr[req_idx + 1] - 1;
+    if (start > end) {
+      printf("Invalid kv_indptr: %d %d\n", start, end);
+    }
+    assert(start <= end && "Invalid kv_indptr");
+    assert(start + (token_abs_idx / kPagesize) <= end &&
+           "Invalid page index");
     int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
     size_t to_k_idx = get_k_entry_offset_verify(
            token_abs_idx, page_idx, num_kv_heads, head_dim),
@@ -160,7 +167,7 @@ __global__ void commit_tokens_kernel(
       }
 
       // int const req_id = committedTokenInfos[i].request_index;
-      int const tok_id = committedTokenInfos[i].token_depth;
+      // int const tok_id = committedTokenInfos[i].token_depth;
       int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
       int const page_from_idx = committedTokenInfos[i].index_in_kv_cache / kPagesize;
 
@@ -172,7 +179,6 @@ __global__ void commit_tokens_kernel(
                  committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim),
              to_v_idx = get_v_entry_offset_verify(
                  committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim);
-      assert(to_k_idx <= from_k_idx);
 
       kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
       kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index fa94cf8c5..534775717 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -137,9 +137,11 @@ int PageManager::allocate_one_block(const RequestGuid& request_guid) {
 }
 
 void PageManager::free_block_table(BlockTable& block_table) {
-    for (auto& block : block_table) {
-            block_allocator.free(block);
-    } 
+    // make it reverse order to free the last allocated block first
+    BlockTable::reverse_iterator rit = block_table.rbegin();
+    for (; rit != block_table.rend(); ++rit) {
+        block_allocator.free(*rit);
+    }
     return;
 }
 
@@ -176,12 +178,11 @@ std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request
     std::vector<int> indices;
     const auto& it = block_tables.find(request_guid);
     if (it == block_tables.end()) {
-        printf("not found request_guid: %d\n", request_guid);
+        printf("page manager not found request_guid: %d\n", request_guid);
         return indices;
     }
     const auto& block_table = it->second;
     for (const auto& block : block_table) {
-        // printf("get block indice block number is: %d\n", block.block_number);
         indices.push_back(block.get_block_number());
     }
     return indices;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 51a0a7885..38f318cdd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -624,6 +624,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 
   // page attention: free the pages
   PageManager *page_manager = PageManager::get_page_manager();
+  printf("free request %d\n", guid);
   page_manager->free_request(guid);
 
   // Find the sos and eos in the sequence
@@ -1109,9 +1110,11 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     }
     //update related page info in batch config
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
-    printf("num kv pages: %d\n", bc.requestsInfo[request_index].num_kv_pages);
+    printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
+    assert(bc.requestsInfo[request_index].num_kv_pages > 0);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
-    printf("kv last page len: %d\n", bc.requestsInfo[request_index].kv_last_page_len);
+    printf("request: %d has %d kv last page len after prefilling\n", request->guid, bc.requestsInfo[request_index].kv_last_page_len);
+    assert(bc.requestsInfo[request_index].kv_last_page_len > 0);
     bc.requestsInfo[request_index].request_guid = request->guid;
   }
   bc.num_tokens = num_tokens;
@@ -1589,7 +1592,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
 
     // page attention information
     new_bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
+    assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
     new_bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
     new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
@@ -1910,6 +1915,10 @@ int RequestManager::get_num_blocks_allocated(Request &request) const {
 }
 
 int RequestManager::get_len_last_block(Request &request) const {
+  int num_tokens = request.blocks.back().get_num_tokens();
+  if (request.blocks.empty()) {
+    return 0;
+  }
   return request.blocks.back().get_num_tokens();
 }
 
@@ -1947,9 +1956,9 @@ void RequestManager::_append_block_to_request(
   request.blocks.push_back(block);
   page_manager->allocate_one_block(request.guid);
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
-  for (int i = 0; i < block_table_indices.size(); i++) {
-    printf("block table indices: %d\n", block_table_indices[i]);
-  }
+  // for (int i = 0; i < block_table_indices.size(); i++) {
+  //   printf("block table indices: %d\n", block_table_indices[i]);
+  // }
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 6dd82761c..3f30268b4 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -105,14 +105,15 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
       q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
       kv_indptr_h[indptr_idx + 1] = batch_config->requestsInfo[req_idx].num_kv_pages + kv_indptr_h[indptr_idx];
+      assert(kv_indptr_h[indptr_idx] >= 0);
 
       assert(batch_config->requestsInfo[req_idx].num_kv_pages == (kv_len + kPagesize - 1) / kPagesize);
       assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= kPagesize);
       std::vector<int32_t> kv_indices = pm -> get_block_table_indices(batch_config->requestsInfo[req_idx].request_guid);
-      printf("request_guid: %d\n", batch_config->requestsInfo[req_idx].request_guid);
-      printf("kv_indices.size() = %d, kv_len = %d\n", kv_indices.size(), kv_len);
-      printf("kv last page len = %d\n", batch_config->requestsInfo[req_idx].kv_last_page_len);
-      printf("num_kv_pages = %d\n", batch_config->requestsInfo[req_idx].num_kv_pages);
+      // printf("request_guid: %d\n", batch_config->requestsInfo[req_idx].request_guid);
+      // printf("kv_indices.size() = %d, kv_len = %d\n", kv_indices.size(), kv_len);
+      // printf("kv last page len = %d\n", batch_config->requestsInfo[req_idx].kv_last_page_len);
+      // printf("num_kv_pages = %d\n", batch_config->requestsInfo[req_idx].num_kv_pages);
       assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
       for (int i = indices_offset; i < indices_lens; i++) {
         kv_indices_h[i] = kv_indices[i - indices_offset];

From 0083c2f11351d9d2592dd95bc61f594049f304a9 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 11 Oct 2024 10:55:05 -0700
Subject: [PATCH 544/667] fix: embedding use real batch_size

---
 src/ops/embedding.cc                 |  3 ++-
 src/ops/fused.cu                     |  1 +
 src/ops/kernels/embedding_kernels.cu | 24 ++++++++++++------------
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index 644d79efe..af8d78973 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -470,7 +470,7 @@ FutureMap Embedding::inference(
   set_argumentmap_for_inference(ff, argmap, batch_outputs[0]);
   size_t machine_view_hash = view->hash();
 
-  IndexLauncher launcher(EMBED_FWD_TASK_ID,
+  IndexLauncher launcher(EMBED_INF_TASK_ID,
                          parallel_is,
                          TaskArgument(NULL, 0),
                          argmap,
@@ -626,6 +626,7 @@ void Embedding::inference_task(Task const *task,
     effective_batch_size = output.domain.get_volume() / out_dim;
     assert(effective_batch_size * in_dim == input.domain.get_volume());
   }
+  effective_batch_size = bc->num_active_tokens();
   forward_kernel_wrapper(
       m, input, output, kernel, in_dim, out_dim, effective_batch_size);
   if (m->inference_debugging) {
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 434b91012..9b2140800 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -839,6 +839,7 @@ __host__ void
               assert(effective_batch_size * in_dim ==
                      my_input_accessor[0].domain.get_volume());
             }
+            effective_batch_size = bc->num_active_tokens();
 
             assert(my_input_accessor[0].data_type == DT_INT32 ||
                    my_input_accessor[0].data_type == DT_INT64);
diff --git a/src/ops/kernels/embedding_kernels.cu b/src/ops/kernels/embedding_kernels.cu
index 22d8161ff..f58d2dde9 100644
--- a/src/ops/kernels/embedding_kernels.cu
+++ b/src/ops/kernels/embedding_kernels.cu
@@ -48,7 +48,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else if (weight.data_type == DT_FLOAT) {
       Internal::forward_kernel(input.get_int32_ptr(),
@@ -58,7 +58,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else if (weight.data_type == DT_DOUBLE) {
       Internal::forward_kernel(input.get_int32_ptr(),
@@ -68,7 +68,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");
@@ -82,7 +82,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else if (weight.data_type == DT_FLOAT) {
       Internal::forward_kernel(input.get_int64_ptr(),
@@ -92,7 +92,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else if (weight.data_type == DT_DOUBLE) {
       Internal::forward_kernel(input.get_int64_ptr(),
@@ -102,7 +102,7 @@ void forward_kernel_wrapper(EmbeddingMeta const *m,
                                out_dim,
                                batch_size,
                                m->aggr,
-                               output.domain.get_volume(),
+                               out_dim * batch_size,
                                stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");
@@ -139,7 +139,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else if (m->output_type[0] == DT_FLOAT) {
       Internal::backward_kernel(input.get_int32_ptr(),
@@ -149,7 +149,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else if (m->output_type[0] == DT_DOUBLE) {
       Internal::backward_kernel(input.get_int32_ptr(),
@@ -159,7 +159,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");
@@ -173,7 +173,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else if (m->output_type[0] == DT_FLOAT) {
       Internal::backward_kernel(input.get_int64_ptr(),
@@ -183,7 +183,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else if (m->output_type[0] == DT_DOUBLE) {
       Internal::backward_kernel(input.get_int64_ptr(),
@@ -193,7 +193,7 @@ void backward_kernel_wrapper(EmbeddingMeta const *m,
                                 out_dim,
                                 batch_size,
                                 m->aggr,
-                                output.domain.get_volume(),
+                                out_dim * batch_size,
                                 stream);
     } else {
       assert(false && "Unsupported DataType in Embedding");

From 869d32643d952493e146e346d80447c9ce1666a1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 11 Oct 2024 11:27:11 -0700
Subject: [PATCH 545/667] fix: residualRMSNorm uses real batch size

---
 .../flexflow/ops/kernels/residual_rms_norm_kernels.h |  3 ++-
 src/ops/fused.cu                                     |  3 ++-
 src/ops/kernels/residual_rms_norm_kernels.cu         | 12 ++++++++----
 src/ops/residual_rms_norm.cc                         |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h
index 0eef4ca72..084898710 100644
--- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h
+++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h
@@ -47,7 +47,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m,
                             GenericTensorAccessorR const &input2,
                             GenericTensorAccessorR const &weight,
                             GenericTensorAccessorW const &residual_output,
-                            GenericTensorAccessorW const &output);
+                            GenericTensorAccessorW const &output,
+                            int batch_size);
 } // namespace ResidualRMSNorm
 } // namespace Kernels
 } // namespace FlexFlow
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 9b2140800..587e49777 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -903,7 +903,8 @@ __host__ void
                 my_input_accessor[1],
                 my_weight_accessor[0],
                 my_output_accessor[0],
-                my_output_accessor[1]);
+                my_output_accessor[1],
+                bc->num_active_tokens());
             break;
           }
           case OP_INC_MULTIHEAD_SELF_ATTENTION: {
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 17ac14449..6a3f6a993 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -145,12 +145,13 @@ void forward_kernel(ResidualRMSNormMeta const *m,
                     T const *weight_ptr,
                     T *residual_output_ptr,
                     T *output_ptr,
+                    int batch_size,
                     cudaStream_t stream) {
-
+  assert(batch_size <= m->batch_size);
   std::pair<int, int> kernel1_parallelism =
-      std::make_pair(m->batch_size, kCUDABlockReduceNumThreads);
+      std::make_pair(batch_size, kCUDABlockReduceNumThreads);
   std::pair<int, int> kernel2_parallelism =
-      std::make_pair(m->batch_size, kCUDANumThreads);
+      std::make_pair(batch_size, kCUDANumThreads);
 
   int num_blocks =
       std::max(kernel1_parallelism.first, kernel2_parallelism.first);
@@ -174,7 +175,8 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m,
                             GenericTensorAccessorR const &input2,
                             GenericTensorAccessorR const &weight,
                             GenericTensorAccessorW const &residual_output,
-                            GenericTensorAccessorW const &output) {
+                            GenericTensorAccessorW const &output,
+                            int batch_size) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
   cudaEvent_t t_start, t_end;
@@ -195,6 +197,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m,
                    weight.get_half_ptr(),
                    residual_output.get_half_ptr(),
                    output.get_half_ptr(),
+                   batch_size,
                    stream);
   } else if (output.data_type == DT_FLOAT) {
     forward_kernel(m,
@@ -203,6 +206,7 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m,
                    weight.get_float_ptr(),
                    residual_output.get_float_ptr(),
                    output.get_float_ptr(),
+                   batch_size,
                    stream);
   } else {
     assert(false && "Unsupported data type");
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index 78973a165..edea7a4ad 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -447,7 +447,7 @@ void ResidualRMSNorm::inference_task(Task const *task,
       m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
       m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime);
-  forward_kernel_wrapper(m, input1, input2, weight, residual_output, output);
+  forward_kernel_wrapper(m, input1, input2, weight, residual_output, output, bc->num_tokens);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];

From 8a3975a9f4197ff51ef7d42a1f908fe628b83887 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Fri, 11 Oct 2024 12:27:50 -0700
Subject: [PATCH 546/667] fix token error in prepare_batch_config

---
 src/runtime/request_manager.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 38f318cdd..466a920ba 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1102,6 +1102,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
 
       // printf("in prefilling: page_last_committed: %d, request->blocks.size(): %d\n", request->page_last_committed, request->blocks.size());
       // assert(request->page_last_committed < static_cast<int>(request->blocks.size()));
+      assert(request->llm_prefill_len + idx < request->tokens.size());
       append_token_to_block(*request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
@@ -1123,6 +1124,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
     bc.print();
   }
+  printf("there are %d requests in the batch in prefilling stage\n", bc.num_available_requests);
   return bc;
 }
 
@@ -1538,7 +1540,12 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       // assert(request.page_last_committed < request.blocks.size());
       printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
-      int idx_from_logical = committed_token.from_index - request.first_token_offset_in_batch;
+      int idx_from_logical = committed_token.from_index;
+      if (idx_from_logical < 0) {
+        printf("idx_from_logical: %d, from_index: %d, first_token_offset_in_batch: %d\n", idx_from_logical, committed_token.from_index, request.first_token_offset_in_batch);
+      }
+      assert(idx_from_logical >= 0);
+      assert(idx_from_logical / kPagesize < block_table_before_commit.size());
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
       printf("id to physical: %d, from physical: %d\n", idx_to_physical, idx_from_physical);
 

From f4e73ea6426aab8b485d3d7f5af9efcc10a65fa3 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Fri, 11 Oct 2024 13:09:29 -0700
Subject: [PATCH 547/667] ckpt, something wrong in the prefilling

---
 src/ops/tree_inc_multihead_self_attention.cu | 23 ++++++++++++++++++--
 src/runtime/request_manager.cc               |  7 ++++--
 src/runtime/request_manager.cu               |  8 +++++++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a51b89a96..022bc43cb 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -54,6 +54,7 @@ __global__ void
                                half *kvCache_ptr,
                                int32_t *kv_indptr,
                                int32_t *kv_page_indices,
+                               bool const *request_available,
                                BatchConfig::PerTokenInfo const *tokenInfos,
                                int const max_num_pages,
                                int num_q_heads,
@@ -66,6 +67,8 @@ __global__ void
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int const token_idx = thread_idx / q_hidden_size;
   int const offset = thread_idx % q_hidden_size;
+
+
   if (token_idx >= num_new_tokens) {
     return;
   }
@@ -73,13 +76,25 @@ __global__ void
   int const req_idx = tokenInfos[token_idx].request_index;
   int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
+
+  // calculate the compact request index in the easiest way
+  // TODO: recheck
+  int req_idx_compact = -1;
+  int cnt = 0;
+  while (cnt < req_idx + 1) {
+    if (request_available[cnt]) {
+      req_idx_compact++;
+    }
+    cnt++;
+  }
+
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
-    int start = kv_indptr[req_idx];
-    int end = kv_indptr[req_idx + 1] - 1;
+    int start = kv_indptr[req_idx_compact];
+    int end = kv_indptr[req_idx_compact + 1] - 1;
     if (start > end) {
       printf("Invalid kv_indptr: %d %d\n", start, end);
     }
@@ -107,6 +122,7 @@ template <typename DT>
 void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream) {
+  // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
     return;
@@ -123,12 +139,15 @@ void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                                          static_cast<half *>(m->kvCache),
                                          m->handle.tree_verify_attention_metadata->kv_indptr,
                                          m->handle.tree_verify_attention_metadata->kv_indices,
+                                         m->request_available,
                                          m->token_infos,
                                          max_num_pages,
                                          m->num_q_heads,
                                          m->num_kv_heads,
                                          m->qk_dim,
                                          num_new_tokens);
+  // cudaStreamSynchronize(stream);
+  // printf("exited update_qkv_in_batch_verify\n");
 }
 
 __global__ void commit_tokens_kernel(
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 466a920ba..4079c0c0d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1112,6 +1112,10 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     //update related page info in batch config
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
     printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
+    if (bc.requestsInfo[request_index].num_kv_pages <= 0){
+      printf("request num tokens in batch: %d\n", bc.requestsInfo[request_index].num_tokens_in_batch);
+      printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
+    }
     assert(bc.requestsInfo[request_index].num_kv_pages > 0);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
     printf("request: %d has %d kv last page len after prefilling\n", request->guid, bc.requestsInfo[request_index].kv_last_page_len);
@@ -1514,8 +1518,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     std::vector<int> block_table_before_commit = page_manager->get_block_table_indices(guid);
     // also need to reset the pages
     reset_block_table(request);
-
-
+    
     int token_offset = request.first_token_offset_in_batch;
 
     // 1. Maintain requestsInfo
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 3f30268b4..5bdd92334 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -122,6 +122,14 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       kv_last_page_len_h[indptr_idx] = batch_config->requestsInfo[req_idx].kv_last_page_len;
       indptr_idx++;
     }
+    // }else{
+    //   q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx];
+    //   q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx];
+    //   kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx];
+    //   qk_indptr_h[indptr_idx + 1] = 0;
+    //   kv_last_page_len_h[indptr_idx] = 0;
+    //   indptr_idx++;
+    // }
   }
 
   // do the copy

From f9d9415ef6894efe915c1bd88905621b93755940 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 11 Oct 2024 19:54:46 -0700
Subject: [PATCH 548/667] fix: SigmoidSiluMulti uses real batch size

---
 include/flexflow/flexflow_c.h                 |  1 +
 include/flexflow/model.h                      |  1 +
 include/flexflow/ops/sigmoid_silu_multi.h     | 13 +++++-
 .../flexflow/ops/sigmoid_silu_multi_params.h  |  1 +
 inference/models/llama.cc                     |  2 +-
 python/flexflow/core/flexflow_cffi.py         |  4 +-
 python/flexflow/serve/models/llama.py         |  2 +-
 src/c/flexflow_c.cc                           |  3 +-
 src/ops/embedding.cc                          |  1 +
 src/ops/fused.cu                              |  5 ++-
 src/ops/kernels/residual_rms_norm_kernels.cu  |  1 +
 src/ops/sigmoid_silu_multi.cc                 | 43 ++++++++++++++++---
 src/ops/sigmoid_silu_multi.cu                 | 23 +++++++---
 src/runtime/substitution.cc                   |  3 +-
 14 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 1ac5e1fb6..5257a6c49 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -287,6 +287,7 @@ flexflow_tensor_t
     flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle,
                                           flexflow_tensor_t const input1,
                                           flexflow_tensor_t const input2,
+                                          int intermediate_size,
                                           char const *name);
 
 flexflow_tensor_t
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 226105a12..e7974756b 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -580,6 +580,7 @@ class FFModel {
   // Add a sigmoid_silu_multi layer
   Tensor sigmoid_silu_multi(Tensor const input1,
                             Tensor const input2,
+                            int intermediate_size,
                             DataType data_type = DT_NONE,
                             char const *name = NULL);
   // Add a batch_norm layer
diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h
index 604438260..bc07e253e 100644
--- a/include/flexflow/ops/sigmoid_silu_multi.h
+++ b/include/flexflow/ops/sigmoid_silu_multi.h
@@ -19,6 +19,8 @@ class SigmoidSiluMulti : public Op {
                    LayerID const &_layer_guid,
                    const ParallelTensor _input1,
                    const ParallelTensor _input2,
+                   int _intermediate_size,
+                   int _tensor_parallelism_degree,
                    char const *name = nullptr);
   void init(FFModel const &) override;
   void init_inference(FFModel const &,
@@ -68,18 +70,25 @@ class SigmoidSiluMulti : public Op {
   static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m,
                                        GenericTensorAccessorR const &input1,
                                        GenericTensorAccessorR const &input2,
-                                       GenericTensorAccessorW const &output);
+                                       GenericTensorAccessorW const &output,
+                                       int token_size);
+
+public:
+  int intermediate_size, tensor_parallelism_degree;
 };
 
 class SigmoidSiluMultiMeta : public OpMeta {
 public:
   SigmoidSiluMultiMeta(FFHandler handle,
                        SigmoidSiluMulti const *ln,
-                       MemoryAllocator &gpu_mem_allocator);
+                       MemoryAllocator &gpu_mem_allocator,
+                       int _global_intermediate_size,
+                       int _intermediate_size);
   ~SigmoidSiluMultiMeta(void);
 
 public:
   Realm::RegionInstance reserveInst;
+  int global_intermediate_size, intermediate_size;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h
index eb152db5c..0e92c0aa6 100644
--- a/include/flexflow/ops/sigmoid_silu_multi_params.h
+++ b/include/flexflow/ops/sigmoid_silu_multi_params.h
@@ -8,6 +8,7 @@ namespace FlexFlow {
 
 struct SigmoidSiluMultiParams {
   LayerID layer_guid;
+  int intermediate_size, tensor_parallelism_degree;
   char name[MAX_OPNAME];
   bool is_valid(
       std::pair<ParallelTensorShape, ParallelTensorShape> const &) const;
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 81e255d83..a1cd14226 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -215,7 +215,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w3")
                      .c_str());
 
-    Tensor multi = ff.sigmoid_silu_multi(w1, w3);
+    Tensor multi = ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size);
 
     w2 =
         ff.dense(multi,
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index dcdda6698..c09722fb4 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -2046,10 +2046,10 @@ def add_bias_residual_layer_norm(
             handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM
         ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM)
 
-    def sigmoid_silu_multi(self, input1, input2, name=None):
+    def sigmoid_silu_multi(self, input1, input2, intermediate_size, name=None):
         c_name = get_c_name(name)
         handle = ffc().flexflow_model_add_sigmoid_silu_multi(
-            self.handle, input1.handle, input2.handle, c_name
+            self.handle, input1.handle, input2.handle, intermediate_size, c_name
         )
         self.add_layer(OpType.SIGMOID_SILU_MULTI, name)
         return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI)
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index f1a5d4abf..503a4b40f 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -209,7 +209,7 @@ def build_model(self, max_tokens_per_batch):
                 False,
                 name=f"layers_{i}_feed_forward_w3",
             )
-            multi = ffmodel.sigmoid_silu_multi(w1, w3)
+            multi = ffmodel.sigmoid_silu_multi(w1, w3, self.llama_config.intermediate_size)
             w2 = ffmodel.dense(
                 multi,
                 self.llama_config.hidden_size,
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index bba5a3882..ff2be9913 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -748,12 +748,13 @@ flexflow_tensor_t
     flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle_,
                                           flexflow_tensor_t const input1_,
                                           flexflow_tensor_t const input2_,
+                                          int intermediate_size,
                                           char const *name) {
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor const input1 = FFCObjectWrapper::unwrap(input1_);
   Tensor const input2 = FFCObjectWrapper::unwrap(input2_);
   Tensor tensor =
-      handle->sigmoid_silu_multi(input1, input2, input1->data_type, name);
+      handle->sigmoid_silu_multi(input1, input2, intermediate_size, input1->data_type, name);
   DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s",
               tensor,
               input1,
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index af8d78973..4e7b01dca 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -626,6 +626,7 @@ void Embedding::inference_task(Task const *task,
     effective_batch_size = output.domain.get_volume() / out_dim;
     assert(effective_batch_size * in_dim == input.domain.get_volume());
   }
+  // use active batch size
   effective_batch_size = bc->num_active_tokens();
   forward_kernel_wrapper(
       m, input, output, kernel, in_dim, out_dim, effective_batch_size);
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 587e49777..fabd46b1e 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -839,6 +839,7 @@ __host__ void
               assert(effective_batch_size * in_dim ==
                      my_input_accessor[0].domain.get_volume());
             }
+            // use active batch size
             effective_batch_size = bc->num_active_tokens();
 
             assert(my_input_accessor[0].data_type == DT_INT32 ||
@@ -1079,10 +1080,12 @@ __host__ void
             assert(fused->op_num_outputs[op] == 1);
             SigmoidSiluMultiMeta const *m =
                 (SigmoidSiluMultiMeta *)metas->meta[op];
+            // use active number of tokens
             SigmoidSiluMulti::inference_kernel_wrapper(m,
                                                        my_input_accessor[0],
                                                        my_input_accessor[1],
-                                                       my_output_accessor[0]);
+                                                       my_output_accessor[0],
+                                                       bc->num_active_tokens());
             break;
           }
           case OP_SOFTMAX: {
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 6a3f6a993..2c82308ab 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -148,6 +148,7 @@ void forward_kernel(ResidualRMSNormMeta const *m,
                     int batch_size,
                     cudaStream_t stream) {
   assert(batch_size <= m->batch_size);
+  // use active batch size
   std::pair<int, int> kernel1_parallelism =
       std::make_pair(batch_size, kCUDABlockReduceNumThreads);
   std::pair<int, int> kernel2_parallelism =
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index 3b3d75146..e866a5e05 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -41,7 +41,9 @@ using Legion::TaskLauncher;
 
 bool operator==(SigmoidSiluMultiParams const &lhs,
                 SigmoidSiluMultiParams const &rhs) {
-  return lhs.layer_guid == rhs.layer_guid;
+  return lhs.layer_guid == rhs.layer_guid &&
+         lhs.intermediate_size == rhs.intermediate_size &&
+         lhs.tensor_parallelism_degree == rhs.tensor_parallelism_degree;
 }
 
 bool SigmoidSiluMultiParams::is_valid(
@@ -52,6 +54,8 @@ bool SigmoidSiluMultiParams::is_valid(
 SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const {
   SigmoidSiluMultiParams params;
   params.layer_guid = this->layer_guid;
+  params.intermediate_size = this->intermediate_size;
+  params.tensor_parallelism_degree = this->tensor_parallelism_degree;
   if (this->name != nullptr) {
     strcpy(params.name, this->name);
   }
@@ -60,6 +64,7 @@ SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const {
 
 Tensor FFModel::sigmoid_silu_multi(const Tensor input1,
                                    const Tensor input2,
+                                   int intermediate_size,
                                    DataType data_type,
                                    char const *name) {
 
@@ -94,6 +99,8 @@ Tensor FFModel::sigmoid_silu_multi(const Tensor input1,
                          casted_input2);
   ssm->outputs[0] = create_tensor_legion_ordering(
       input1->num_dims, input1->dims, data_type, ssm, 0, false /*create_grad*/);
+  ssm->add_int_property("intermediate_size", intermediate_size);
+  ssm->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree);
   layers.push_back(ssm);
   return ssm->outputs[0];
 }
@@ -102,9 +109,15 @@ Op *SigmoidSiluMulti::create_operator_from_layer(
     FFModel &model,
     Layer const *layer,
     std::vector<ParallelTensor> const &inputs) {
-
+  long long value;
+  layer->get_int_property("intermediate_size", value);
+  int intermediate_size = value;
+  layer->get_int_property("tensor_parallelism_degree", value);
+  int tensor_parallelism_degree = value;
   return new SigmoidSiluMulti(
-      model, layer->layer_guid, inputs[0], inputs[1], layer->name);
+      model, layer->layer_guid, inputs[0], inputs[1], 
+      intermediate_size, tensor_parallelism_degree,
+      layer->name);
 }
 
 SigmoidSiluMulti::SigmoidSiluMulti(
@@ -113,12 +126,15 @@ SigmoidSiluMulti::SigmoidSiluMulti(
     std::pair<ParallelTensor, ParallelTensor> const &inputs,
     char const *name)
     : SigmoidSiluMulti(
-          model, params.layer_guid, inputs.first, inputs.second, params.name) {}
+          model, params.layer_guid, inputs.first, inputs.second, 
+          params.intermediate_size, params.tensor_parallelism_degree, params.name) {}
 
 SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model,
                                    LayerID const &_layer_guid,
                                    const ParallelTensor _input1,
                                    const ParallelTensor _input2,
+                                   int _intermediate_size,
+                                   int _tensor_parallelism_degree,
                                    char const *name)
     : Op(model,
          OP_SIGMOID_SILU_MULTI,
@@ -128,7 +144,9 @@ SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model,
          0 /*weights*/,
          1 /*outputs*/,
          _input1,
-         _input2) {
+         _input2),
+      intermediate_size(_intermediate_size),
+      tensor_parallelism_degree(_tensor_parallelism_degree) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
   outputs[0] = model.create_parallel_tensor_legion_ordering(_input1->num_dims,
@@ -242,8 +260,9 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task,
                        .best_affinity_to(task->target_proc)
                        .first();
   MemoryAllocator gpu_mem_allocator(gpu_mem);
+  int intermediate_size = ssm->intermediate_size / ssm->tensor_parallelism_degree;
   SigmoidSiluMultiMeta *meta =
-      new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator);
+      new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator, ssm->intermediate_size, intermediate_size);
   meta->input_type[0] = ssm->inputs[0]->data_type;
   meta->input_type[1] = ssm->inputs[1]->data_type;
   meta->output_type[0] = ssm->outputs[0]->data_type;
@@ -350,7 +369,8 @@ void SigmoidSiluMulti::inference_task(
   assert(input1_domain == input2_domain);
   assert(input1_domain == output_domain);
 
-  SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output);
+  // use active number of tokens
+  SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output, bc->num_active_tokens());
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
@@ -369,6 +389,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const {
   sez.serialize(this->layer_guid.id);
   sez.serialize(this->layer_guid.transformer_layer_id);
   sez.serialize(this->layer_guid.model_id);
+  sez.serialize(this->intermediate_size);
+  sez.serialize(this->tensor_parallelism_degree);
   sez.serialize(strlen(this->name));
   sez.serialize(this->name, strlen(this->name));
 }
@@ -381,9 +403,12 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff,
                                    int num_inputs) {
   assert(num_inputs == 2);
   size_t id, transformer_layer_id, deserialized_model_id;
+  int intermediate_size, tensor_parallelism_degree;
   dez.deserialize(id);
   dez.deserialize(transformer_layer_id);
   dez.deserialize(deserialized_model_id);
+  dez.deserialize(intermediate_size);
+  dez.deserialize(tensor_parallelism_degree);
   size_t name_len;
   char name[MAX_OPNAME] = {0};
   dez.deserialize(name_len);
@@ -392,6 +417,8 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff,
 
   SigmoidSiluMultiParams params;
   params.layer_guid = layer_guid;
+  params.intermediate_size = intermediate_size;
+  params.tensor_parallelism_degree = tensor_parallelism_degree;
   strcpy(params.name, name);
   return ff.get_or_create_node<SigmoidSiluMulti>({inputs[0], inputs[1]},
                                                  params);
@@ -406,6 +433,8 @@ size_t hash<FlexFlow::SigmoidSiluMultiParams>::operator()(
   hash_combine(key, params.layer_guid.id);
   hash_combine(key, params.layer_guid.transformer_layer_id);
   hash_combine(key, params.layer_guid.model_id);
+  hash_combine(key, params.intermediate_size);
+  hash_combine(key, params.tensor_parallelism_degree);
   return key;
 }
 }; // namespace std
diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu
index 590b641b5..962777ff3 100644
--- a/src/ops/sigmoid_silu_multi.cu
+++ b/src/ops/sigmoid_silu_multi.cu
@@ -21,10 +21,14 @@ namespace FlexFlow {
 
 SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle,
                                            SigmoidSiluMulti const *ssm,
-                                           MemoryAllocator &gpu_mem_allocator)
+                                           MemoryAllocator &gpu_mem_allocator,
+                                           int _global_intermediate_size,
+                                           int _intermediate_size)
     : OpMeta(handle) {
   profiling = ssm->profiling;
   inference_debugging = ssm->inference_debugging;
+  global_intermediate_size = _global_intermediate_size;
+  intermediate_size = _intermediate_size;
 }
 
 SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) {
@@ -50,13 +54,18 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
     SigmoidSiluMultiMeta const *m,
     GenericTensorAccessorR const &input1,
     GenericTensorAccessorR const &input2,
-    GenericTensorAccessorW const &output) {
+    GenericTensorAccessorW const &output,
+    int token_size) {
+  if (token_size == 0) {
+    return;
+  }
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
 
-  int num_elements = input1.domain.get_volume();
-  assert(input2.domain.get_volume() == num_elements);
-  assert(output.domain.get_volume() == num_elements);
+  assert(input2.domain.get_volume() == input1.domain.get_volume());
+  assert(output.domain.get_volume() == input1.domain.get_volume());
+
+  int num_elements = token_size * m->intermediate_size;
 
   cudaEvent_t t_start, t_end;
   if (m->profiling) {
@@ -68,7 +77,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
     SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements),
                              min(CUDA_NUM_THREADS, num_elements),
                              0,
-                             stream>>>(input1.domain.get_volume(),
+                             stream>>>(num_elements,
                                        input1.get_float_ptr(),
                                        input2.get_float_ptr(),
                                        output.get_float_ptr());
@@ -76,7 +85,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper(
     SigmoidSiluMultiKernel<<<GET_BLOCKS(num_elements),
                              min(CUDA_NUM_THREADS, num_elements),
                              0,
-                             stream>>>(input1.domain.get_volume(),
+                             stream>>>(num_elements,
                                        input1.get_half_ptr(),
                                        input2.get_half_ptr(),
                                        output.get_half_ptr());
diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc
index b86964049..7625deb9f 100644
--- a/src/runtime/substitution.cc
+++ b/src/runtime/substitution.cc
@@ -3826,9 +3826,8 @@ bool FFModel::convert_graph_to_operators(
       case OP_SIGMOID_SILU_MULTI: {
         assert(inList.size() == 2);
         SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr;
-        SigmoidSiluMultiParams params = ssm->get_params();
         new_op = new SigmoidSiluMulti(
-            *this, ssm->layer_guid, inputs[0], inputs[1], NULL);
+            *this, ssm->layer_guid, inputs[0], inputs[1], ssm->intermediate_size, ssm->tensor_parallelism_degree, NULL);
         break;
       }
       default: {

From 6bb79ddee2f73917f1e2122be00867bcfddd917f Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 11 Oct 2024 19:56:07 -0700
Subject: [PATCH 549/667] style: format

---
 inference/models/llama.cc     |  3 ++-
 src/c/flexflow_c.cc           |  4 ++--
 src/ops/residual_rms_norm.cc  |  3 ++-
 src/ops/sigmoid_silu_multi.cc | 37 +++++++++++++++++++++++------------
 src/runtime/substitution.cc   |  9 +++++++--
 5 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index a1cd14226..5a3c6ed00 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -215,7 +215,8 @@ void LLAMA::create_llama_model(FFModel &ff,
                  std::string("layers_" + std::to_string(i) + "_feed_forward_w3")
                      .c_str());
 
-    Tensor multi = ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size);
+    Tensor multi =
+        ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size);
 
     w2 =
         ff.dense(multi,
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index ff2be9913..f37955f02 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -753,8 +753,8 @@ flexflow_tensor_t
   FFModel *handle = FFCObjectWrapper::unwrap(handle_);
   Tensor const input1 = FFCObjectWrapper::unwrap(input1_);
   Tensor const input2 = FFCObjectWrapper::unwrap(input2_);
-  Tensor tensor =
-      handle->sigmoid_silu_multi(input1, input2, intermediate_size, input1->data_type, name);
+  Tensor tensor = handle->sigmoid_silu_multi(
+      input1, input2, intermediate_size, input1->data_type, name);
   DEBUG_PRINT("[SigmoidSiluMulti] new Tensor %p, input1 %p, input2 %p, name %s",
               tensor,
               input1,
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index edea7a4ad..9cea6421f 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -447,7 +447,8 @@ void ResidualRMSNorm::inference_task(Task const *task,
       m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
       m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime);
-  forward_kernel_wrapper(m, input1, input2, weight, residual_output, output, bc->num_tokens);
+  forward_kernel_wrapper(
+      m, input1, input2, weight, residual_output, output, bc->num_tokens);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index e866a5e05..9d1261123 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -100,7 +100,8 @@ Tensor FFModel::sigmoid_silu_multi(const Tensor input1,
   ssm->outputs[0] = create_tensor_legion_ordering(
       input1->num_dims, input1->dims, data_type, ssm, 0, false /*create_grad*/);
   ssm->add_int_property("intermediate_size", intermediate_size);
-  ssm->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree);
+  ssm->add_int_property("tensor_parallelism_degree",
+                        config.tensor_parallelism_degree);
   layers.push_back(ssm);
   return ssm->outputs[0];
 }
@@ -114,10 +115,13 @@ Op *SigmoidSiluMulti::create_operator_from_layer(
   int intermediate_size = value;
   layer->get_int_property("tensor_parallelism_degree", value);
   int tensor_parallelism_degree = value;
-  return new SigmoidSiluMulti(
-      model, layer->layer_guid, inputs[0], inputs[1], 
-      intermediate_size, tensor_parallelism_degree,
-      layer->name);
+  return new SigmoidSiluMulti(model,
+                              layer->layer_guid,
+                              inputs[0],
+                              inputs[1],
+                              intermediate_size,
+                              tensor_parallelism_degree,
+                              layer->name);
 }
 
 SigmoidSiluMulti::SigmoidSiluMulti(
@@ -125,9 +129,13 @@ SigmoidSiluMulti::SigmoidSiluMulti(
     SigmoidSiluMultiParams const &params,
     std::pair<ParallelTensor, ParallelTensor> const &inputs,
     char const *name)
-    : SigmoidSiluMulti(
-          model, params.layer_guid, inputs.first, inputs.second, 
-          params.intermediate_size, params.tensor_parallelism_degree, params.name) {}
+    : SigmoidSiluMulti(model,
+                       params.layer_guid,
+                       inputs.first,
+                       inputs.second,
+                       params.intermediate_size,
+                       params.tensor_parallelism_degree,
+                       params.name) {}
 
 SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model,
                                    LayerID const &_layer_guid,
@@ -260,9 +268,13 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task,
                        .best_affinity_to(task->target_proc)
                        .first();
   MemoryAllocator gpu_mem_allocator(gpu_mem);
-  int intermediate_size = ssm->intermediate_size / ssm->tensor_parallelism_degree;
-  SigmoidSiluMultiMeta *meta =
-      new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator, ssm->intermediate_size, intermediate_size);
+  int intermediate_size =
+      ssm->intermediate_size / ssm->tensor_parallelism_degree;
+  SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle,
+                                                        ssm,
+                                                        gpu_mem_allocator,
+                                                        ssm->intermediate_size,
+                                                        intermediate_size);
   meta->input_type[0] = ssm->inputs[0]->data_type;
   meta->input_type[1] = ssm->inputs[1]->data_type;
   meta->output_type[0] = ssm->outputs[0]->data_type;
@@ -370,7 +382,8 @@ void SigmoidSiluMulti::inference_task(
   assert(input1_domain == output_domain);
 
   // use active number of tokens
-  SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output, bc->num_active_tokens());
+  SigmoidSiluMulti::inference_kernel_wrapper(
+      m, input1, input2, output, bc->num_active_tokens());
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc
index 7625deb9f..176133c49 100644
--- a/src/runtime/substitution.cc
+++ b/src/runtime/substitution.cc
@@ -3826,8 +3826,13 @@ bool FFModel::convert_graph_to_operators(
       case OP_SIGMOID_SILU_MULTI: {
         assert(inList.size() == 2);
         SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr;
-        new_op = new SigmoidSiluMulti(
-            *this, ssm->layer_guid, inputs[0], inputs[1], ssm->intermediate_size, ssm->tensor_parallelism_degree, NULL);
+        new_op = new SigmoidSiluMulti(*this,
+                                      ssm->layer_guid,
+                                      inputs[0],
+                                      inputs[1],
+                                      ssm->intermediate_size,
+                                      ssm->tensor_parallelism_degree,
+                                      NULL);
         break;
       }
       default: {

From 4eeb021c6b1a0f9564252465fa4ce0ea61ccfff3 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Fri, 11 Oct 2024 22:05:41 -0700
Subject: [PATCH 550/667] ckpt

---
 src/ops/tree_inc_multihead_self_attention.cu |  9 +++-
 src/runtime/page_manager.cc                  | 10 +++-
 src/runtime/request_manager.cc               | 51 +++++++++++++++-----
 src/runtime/request_manager.cu               |  3 ++
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 022bc43cb..d49de51c1 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -88,6 +88,8 @@ __global__ void
     cnt++;
   }
 
+  assert(req_idx_compact >= 0 && "Invalid request index");
+
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(qkv_proj_array[from_idx + offset]);
@@ -435,7 +437,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       DT *output_ptr,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
-
+  printf("entered inference_kernel\n");
   //   int device;
   //   checkCUDA(cudaGetDevice(&device));
   //   cudaEvent_t t_start, t_end;
@@ -611,6 +613,11 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   //   delete[] temp_output;
   // }
+  cudaError_t err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+      std::cerr << "Kernel launch failed with error: " << cudaGetErrorString(err) << std::endl;
+  }
+  printf("exited inference_kernel\n");
 }
 
 } // namespace TreeIncMultiHeadAttention
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 534775717..6cb3c2b56 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -64,6 +64,7 @@ void LogicalTokenBlock::append_tokens(const std::vector<TokenId>& token_ids_to_a
     assert(num_spec_tokens + num_commit_tokens == num_tokens);
     assert(num_tokens <= block_size);
     if (num_tokens + token_ids_to_append.size() > block_size) {
+        printf("block is full! Cannot append more tokens\n");
         throw std::runtime_error("Block is full! Cannot append more tokens.");
     }
     token_ids.insert(token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end());
@@ -94,6 +95,7 @@ BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) {
 // Allocate a block
 PhysicalTokenBlock BlockAllocator::allocate() {
     if (free_blocks.empty()) {
+        printf("no free blocks are available\n");
         throw std::runtime_error("Out of memory! No free blocks are available.");
     }
     PhysicalTokenBlock block = free_blocks.front();
@@ -105,14 +107,16 @@ PhysicalTokenBlock BlockAllocator::allocate() {
 // Free a block
 void BlockAllocator::free(PhysicalTokenBlock& block) {
     if (block.ref_count == 0) {
+        printf("block is already freed\n");
         throw std::runtime_error("Double free! Block is already freed.");
     }
     block.decr_ref_count();
     if (block.ref_count == 0) {
         printf("put block number: %d back to free_blocks\n", block.get_block_number());
-        free_blocks.push_front(block);
+        free_blocks.push_back(block);
     }else{
         // in current implementation this should not be the case
+        printf("block is not freed. Ref count: %d\n", block.ref_count);
         throw std::runtime_error("Block is not freed. Ref count: " + std::to_string(block.ref_count));
     }
 }
@@ -123,7 +127,9 @@ int BlockAllocator::get_num_free_blocks() const {
 
 PageManager::PageManager(int block_size, int num_total_blocks)
     : block_size(block_size), num_total_blocks(num_total_blocks),
-      block_allocator(block_size, num_total_blocks) {}
+      block_allocator(block_size, num_total_blocks) {
+        printf("page manager init with block_size: %d, num_total_blocks: %d\n", block_size, num_total_blocks);
+      }
 
 //return the physical number of this block
 int PageManager::allocate_one_block(const RequestGuid& request_guid) {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4079c0c0d..180f8c470 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -29,6 +29,9 @@
 #include <stdexcept>
 #include <thread>
 #include <vector>
+#include <exception>
+#include <cstdlib>
+#include <execinfo.h> 
 
 namespace FlexFlow {
 
@@ -37,6 +40,12 @@ using tokenizers::Tokenizer;
 
 Legion::Logger log_req_mgr("RequestManager");
 
+void printStackTrace() {
+    void *array[10];
+    size_t size = backtrace(array, 10);   // Get stack frames
+    backtrace_symbols_fd(array, size, STDERR_FILENO);  // Print stack trace to stderr
+}
+
 bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
                std::shared_ptr<TokenTreeNode> const &rhs) {
   if (lhs->gumbel) {
@@ -613,6 +622,10 @@ void RequestManager::request_update_attainment(int batch_index, bool attained) {
 
 void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
+  if (profiling_requests[guid].finish_time != 0) {
+    printf("some request has been completed!!\n");
+  }
+
   profiling_requests[guid].finish_time =
       Realm::Clock::current_time_in_microseconds();
   Request &request = all_requests[guid];
@@ -1100,8 +1113,6 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
 
-      // printf("in prefilling: page_last_committed: %d, request->blocks.size(): %d\n", request->page_last_committed, request->blocks.size());
-      // assert(request->page_last_committed < static_cast<int>(request->blocks.size()));
       assert(request->llm_prefill_len + idx < request->tokens.size());
       append_token_to_block(*request, request->tokens[request->llm_prefill_len + idx], true);
     }
@@ -1997,9 +2008,9 @@ int RequestManager::append_token_to_block(Request &request, TokenId token, bool
   request.blocks.back().append_tokens({token}, is_commit);
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
-  // printf("idx_logical: %d\n", idx_logical);
+  assert(idx_logical >= 0);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
-  // printf("idx_physical: %d\n", idx_physical);
+  assert(idx_physical >= 0);
   return idx_physical;
 }
 
@@ -2019,14 +2030,14 @@ void RequestManager::reset_block_table(Request &request){
   request.blocks.back().reset_num_spec_tokens();
   printf("after reset, block now has %d tokens\n", request.blocks.back().get_num_tokens());
   printf("number of pages allocated: %d\n", page_manager->get_block_table_indices(request.guid).size());
-  printf("number of blocks: %d\n", request.blocks.size()); 
-  printf("num spec tokens: %d\n", request.blocks.back().get_num_spec_tokens());
-  printf("num committed tokens: %d\n", request.blocks.back().get_num_commit_tokens());
+  // printf("number of blocks: %d\n", request.blocks.size()); 
+  // printf("num spec tokens: %d\n", request.blocks.back().get_num_spec_tokens());
+  // printf("num committed tokens: %d\n", request.blocks.back().get_num_commit_tokens());
   // the indices of block table should be the same as the number of blocks
   std::vector<int> block_table = page_manager->get_block_table_indices(request.guid);
-  for (int i = 0; i < request.blocks.size(); i++) {
-    printf("block table indices: %d\n", block_table[i]);
-  }
+  // for (int i = 0; i < request.blocks.size(); i++) {
+  //   printf("block table indices: %d\n", block_table[i]);
+  // }
 
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   return;
@@ -2428,13 +2439,18 @@ void RequestManager::start_background_server(FFModel *model) {
   background_server_handler = runtime->execute_task(ctx, launcher);
   // Register callbacks for normal exit
   {
+    printf("called exit\n");
     int ret = std::atexit(RequestManager::terminate_background_server_at_exit);
+    printf("return from exit\n");
     assert(ret == 0); // make sure the callback is successfully registered
   }
   // Register callbacks for termination
   {
+    printf("called terminate\n");
     std::set_terminate([]() {
       RequestManager::terminate_background_server_at_exit();
+      printf("return from terminate\n");
+      printStackTrace();
       std::abort();
     });
   }
@@ -2706,11 +2722,25 @@ void RequestManager::terminate_background_server() {
            std::to_string(total_tokens / (total_time / 1e6)) + ")";
 
     double average_latency_per_request = 0;
+
+    // information dump
+    for (auto const &profiling_info : profiling_requests) {
+      int request_id = profiling_info.first;
+      Request &request = all_requests[request_id];
+      if (request.status != Request::COMPLETED) {
+        continue;
+      }
+    }
+
+
+
+
     std::string latency_per_request_ms = "\n latency_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double latency_ms = (profiling_info.second.finish_time -
                            profiling_info.second.start_time) /
                           1000.0;
+
       // latency_per_request_ms += "[" + std::to_string(profiling_info.first)
       // +
       // ","; latency_per_request_ms += std::to_string(latency_ms) + "] ";
@@ -2832,7 +2862,6 @@ void RequestManager::terminate_background_server() {
     goodput_str += std::to_string(goodput);
     goodput_str += ")";
     str += goodput_str;
-
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     request_queue_cv.notify_all();
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 5bdd92334..ba841bbe9 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -405,6 +405,9 @@ void RequestManager::load_batch_config_task(
             handle.incr_attention_metadata->kv_indices,
             handle.incr_attention_metadata->kv_last_page_len,
             handle.incr_attention_metadata->qk_indptr);
+
+            // check on error
+            checkCUDA(cudaGetLastError());
       }
 
       // prepare attention forward handler

From 3ad0ca501d20a43b3c6651414ca70ef081cc8ad4 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Sat, 12 Oct 2024 00:02:17 -0700
Subject: [PATCH 551/667] update

---
 src/ops/tree_inc_multihead_self_attention.cu | 10 ++++------
 src/runtime/request_manager.cc               |  9 ++++-----
 src/runtime/request_manager.cu               |  5 ++++-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d49de51c1..89bc27027 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -437,7 +437,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       DT *output_ptr,
                       DT const *bias_ptr,
                       cudaStream_t stream) {
-  printf("entered inference_kernel\n");
   //   int device;
   //   checkCUDA(cudaGetDevice(&device));
   //   cudaEvent_t t_start, t_end;
@@ -613,11 +612,10 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   //   delete[] temp_output;
   // }
-  cudaError_t err = cudaDeviceSynchronize();
-  if (err != cudaSuccess) {
-      std::cerr << "Kernel launch failed with error: " << cudaGetErrorString(err) << std::endl;
-  }
-  printf("exited inference_kernel\n");
+  // cudaError_t err = cudaDeviceSynchronize();
+  // if (err != cudaSuccess) {
+  //     std::cerr << "Kernel launch failed with error: " << cudaGetErrorString(err) << std::endl;
+  // }
 }
 
 } // namespace TreeIncMultiHeadAttention
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 180f8c470..2b20627fa 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1123,14 +1123,13 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     //update related page info in batch config
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
     printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
-    if (bc.requestsInfo[request_index].num_kv_pages <= 0){
-      printf("request num tokens in batch: %d\n", bc.requestsInfo[request_index].num_tokens_in_batch);
-      printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
+    // WARNING: it is possible that it has no tokens allocated!! but not allowed for flashinfer
+    if (bc.requestsInfo[request_index].num_kv_pages == 0) {
+      // turn this request into not available
+      bc.request_available[request_index] = false;
     }
-    assert(bc.requestsInfo[request_index].num_kv_pages > 0);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
     printf("request: %d has %d kv last page len after prefilling\n", request->guid, bc.requestsInfo[request_index].kv_last_page_len);
-    assert(bc.requestsInfo[request_index].kv_last_page_len > 0);
     bc.requestsInfo[request_index].request_guid = request->guid;
   }
   bc.num_tokens = num_tokens;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index ba841bbe9..24f4a524f 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -407,7 +407,10 @@ void RequestManager::load_batch_config_task(
             handle.incr_attention_metadata->qk_indptr);
 
             // check on error
-            checkCUDA(cudaGetLastError());
+            cudaError_t error = cudaGetLastError();
+            if (error != cudaSuccess) {
+              printf("CUDA error in prepare_inference_params_kernel: %s\n", cudaGetErrorString(error));
+            }
       }
 
       // prepare attention forward handler

From 1bafe66225c6950b039923ed0a80bd345364b1f2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 12 Oct 2024 13:20:14 -0700
Subject: [PATCH 552/667] chore: minor

---
 src/runtime/request_manager.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cfecfb4e5..e7995a198 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2547,8 +2547,7 @@ void RequestManager::terminate_background_server() {
     ttft_per_request_ms += ")";
     str += ttft_per_request_ms;
 
-    std::string per_token_time_per_request_ms =
-        "\n per_token_time_per_request_ms( ";
+    std::string tpot_per_request_ms = "\n tpot_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double per_token_time_ms = 0;
       auto const &request = all_requests[profiling_info.first];
@@ -2558,10 +2557,10 @@ void RequestManager::terminate_background_server() {
             (profiling.finish_time - profiling.start_decoding_time) / 1000.0 /
             (request.tokens.size() - request.llm_prefill_len);
       }
-      per_token_time_per_request_ms += std::to_string(per_token_time_ms) + " ";
+      tpot_per_request_ms += std::to_string(per_token_time_ms) + " ";
     }
-    per_token_time_per_request_ms += ")";
-    str += per_token_time_per_request_ms;
+    tpot_per_request_ms += ")";
+    str += tpot_per_request_ms;
 
     average_latency_per_request /= total_requests;
     str += "\n average_latency_per_request_ms(" +

From 03e8c5bf9cba634939cadb567d781cb918348c29 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 12 Oct 2024 22:12:23 -0700
Subject: [PATCH 553/667] fix: some minor issue

---
 include/flexflow/batch_config.h | 2 +-
 src/ops/arg_topk.cu             | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 053fc002d..ff48bb17f 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -86,7 +86,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
-  inline static int const MAX_NUM_TOKENS = 2048;
+  inline static int const MAX_NUM_TOKENS = 1024;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
   inline static int const MAX_TREE_DEPTH = 8;
   inline static int const MAX_TREE_WIDTH = 16;
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 2d36aee80..fbeb5497c 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -95,9 +95,6 @@ void ArgTopK::forward_kernel(
     bool renormalize,
     /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
     cudaStream_t stream) {
-  if (bc->prompt_phase) {
-    return;
-  }
   assert(bc->num_active_requests() >= 0);
   if (m->device_resources.find(stream) == m->device_resources.end()) {
     m->device_resources[stream] = new raft::device_resources(stream);

From efce3e74de98363c6435a2a5e5ede4f8e76cb913 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 10:49:06 -0700
Subject: [PATCH 554/667] fix: reduce cudaGraph memory consumption

---
 src/ops/fused.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index fabd46b1e..9998831a3 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -610,7 +610,8 @@ __host__ void
 
   // bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE or bc->get_mode()
   // == TREE_VERIFY_MODE);
-  bool use_cuda_graph = (bc->get_mode() == TREE_SEARCH_MODE);
+  bool use_cuda_graph =
+      (bc->get_mode() == TREE_SEARCH_MODE && bc->prompt_phase == 0);
   // bool use_cuda_graph = (bc->get_mode() == TREE_VERIFY_MODE);
   // bool use_cuda_graph = false;
   bool captured = false;

From a2247000ea8b1695f9b18c0228516b4c5bbb0b19 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 13:23:46 -0700
Subject: [PATCH 555/667] feat: add max_output_length

---
 include/flexflow/batch_config.h              |  1 +
 include/flexflow/flexflow_c.h                |  3 ++
 include/flexflow/request_manager.h           |  4 +++
 inference/incr_decoding/incr_decoding.cc     |  8 +++++
 inference/spec_infer/spec_infer.cc           |  8 +++++
 inference/trace_generator/trace_generator.cc | 10 +++++-
 python/flexflow/core/flexflow_cffi.py        |  4 +++
 src/c/flexflow_c.cc                          |  7 ++++
 src/runtime/batch_config.cc                  |  4 +++
 src/runtime/request_manager.cc               | 34 +++++++++++++++++---
 10 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index ff48bb17f..4589f91f3 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -75,6 +75,7 @@ class BatchConfig {
   static int max_tokens_per_prefilling_batch();
   static int max_spec_tree_token_num();
   static int max_sequence_length();
+  static int max_output_length();
   static int get_max_tree_depth();
   friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
   void print() const;
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 5257a6c49..1da5f61d6 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -986,6 +986,9 @@ void flexflow_request_manager_set_max_tokens_per_prefilling_batch(
 void flexflow_request_manager_set_max_sequence_length(
     flexflow_request_manager_t handle_, int max_seq_length);
 
+void flexflow_request_manager_set_max_output_length(
+    flexflow_request_manager_t handle_, int max_output_length);
+
 void flexflow_request_manager_register_tokenizer(
     flexflow_request_manager_t handle_,
     enum ModelType model_type,
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 42b62fe3f..ebfd31b3d 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -216,6 +216,7 @@ struct Request {
   double get_length_weight();
   void set_slo_ratio(double slo_ratio_);
   double get_slo_ratio();
+  int decode_length() const;
 
   Request() {
     std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
@@ -271,6 +272,8 @@ class RequestManager {
   int get_max_spec_tree_token_num();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
+  void set_max_output_length(int max_output_length);
+  int get_max_output_length();
   void set_decoding_mode(DecodingMode mode);
   void set_verbose(bool verbose_);
   int get_k();
@@ -380,6 +383,7 @@ class RequestManager {
   int max_tokens_per_prefilling_batch;
   int max_spec_tree_token_num;
   int max_sequence_length;
+  int max_output_length;
   int max_tree_depth;
   int max_tree_width;
   int k;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 0669d2aeb..c342a7b70 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -50,6 +50,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_ssm_batch,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
+                      int &max_output_length,
                       int &sampling_seed,
                       bool &streaming_cache,
                       bool &slo_attainment_early_termination,
@@ -128,6 +129,10 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-output-length")) {
+      max_output_length = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--sampling-seed")) {
       sampling_seed = std::stoi(argv[++i]);
       continue;
@@ -193,6 +198,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_ssm_batch = -1;
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
+  int max_output_length = 512;
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
@@ -221,6 +227,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_ssm_batch,
                    max_tokens_per_prefilling_batch,
                    max_sequence_length,
+                   max_output_length,
                    sampling_seed,
                    streaming_cache,
                    slo_attainment_early_termination,
@@ -297,6 +304,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_output_length(max_output_length);
   rm->set_decoding_mode(decoding_mode);
   rm->set_slo_violation_early_termination(slo_attainment_early_termination);
   rm->set_baseline_latency(baseline_latency_ms);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index a990dfe77..e3fe4a250 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -67,6 +67,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_ssm_batch,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
+                      int &max_output_length,
                       int &max_tree_width,
                       int &max_tree_depth,
                       int &expansion_degree,
@@ -148,6 +149,10 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-output-length")) {
+      max_output_length = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-tree-width")) {
       max_tree_width = std::stoi(argv[++i]);
       continue;
@@ -362,6 +367,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_ssm_batch = -1;
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 512;
+  int max_output_length = 512;
   int expansion_degree = 3;
   int max_tree_depth = 8;
   int max_tree_width = 16;
@@ -393,6 +399,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_ssm_batch,
                    max_tokens_per_prefilling_batch,
                    max_sequence_length,
+                   max_output_length,
                    max_tree_width,
                    max_tree_depth,
                    expansion_degree,
@@ -438,6 +445,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_output_length(max_output_length);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 6f9214ce4..533f59864 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -63,6 +63,7 @@ void parse_input_args(char **argv,
                       bool &use_full_precision,
                       bool &verbose,
                       int &max_sequence_length,
+                      int &max_output_length,
                       double &scaling_factor) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -114,6 +115,10 @@ void parse_input_args(char **argv,
       max_sequence_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-output-length")) {
+      max_output_length = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--scaling-factor")) {
       scaling_factor = std::stod(argv[++i]);
       continue;
@@ -271,6 +276,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool use_full_precision = false;
   bool verbose = false;
   int max_sequence_length = 256;
+  int max_output_length = 512;
   double scaling_factor = 1.0;
 
   int max_requests_per_batch = 8;
@@ -302,6 +308,7 @@ void FlexFlow::top_level_task(Task const *task,
                    use_full_precision,
                    verbose,
                    max_sequence_length,
+                   max_output_length,
                    scaling_factor);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
@@ -328,6 +335,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_ssm_batch(max_tokens_per_ssm_batch);
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_output_length(max_output_length);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
@@ -460,7 +468,7 @@ void FlexFlow::top_level_task(Task const *task,
       std::string timestamp = log_json[i]["TIMESTAMP"].get<std::string>();
       EmissionTrace trace(prompt,
                           input_tokens.size(),
-                          max_sequence_length,
+                          max_output_length,
                           emission_machine.sample_slo_ratio(),
                           time_diff_ms(start_time, timestamp) * scaling_factor);
       traces.push_back(trace);
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index c09722fb4..49e689e06 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -4212,6 +4212,10 @@ def set_max_sequence_length(self, max_length):
         return ffc().flexflow_request_manager_set_max_sequence_length(
             self.handle, max_length)
 
+    def set_max_output_length(self, max_length):
+        return ffc().flexflow_request_manager_set_max_output_length(
+            self.handle, max_length)
+
     def start_server(self, model):
         return ffc().flexflow_request_manager_start_background_server(
             self.handle, model.handle
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index f37955f02..882749fa8 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2615,6 +2615,13 @@ void flexflow_request_manager_set_max_sequence_length(
   DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length);
 }
 
+void flexflow_request_manager_set_max_output_length(
+    flexflow_request_manager_t handle_, int max_output_length) {
+  RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
+  handle->set_max_output_length(max_output_length);
+  DEBUG_PRINT("[RequestManager] set max_output_length %d", max_output_length);
+}
+
 void flexflow_request_manager_register_tokenizer(
     flexflow_request_manager_t handle_,
     enum ModelType model_type,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 5cd4135d2..60665763e 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -104,6 +104,10 @@ int BatchConfig::max_sequence_length() {
   return RequestManager::get_request_manager()->get_max_sequence_length();
 }
 
+int BatchConfig::max_output_length() {
+  return RequestManager::get_request_manager()->get_max_output_length();
+}
+
 int BatchConfig::max_spec_tree_token_num() {
   return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e7995a198..ea703f3f5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -99,6 +99,10 @@ double Request::get_slo_ratio() {
   return slo_ratio;
 }
 
+int Request::decode_length() const {
+  return tokens.size() - llm_prefill_len;
+}
+
 RequestManager::RequestManager()
     : background_server_status(INITIALIZED), verbose(false),
       next_available_guid(1000000), num_processed_requests(0),
@@ -115,6 +119,7 @@ RequestManager::RequestManager()
   max_tokens_per_prefilling_batch = -1;
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
+  max_output_length = -1;
   max_tree_depth = -1;
   max_tree_width = -1;
   k = -1;
@@ -186,6 +191,16 @@ int RequestManager::get_max_sequence_length() {
   return max_sequence_length;
 }
 
+void RequestManager::set_max_output_length(int max_output_length) {
+  assert(max_output_length > 0);
+  this->max_output_length = max_output_length;
+}
+
+int RequestManager::get_max_output_length() {
+  assert(max_output_length > 0);
+  return max_output_length;
+}
+
 void RequestManager::set_decoding_mode(DecodingMode mode) {
   assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING);
   decoding_mode = mode;
@@ -306,7 +321,7 @@ bool RequestManager::get_spec_infer_old_version() {
 
 double RequestManager::get_request_expected_latency(Request &request) {
   return request.get_slo_ratio() * baseline_latency_ms *
-         (request.tokens.size() - request.llm_prefill_len);
+         request.decode_length();
 }
 
 Request &RequestManager::get_request_with_guid(RequestGuid guid) {
@@ -583,6 +598,13 @@ bool RequestManager::load_pending_request_to_batch() {
     RequestGuid guid = pending_request_queue.front().guid;
     pending_request_queue.pop();
     Request *request = &all_requests[guid];
+    if (request->tokens.size() > get_max_sequence_length()) {
+      std::cerr << "Request " << guid
+                << " exceeds the maximum sequence length: "
+                << request->tokens.size() << " > " << get_max_sequence_length()
+                << std::endl;
+      continue;
+    }
 
     request->status = Request::RUNNING;
     // Find an empty slot
@@ -949,6 +971,7 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
     profiling_requests[guid].llm_decoding_steps++;
     nb_requests_decoded++;
     if (request.tokens.back() == eos_token_id or
+        request.decode_length() >= get_max_output_length() or
         request.tokens.size() >= get_max_sequence_length()) {
       request_update_attainment(request_index, attained);
       request_completed = true;
@@ -1637,7 +1660,8 @@ bool RequestManager::update_llm_verify_results(
         break;
       }
     }
-    if (eos_token_found or request.tokens.size() >= get_max_sequence_length()) {
+    if (eos_token_found or request.decode_length() >= get_max_output_length() or
+        request.tokens.size() >= get_max_sequence_length()) {
       // Request is completed
       request_update_attainment(request_index, attained);
       request_completed = true;
@@ -2508,7 +2532,7 @@ void RequestManager::terminate_background_server() {
     }
     str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
     str += "\n total_requests(" + std::to_string(total_requests) + "/" +
-           std::to_string(profiling_requests.size()) + ")";
+           std::to_string(all_requests.size()) + ")";
     str += "\n total_tokens(" + std::to_string(total_tokens) + ")";
     // throughput
     str += "\n throughput_requests_per_sec(" +
@@ -2555,7 +2579,7 @@ void RequestManager::terminate_background_server() {
       if (profiling.start_decoding_time != 0) {
         per_token_time_ms =
             (profiling.finish_time - profiling.start_decoding_time) / 1000.0 /
-            (request.tokens.size() - request.llm_prefill_len);
+            request.decode_length();
       }
       tpot_per_request_ms += std::to_string(per_token_time_ms) + " ";
     }
@@ -2627,7 +2651,7 @@ void RequestManager::terminate_background_server() {
       Request &request = request_pair.second;
       if (request.attained) {
         attainment += 1;
-        goodput += request.tokens.size() - request.llm_prefill_len;
+        goodput += request.decode_length();
       }
     }
     attainment /= total_requests;

From 5e1cb7c8c424fbbdabd1a074962422b1e6ad59d2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 18:21:12 -0700
Subject: [PATCH 556/667] feat: added upper limit for number of tokens to
 attain slo

---
 src/runtime/request_manager.cc | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ea703f3f5..98778420f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -747,8 +747,9 @@ void RequestManager::request_load_onto_batch(int batch_index) {
 }
 
 void RequestManager::update_token_tree_depth() {
-  ssm_tree_depth = min(get_max_tokens_per_batch() / get_num_active_requests(),
-                       get_max_tree_depth());
+  ssm_tree_depth = min(
+      int(std::ceil(get_max_tokens_per_batch() / get_num_active_requests())),
+      get_max_tree_depth());
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {
@@ -2939,7 +2940,12 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
   // In function add_root_to_spec_token_tree
   double current_added = 1.0;
 
-  while (budget > 0 and current_added < num_tokens_to_decode) {
+  // The max token that can be added to the token tree when fulfilling the SLO
+  int max_token_toward_slo =
+      get_max_tokens_per_batch() / get_num_active_requests() * 2;
+
+  while (budget > 0 and max_token_toward_slo > 0 and
+         current_added < num_tokens_to_decode) {
     if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
       break;
     }
@@ -2949,6 +2955,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
     node_ptr->included = true;
     current_added += exp(log_acc_prob);
     budget--;
+    max_token_toward_slo--;
   }
 }
 

From e7a8613113e3e1469fd5493d9d02ed71f43201fd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 19:12:08 -0700
Subject: [PATCH 557/667] chore: minor

---
 src/runtime/request_manager.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 98778420f..e19265f13 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -747,9 +747,9 @@ void RequestManager::request_load_onto_batch(int batch_index) {
 }
 
 void RequestManager::update_token_tree_depth() {
-  ssm_tree_depth = min(
-      int(std::ceil(get_max_tokens_per_batch() / get_num_active_requests())),
-      get_max_tree_depth());
+  ssm_tree_depth = min(int(std::ceil(get_max_tokens_per_ssm_batch() /
+                                     get_num_active_requests())),
+                       get_max_tree_depth());
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {

From 755d422295d58aba06bb84c697d7a2ad3912f77d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 20:24:24 -0700
Subject: [PATCH 558/667] feat: modify logic of early stop

---
 src/runtime/request_manager.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 98778420f..41c49d189 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1642,8 +1642,9 @@ bool RequestManager::update_llm_verify_results(
 
     request.decode_latency_ms =
         (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
-    bool attained =
-        request.decode_latency_ms <= get_request_expected_latency(request);
+    bool attained = request.decode_latency_ms <=
+                    get_request_expected_latency(request) +
+                        get_baseline_latency() * request.get_slo_ratio() * 6;
 
     // Initialize the token tree for the request
     init_token_tree(guid);

From c6b5deb0cf1eb3d194005de87f1a6b23e9bfa694 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 21:00:17 -0700
Subject: [PATCH 559/667] fix: load request as long as available

---
 src/runtime/request_manager.cc | 109 ++++++++++++++-------------------
 1 file changed, 45 insertions(+), 64 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e19265f13..786eaa4b3 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -571,6 +571,9 @@ BatchConfig
 
 // Return value: true if load a pending request to the batch
 bool RequestManager::load_pending_request_to_batch() {
+  if (num_running_requests >= get_max_requests_per_batch()) {
+    return false;
+  }
   std::unique_lock<std::mutex> lock(request_queue_mutex);
   if (pending_request_queue.empty()) {
     if (num_running_requests > 0) {
@@ -769,24 +772,20 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
   switch (request_manager_status) {
     case PREFILLING:
       if (decoding_mode == INCREMENTAL_DECODING) {
-        if (update_llm_prefill_results(result)) {
-          // This indicates that the prefilling of the current request
-          // finishes
-
-          // Check if there are more empty slots
-          if (num_running_requests < get_max_requests_per_batch() &&
-              load_pending_request_to_batch()) {
-            // Load the pending request to the batch
-            request_manager_status = PREFILLING;
-          } else {
-            // No more empty slots, start the decoding
-            while (!prefilled_requests.empty()) {
-              Request *request = prefilled_requests.front();
-              request_load_onto_batch(request->batch_index);
-              prefilled_requests.pop();
-            }
-            request_manager_status = DECODING;
+        // This indicates that the prefilling of the requests finishes
+        bool all_prefilled = update_llm_prefill_results(result);
+        // Check if there are more empty slots
+        if (load_pending_request_to_batch() or !all_prefilled) {
+          // Load the pending request to the batch
+          request_manager_status = PREFILLING;
+        } else {
+          // No more empty slots, start the decoding
+          while (!prefilled_requests.empty()) {
+            Request *request = prefilled_requests.front();
+            request_load_onto_batch(request->batch_index);
+            prefilled_requests.pop();
           }
+          request_manager_status = DECODING;
         }
         // Not completed, continue prefilling
       } else if (decoding_mode == SPECULATIVE_DECODING) {
@@ -804,31 +803,23 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
             prefill_model = LLM;
           }
         } else if (prefill_model == LLM) {
-          if (update_llm_prefill_results(result)) {
-            // This indicates that the prefilling phase finishes
-
-            // Check if there are more empty slots
-            if (num_running_requests < get_max_requests_per_batch() &&
-                load_pending_request_to_batch()) {
-              // Load the pending request to the batch
-              prefill_model = SSM;
-              current_ssm_step = 0;
-            } else {
-              // No more empty slots, start the speculation
-              while (!prefilled_requests.empty()) {
-                Request *request = prefilled_requests.front();
-                request_load_onto_batch(request->batch_index);
-                prefilled_requests.pop();
-              }
-              request_manager_status = SSM_SPEC;
-              // Reset the prefill_request
-              current_ssm_step = 0;
-              ssm_completed = false;
-            }
-          } else {
-            // Not completed, start the next iteration of prefilling
+          // This indicates that the prefilling of the requests finishes
+          bool all_prefilled = update_llm_prefill_results(result);
+          if (load_pending_request_to_batch() or !all_prefilled) {
+            request_manager_status = PREFILLING;
             prefill_model = SSM;
             current_ssm_step = 0;
+          } else {
+            // No more empty slots, start the speculation
+            while (!prefilled_requests.empty()) {
+              Request *request = prefilled_requests.front();
+              request_load_onto_batch(request->batch_index);
+              prefilled_requests.pop();
+            }
+            request_manager_status = SSM_SPEC;
+            // Reset the prefill_request
+            current_ssm_step = 0;
+            ssm_completed = false;
           }
         } else {
           assert(false && "Invalid prefill model.");
@@ -837,36 +828,26 @@ void RequestManager::update_inference_results(InferenceResult const &result) {
         assert(false && "Invalid inference mode.");
       }
       break;
-    case DECODING:
-      if (update_llm_decode_results(result)) {
-        // A request completed after the decode
-        if (load_pending_request_to_batch() == false) {
-          // No pending request to process, continue the speculation
-          request_manager_status = DECODING;
-        } else {
-          request_manager_status = PREFILLING;
-        }
+    case DECODING: {
+      bool request_completed = update_llm_decode_results(result);
+      if (load_pending_request_to_batch()) {
+        request_manager_status = PREFILLING;
+      } else {
+        request_manager_status = DECODING;
       }
-      break;
-    case LLM_VERIFY:
-      if (update_llm_verify_results(result)) {
-        // A request completed after the verification
-        if (load_pending_request_to_batch() == false) {
-          // No pending request to process, continue the speculation
-          request_manager_status = SSM_SPEC;
-          current_ssm_step = 0;
-          ssm_completed = false;
-        } else {
-          request_manager_status = PREFILLING;
-          prefill_model = SSM;
-          current_ssm_step = 0;
-        }
+    } break;
+    case LLM_VERIFY: {
+      bool request_completed = update_llm_verify_results(result);
+      if (load_pending_request_to_batch()) {
+        request_manager_status = PREFILLING;
+        prefill_model = SSM;
+        current_ssm_step = 0;
       } else {
         request_manager_status = SSM_SPEC;
         current_ssm_step = 0;
         ssm_completed = false;
       }
-      break;
+    } break;
     case SSM_SPEC:
       // Update current_ssm_step first because when we first call
       // update_ssm_inference_results, there's already a step of small model

From b1e51b22e5b2543c11967d021dedad617ac38d5b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 13 Oct 2024 21:35:46 -0700
Subject: [PATCH 560/667] fix: bug in early stop

---
 src/runtime/request_manager.cc | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 87fbe002b..4a11540ea 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1642,9 +1642,12 @@ bool RequestManager::update_llm_verify_results(
 
     request.decode_latency_ms =
         (current_time - profiling_requests[guid].start_decoding_time) * 1e-3;
-    bool attained = request.decode_latency_ms <=
-                    get_request_expected_latency(request) +
-                        get_baseline_latency() * request.get_slo_ratio() * 6;
+    bool attained =
+        request.decode_latency_ms <= get_request_expected_latency(request);
+    bool current_attained =
+        request.decode_latency_ms <=
+        get_request_expected_latency(request) +
+            get_baseline_latency() * request.get_slo_ratio() * 6;
 
     // Initialize the token tree for the request
     init_token_tree(guid);
@@ -1668,7 +1671,7 @@ bool RequestManager::update_llm_verify_results(
       request_update_attainment(request_index, attained);
       request_completed = true;
       request_complete_clean_up(request_index);
-    } else if (!attained and slo_violation_early_termination) {
+    } else if (!current_attained and slo_violation_early_termination) {
       // Early drop that request
       request_update_attainment(request_index, attained);
       request_completed = true;

From 5975cd2c70770468c29823b9f59d34176dc3bcf3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 14 Oct 2024 01:10:03 -0700
Subject: [PATCH 561/667] feat: trace generator sample the prompt

---
 inference/trace_generator/trace_generator.cc | 131 +++++++++++++------
 src/runtime/request_manager.cc               |   6 +-
 2 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 533f59864..22bf6209f 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <chrono>
 #include <filesystem>
+#include <fstream>
 #include <string>
 #include <vector>
 #include <wordexp.h>
@@ -31,11 +32,19 @@ using json = nlohmann::json;
 
 struct FilePaths {
   std::string cache_folder_path;
-  std::string prompt_file_path;
   std::string log_file_path;
   std::string emission_file_path;
 };
 
+struct Prompts {
+  std::vector<std::string> file_paths;
+  std::vector<double> proportions;
+  std::vector<double> slo_ratios;
+
+  std::vector<json> jsons;
+  std::vector<int> idxs;
+};
+
 struct ModelNames {
   std::string llm_model_name;
   std::vector<std::string> ssm_model_names;
@@ -56,9 +65,29 @@ struct ModelMeta {
   std::vector<std::string> ssm_model_weights_paths;
 };
 
+template <typename T>
+std::vector<T> split_by_comma(const std::string& input) {
+    std::vector<T> result;
+    std::stringstream ss(input);
+    std::string item;
+    while (std::getline(ss, item, ',')) {
+        std::stringstream item_stream(item);
+        if constexpr (std::is_same<T, double>::value) {
+            double value;
+            if (item_stream >> value) {
+                result.push_back(value);
+            }
+        } else if constexpr (std::is_same<T, std::string>::value) {
+            result.push_back(item);
+        }
+    }
+    return result;
+}
+
 void parse_input_args(char **argv,
                       int argc,
                       FilePaths &paths,
+                      Prompts &prompts,
                       ModelNames &model_names,
                       bool &use_full_precision,
                       bool &verbose,
@@ -89,8 +118,16 @@ void parse_input_args(char **argv,
       continue;
     }
     // prompts
-    if (!strcmp(argv[i], "-prompt")) {
-      paths.prompt_file_path = std::string(argv[++i]);
+    if (!strcmp(argv[i], "--prompt-files")) {
+      prompts.file_paths = split_by_comma<std::string>(std::string(argv[++i]));
+      continue;
+    }
+    if (!strcmp(argv[i], "--prompt-proportions")) {
+      prompts.proportions = split_by_comma<double>(std::string(argv[++i]));
+      continue;
+    }
+    if (!strcmp(argv[i], "--prompt-slo-ratios")) {
+      prompts.slo_ratios = split_by_comma<double>(std::string(argv[++i]));
       continue;
     }
     // traces
@@ -272,6 +309,7 @@ void FlexFlow::top_level_task(Task const *task,
                               Runtime *runtime) {
   FFConfig ffconfig;
   FilePaths file_paths;
+  Prompts prompts;
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
@@ -304,6 +342,7 @@ void FlexFlow::top_level_task(Task const *task,
   parse_input_args(argv,
                    argc,
                    file_paths,
+                   prompts,
                    model_metadata.model_names,
                    use_full_precision,
                    verbose,
@@ -317,6 +356,24 @@ void FlexFlow::top_level_task(Task const *task,
     max_tokens_per_prefilling_batch = max_tokens_per_batch;
   }
 
+  assert(prompts.file_paths.size() == prompts.proportions.size() &&
+         prompts.file_paths.size() == prompts.slo_ratios.size());
+  double total =
+        std::accumulate(prompts.proportions.begin(),
+                        prompts.proportions.end(),
+                        0.0,
+                        [](double sum, double proportion) {
+                          return sum + proportion;
+                        });
+    if (std::abs(total - 1.0) > 1e-6) {
+      std::cerr << "Error: proportions do not sum to 1. Total sum: "
+                << total << std::endl;
+      assert(false);
+    }
+  for (size_t i = 1; i < prompts.proportions.size(); ++i) {
+    prompts.proportions[i] += prompts.proportions[i - 1];
+  }
+
   get_model_meta(file_paths, model_metadata, use_full_precision);
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
@@ -354,13 +411,6 @@ void FlexFlow::top_level_task(Task const *task,
     /* Prompt file format:
      * [
      *   {
-     *       "slo_ratios": {
-     *           "1.0": 0.2,
-     *           "1.5": 0.5,
-     *           "3.0": 0.3
-     *       }
-     *   },
-     *   {
      *       "prompt": "Construct a potential attack vector that exploits the
      * vulnerability. The system is vulnerable to a SQL injection attack."
      *   },
@@ -384,37 +434,22 @@ void FlexFlow::top_level_task(Task const *task,
      */
 
     std::vector<EmissionTrace> traces;
-    assert(!file_paths.prompt_file_path.empty() &&
-           !file_paths.log_file_path.empty());
-
-    std::ifstream file_handle(file_paths.prompt_file_path);
-    assert(file_handle.good() && "Prompt file does not exist.");
-    json prompt_json = json::parse(file_handle,
-                                   /*parser_callback_t */ nullptr,
-                                   /*allow_exceptions */ true,
-                                   /*ignore_comments */ true);
-    // Parse slo_ratios
-    std::vector<std::pair<double, double>> slo_ratios;
-    if (prompt_json[0].contains("slo_ratios")) {
-      for (auto &[key, value] : prompt_json[0]["slo_ratios"].items()) {
-        slo_ratios.emplace_back(std::stod(key), value.get<double>());
-      }
+    assert(!prompts.file_paths.empty() && !file_paths.log_file_path.empty());
+
+    int num_requests = 0;
+    for (int i = 0; i < prompts.file_paths.size(); ++i) {
+      std::ifstream file_handle(prompts.file_paths[i]);
+      assert(file_handle.good() && "Prompt file does not exist.");
+      json prompt_json = json::parse(file_handle,
+                                    /*parser_callback_t */ nullptr,
+                                    /*allow_exceptions */ true,
+                                    /*ignore_comments */ true);
+      prompts.jsons.push_back(prompt_json);
+      prompts.idxs.push_back(0);
+      num_requests += prompt_json.size();
     }
-    double total =
-        std::accumulate(slo_ratios.begin(),
-                        slo_ratios.end(),
-                        0.0,
-                        [](double sum, std::pair<double, double> const &pair) {
-                          return sum + pair.second;
-                        });
-    if (std::abs(total - 1.0) > 1e-6) {
-      std::cerr << "Error: slo_ratios values do not sum to 1. Total sum: "
-                << total << std::endl;
-      assert(false);
-    }
-    ConstantEmissionMachine emission_machine(-1, slo_ratios);
 
-    file_handle = std::ifstream(file_paths.log_file_path);
+    std::ifstream file_handle = std::ifstream(file_paths.log_file_path);
     assert(file_handle.good() && "Log file does not exist.");
     json log_json = json::parse(file_handle,
                                 /*parser_callback_t */ nullptr,
@@ -460,16 +495,28 @@ void FlexFlow::top_level_task(Task const *task,
              1000.0;
     };
 
-    int num_requests = min(prompt_json.size() - 1, log_json.size());
+    num_requests = min((unsigned long)num_requests, log_json.size());
     std::string start_time = log_json[0]["TIMESTAMP"].get<std::string>();
+    srand(time(0));
     for (int i = 0; i < num_requests; ++i) {
-      std::string prompt = prompt_json[i + 1]["prompt"].get<std::string>();
+      // sample from proportions
+      double sample = (double)rand() / RAND_MAX;
+      int ptr = 0;
+      for (size_t j = 0; j < prompts.proportions.size(); ++j) {
+        if (sample < prompts.proportions[j]) {
+          ptr = j;
+          break;
+        }
+      }
+      int& idx = prompts.idxs[ptr];
+      std::string prompt = prompts.jsons[ptr][idx]["prompt"].get<std::string>();
+      idx = (idx + 1) % prompts.jsons[ptr].size();
       std::vector<int32_t> input_tokens = rm->tokenize(prompt);
       std::string timestamp = log_json[i]["TIMESTAMP"].get<std::string>();
       EmissionTrace trace(prompt,
                           input_tokens.size(),
                           max_output_length,
-                          emission_machine.sample_slo_ratio(),
+                          prompts.slo_ratios[ptr],
                           time_diff_ms(start_time, timestamp) * scaling_factor);
       traces.push_back(trace);
     }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 786eaa4b3..e112d3202 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -750,9 +750,9 @@ void RequestManager::request_load_onto_batch(int batch_index) {
 }
 
 void RequestManager::update_token_tree_depth() {
-  ssm_tree_depth = min(int(std::ceil(get_max_tokens_per_ssm_batch() /
-                                     get_num_active_requests())),
-                       get_max_tree_depth());
+  ssm_tree_depth = min(
+      int(std::ceil(get_max_tokens_per_batch() / get_num_active_requests())),
+      get_max_tree_depth());
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {

From b5b7594d810ab1721cecd6500b7e0add9a1a2f3c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 14 Oct 2024 12:49:59 -0700
Subject: [PATCH 562/667] feat: add mean tpot statistic

---
 src/runtime/request_manager.cc | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 903149212..f8abc0ec9 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -28,6 +28,7 @@
 #include <stack>
 #include <stdexcept>
 #include <thread>
+#include <unordered_map>
 #include <vector>
 
 namespace FlexFlow {
@@ -2541,6 +2542,10 @@ void RequestManager::terminate_background_server() {
     latency_per_request_ms += ")";
     str += latency_per_request_ms;
 
+    average_latency_per_request /= total_requests;
+    str += "\n average_latency_per_request_ms(" +
+           std::to_string(average_latency_per_request) + ")";
+
     std::string ttft_per_request_ms = "\n ttft_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double prefilling_time_ms = 0;
@@ -2557,6 +2562,7 @@ void RequestManager::terminate_background_server() {
     ttft_per_request_ms += ")";
     str += ttft_per_request_ms;
 
+    std::unordered_map<double, std::pair<int, double>> tpots;
     std::string tpot_per_request_ms = "\n tpot_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double per_token_time_ms = 0;
@@ -2568,13 +2574,21 @@ void RequestManager::terminate_background_server() {
             request.decode_length();
       }
       tpot_per_request_ms += std::to_string(per_token_time_ms) + " ";
+      auto &tpot = tpots[request.slo_ratio];
+      tpot.first++;
+      tpot.second += per_token_time_ms;
     }
     tpot_per_request_ms += ")";
     str += tpot_per_request_ms;
 
-    average_latency_per_request /= total_requests;
-    str += "\n average_latency_per_request_ms(" +
-           std::to_string(average_latency_per_request) + ")";
+    std::string average_tpot_per_slo_ms = "\n average_tpot_per_slo_ms( ";
+    for (auto const &kv : tpots) {
+      double average_tpot = kv.second.second / kv.second.first;
+      average_tpot_per_slo_ms += std::to_string(kv.first) + ":" +
+                                 std::to_string(average_tpot) + " ";
+    }
+    average_tpot_per_slo_ms += ")";
+    str += average_tpot_per_slo_ms;
 
     std::string req_per_step = "\n requests_per_step( ";
     for (int nb : profiling.requests_per_step) {

From 57cfe1bd29a26b27d3fa4930f1ba38a5838f7127 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 14 Oct 2024 12:50:39 -0700
Subject: [PATCH 563/667] style: format

---
 inference/trace_generator/trace_generator.cc | 60 ++++++++++----------
 src/runtime/request_manager.cc               |  4 +-
 2 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 22bf6209f..853487cc4 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -66,22 +66,22 @@ struct ModelMeta {
 };
 
 template <typename T>
-std::vector<T> split_by_comma(const std::string& input) {
-    std::vector<T> result;
-    std::stringstream ss(input);
-    std::string item;
-    while (std::getline(ss, item, ',')) {
-        std::stringstream item_stream(item);
-        if constexpr (std::is_same<T, double>::value) {
-            double value;
-            if (item_stream >> value) {
-                result.push_back(value);
-            }
-        } else if constexpr (std::is_same<T, std::string>::value) {
-            result.push_back(item);
-        }
+std::vector<T> split_by_comma(std::string const &input) {
+  std::vector<T> result;
+  std::stringstream ss(input);
+  std::string item;
+  while (std::getline(ss, item, ',')) {
+    std::stringstream item_stream(item);
+    if constexpr (std::is_same<T, double>::value) {
+      double value;
+      if (item_stream >> value) {
+        result.push_back(value);
+      }
+    } else if constexpr (std::is_same<T, std::string>::value) {
+      result.push_back(item);
     }
-    return result;
+  }
+  return result;
 }
 
 void parse_input_args(char **argv,
@@ -358,18 +358,16 @@ void FlexFlow::top_level_task(Task const *task,
 
   assert(prompts.file_paths.size() == prompts.proportions.size() &&
          prompts.file_paths.size() == prompts.slo_ratios.size());
-  double total =
-        std::accumulate(prompts.proportions.begin(),
-                        prompts.proportions.end(),
-                        0.0,
-                        [](double sum, double proportion) {
-                          return sum + proportion;
-                        });
-    if (std::abs(total - 1.0) > 1e-6) {
-      std::cerr << "Error: proportions do not sum to 1. Total sum: "
-                << total << std::endl;
-      assert(false);
-    }
+  double total = std::accumulate(
+      prompts.proportions.begin(),
+      prompts.proportions.end(),
+      0.0,
+      [](double sum, double proportion) { return sum + proportion; });
+  if (std::abs(total - 1.0) > 1e-6) {
+    std::cerr << "Error: proportions do not sum to 1. Total sum: " << total
+              << std::endl;
+    assert(false);
+  }
   for (size_t i = 1; i < prompts.proportions.size(); ++i) {
     prompts.proportions[i] += prompts.proportions[i - 1];
   }
@@ -441,9 +439,9 @@ void FlexFlow::top_level_task(Task const *task,
       std::ifstream file_handle(prompts.file_paths[i]);
       assert(file_handle.good() && "Prompt file does not exist.");
       json prompt_json = json::parse(file_handle,
-                                    /*parser_callback_t */ nullptr,
-                                    /*allow_exceptions */ true,
-                                    /*ignore_comments */ true);
+                                     /*parser_callback_t */ nullptr,
+                                     /*allow_exceptions */ true,
+                                     /*ignore_comments */ true);
       prompts.jsons.push_back(prompt_json);
       prompts.idxs.push_back(0);
       num_requests += prompt_json.size();
@@ -508,7 +506,7 @@ void FlexFlow::top_level_task(Task const *task,
           break;
         }
       }
-      int& idx = prompts.idxs[ptr];
+      int &idx = prompts.idxs[ptr];
       std::string prompt = prompts.jsons[ptr][idx]["prompt"].get<std::string>();
       idx = (idx + 1) % prompts.jsons[ptr].size();
       std::vector<int32_t> input_tokens = rm->tokenize(prompt);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f8abc0ec9..2ddcfe964 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2584,8 +2584,8 @@ void RequestManager::terminate_background_server() {
     std::string average_tpot_per_slo_ms = "\n average_tpot_per_slo_ms( ";
     for (auto const &kv : tpots) {
       double average_tpot = kv.second.second / kv.second.first;
-      average_tpot_per_slo_ms += std::to_string(kv.first) + ":" +
-                                 std::to_string(average_tpot) + " ";
+      average_tpot_per_slo_ms +=
+          std::to_string(kv.first) + ":" + std::to_string(average_tpot) + " ";
     }
     average_tpot_per_slo_ms += ")";
     str += average_tpot_per_slo_ms;

From ad9b240db79245fa45564613e806ecf2aba53586 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 14 Oct 2024 22:55:14 -0700
Subject: [PATCH 564/667] fix: modify token add toward slo

---
 src/runtime/request_manager.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2ddcfe964..a3b96db90 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2941,7 +2941,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
 
   // The max token that can be added to the token tree when fulfilling the SLO
   int max_token_toward_slo =
-      get_max_tokens_per_batch() / get_num_active_requests() * 2;
+      int(get_max_tokens_per_batch() / get_num_active_requests());
 
   while (budget > 0 and max_token_toward_slo > 0 and
          current_added < num_tokens_to_decode) {

From 9cf66c15e567d29bc81dbb6cffe4aa49efd94759 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 15 Oct 2024 16:19:23 -0700
Subject: [PATCH 565/667] fix: max_spec_tree_token_num

---
 src/runtime/request_manager.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a3b96db90..1e3335d28 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -236,7 +236,9 @@ void RequestManager::set_max_tree_depth(int max_tree_depth) {
          "Invalid max_tree_depth");
   this->max_tree_depth = max_tree_depth;
   if (max_tree_width > 0) {
-    max_spec_tree_token_num = max_tree_depth * max_tree_width;
+    // 8 is k of topk, if max_tree_width <= k, we will fill the second level
+    max_spec_tree_token_num =
+        max_tree_depth * max_tree_width + (max_tree_width <= 8);
     assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
   }
 }
@@ -254,7 +256,9 @@ void RequestManager::set_max_tree_width(int max_tree_width) {
          "Invalid max_tree_width");
   this->max_tree_width = max_tree_width;
   if (max_tree_depth > 0) {
-    max_spec_tree_token_num = max_tree_depth * max_tree_width;
+    // 8 is k of topk, if max_tree_width <= k, we will fill the second level
+    max_spec_tree_token_num =
+        max_tree_depth * max_tree_width + (max_tree_width <= 8);
     assert(max_spec_tree_token_num <= BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
   }
 }

From c8d442d10fa6b5aa78ac30b4e49a2cb3f05f5a23 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 16 Oct 2024 22:35:50 +0000
Subject: [PATCH 566/667] feat: add two naive scheduling policies

---
 include/flexflow/request_manager.h |  9 +++
 inference/spec_infer/spec_infer.cc | 16 +++++
 src/runtime/request_manager.cc     | 94 ++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ebfd31b3d..c151cdfbc 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -297,7 +297,11 @@ class RequestManager {
   void
       set_slo_violation_early_termination(bool slo_violation_early_termination);
   void set_spec_infer_old_version(bool spec_infer_old_version);
+  void set_greedy_schedule(bool greedy_schedule);
+  void set_equal_schedule(bool equal_schedule);
   bool get_spec_infer_old_version();
+  bool get_greedy_schedule();
+  bool get_equal_schedule();
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
@@ -403,6 +407,8 @@ class RequestManager {
   bool memory_occupancy = false;
   bool slo_violation_early_termination = false;
   bool spec_infer_old_version = false;
+  bool greedy_schedule = false;
+  bool equal_schedule = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -526,9 +532,12 @@ class RequestManager {
   void add_tokens_to_spec_token_tree_old_version(
       InferenceResult const &ssm_inference_result);
   void prune_token_tree();
+  void prune_token_tree_equal();
+  void prune_token_tree_greedy();
   void add_tokens_toward_slo(RequestGuid guid, int &budget);
   void add_tokens_toward_memory_occupancy(int budget);
   void add_tokens_toward_goodput(int budget);
+  void add_tokens_toward_goodput_per_request(int budget, int request_index);
   void update_token_tree_depth();
 
   /* ---------- Spec Decoding Helper Functions ---------- */
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index e3fe4a250..7b6bad7c4 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -81,6 +81,8 @@ void parse_input_args(char **argv,
                       int &llm_verify_latency_ms,
                       double &request_per_second,
                       bool &spec_infer_old_version,
+                      bool &greedy_schedule,
+                      bool &equal_schedule,
                       std::string &emission_file_path) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -206,6 +208,14 @@ void parse_input_args(char **argv,
       spec_infer_old_version = true;
       continue;
     }
+    if (!strcmp(argv[i], "--greedy-schedule")) {
+      greedy_schedule = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--equal-schedule")) {
+      equal_schedule = true;
+      continue;
+    }
     if (!strcmp(argv[i], "--emission-file-path")) {
       emission_file_path = std::string(argv[++i]);
       continue;
@@ -383,6 +393,8 @@ void FlexFlow::top_level_task(Task const *task,
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
   bool spec_infer_old_version = false;
+  bool greedy_schedule = false;
+  bool equal_schedule = false;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -413,6 +425,8 @@ void FlexFlow::top_level_task(Task const *task,
                    llm_verify_latency_ms,
                    request_per_second,
                    spec_infer_old_version,
+                   greedy_schedule,
+                   equal_schedule,
                    emission_file_path);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
@@ -460,6 +474,8 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_ssm_spec_latency(ssm_spec_latency_ms);
   rm->set_llm_verify_latency(llm_verify_latency_ms);
   rm->set_spec_infer_old_version(spec_infer_old_version);
+  rm->set_greedy_schedule(greedy_schedule);
+  rm->set_equal_schedule(equal_schedule);
   rm->register_output_filepath(file_paths.output_file_path);
 
   // Create LLM model
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1e3335d28..0b3b8aaa2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -320,10 +320,26 @@ void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) {
   spec_infer_old_version = spec_infer_old_version_;
 }
 
+void RequestManager::set_greedy_scheduler(bool greedy_scheduler_) {
+  greedy_scheduler = greedy_scheduler_;
+}
+
+void RequestManager::set_equal_schedule(bool equal_schedule_) {
+  equal_schedule = equal_schedule_;
+}
+
 bool RequestManager::get_spec_infer_old_version() {
   return spec_infer_old_version;
 }
 
+bool RequestManager::get_greedy_scheduler() {
+  return greedy_scheduler;
+}
+
+bool RequestManager::get_equal_schedule() {
+  return equal_schedule;
+}
+
 double RequestManager::get_request_expected_latency(Request &request) {
   return request.get_slo_ratio() * baseline_latency_ms *
          request.decode_length();
@@ -2888,6 +2904,12 @@ void RequestManager::add_tokens_to_spec_token_tree_old_version(
 }
 
 void RequestManager::prune_token_tree() {
+  if (get_greedy_schedule()) {
+    return prune_token_tree_greedy();
+  } else if (get_equal_schedule()) {
+    return prune_token_tree_equal();
+  }
+
   // Each reqeust has at least one token
   int budget = get_max_tokens_per_batch() - num_available_requests;
   assert(budget >= 0);
@@ -2933,6 +2955,48 @@ void RequestManager::prune_token_tree() {
   }
 }
 
+void RequestManager::prune_token_tree_equal() {
+  // Each reqeust has at least one token
+  int const equal_budget =
+      get_max_tokens_per_batch() / get_num_active_requests();
+  assert(equal_budget >= 0);
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+    int budget = equal_budget;
+    assert(budget >= 0);
+    if (budget > 0) {
+      add_tokens_toward_goodput_per_request(budget, request_index);
+    }
+  }
+}
+
+void RequestManager::prune_token_tree_greedy() {
+  // Each reqeust has at least one token
+  int budget = get_max_tokens_per_batch();
+  assert(budget >= 0);
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       ++request_index) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    RequestGuid guid = guid_of_requests[request_index];
+    Request &request = all_requests[guid];
+    assert(request.status == Request::RUNNING);
+  }
+
+  if (budget > 0) {
+    add_tokens_toward_goodput(budget);
+  }
+}
+
 void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
   Request &request = all_requests[guid];
   double num_tokens_to_decode = (ssm_spec_latency_ms + llm_verify_latency_ms) *
@@ -3108,6 +3172,36 @@ void RequestManager::add_tokens_toward_goodput(int budget) {
   }
 }
 
+void RequestManager::add_tokens_toward_goodput_per_request(int budget,
+                                                           int request_index) {
+  RequestGuid guid = guid_of_requests[request_index];
+  Request &request = all_requests[guid];
+  assert(request.status == Request::RUNNING);
+  if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
+    continue;
+  }
+
+  auto &pq = request.token_tree_nodes_acc_prob_pair_pq;
+
+  // Perform dequeue and enqueue until the budget is used up
+  while (budget > 0 and !pq.empty()) {
+    auto [node_ptr, acc_log_prob] = pq.top();
+    pq.pop();
+    node_ptr->included = true;
+    budget--;
+  }
+
+  // Clear the priority queue in each requests
+  std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
+      _prealloc_vector;
+  _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);
+  request.token_tree_nodes_acc_prob_pair_pq = std::priority_queue<
+      std::pair<std::shared_ptr<TokenTreeNode>, double>,
+      std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>,
+      SharedTokenTreeNodePtrDoubleLess>(SharedTokenTreeNodePtrDoubleLess(),
+                                        std::move(_prealloc_vector));
+}
+
 std::ostream &operator<<(std::ostream &os, TokenTree const &token_tree) {
   os << "Token tree: " << std::endl;
   int layer_idx = 0;

From 19e41d61cd90b1d4c78aa3efd73914a0f17cb929 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Sun, 20 Oct 2024 20:27:53 -0700
Subject: [PATCH 567/667] add some docuementation and delete print

---
 include/flexflow/page_manager.h              | 20 +-----
 src/ops/tree_inc_multihead_self_attention.cu |  3 -
 src/runtime/page_manager.cc                  |  5 +-
 src/runtime/request_manager.cc               | 53 +++-------------
 src/runtime/request_manager.cu               | 65 +-------------------
 5 files changed, 15 insertions(+), 131 deletions(-)

diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
index fe6792574..54b661e02 100644
--- a/include/flexflow/page_manager.h
+++ b/include/flexflow/page_manager.h
@@ -1,18 +1,3 @@
-/* Copyright 2023 CMU, Stanford, Facebook, LANL
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #pragma once
 
 #include "flexflow/batch_config.h"
@@ -31,7 +16,7 @@ using TokenId = BatchConfig::TokenId;
 
 /**
  * @class LogicalTokenBlock
- * @brief A class to represent a logical block of tokens similar to virtual memory address
+ * @brief A class to represent a sequence of tokens for each request
  */
 class LogicalTokenBlock {
 public:
@@ -70,7 +55,6 @@ class LogicalTokenBlock {
     int num_tokens; // the number of tokens currently stored in the block
     int num_commit_tokens; // the number of tokens inside this block that are already committed
     int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
-
     std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
 };
 
@@ -132,9 +116,9 @@ class PageManager {
     using RequestGuid = BatchConfig::RequestGuid;
     PageManager(int block_size, int num_total_blocks);
 
-
     int allocate_one_block(const RequestGuid& request_guid);
     void free_request(const RequestGuid& request_guid);
+    //used for the case that we want to free the last num_blocks that stores spec tokens(which are the tokens are not yet committed)
     void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
     std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
 
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 89bc27027..6b7a14903 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -97,9 +97,6 @@ __global__ void
   if (offset < kv_hidden_size) {
     int start = kv_indptr[req_idx_compact];
     int end = kv_indptr[req_idx_compact + 1] - 1;
-    if (start > end) {
-      printf("Invalid kv_indptr: %d %d\n", start, end);
-    }
     assert(start <= end && "Invalid kv_indptr");
     assert(start + (token_abs_idx / kPagesize) <= end &&
            "Invalid page index");
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 6cb3c2b56..f59837f30 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -112,7 +112,7 @@ void BlockAllocator::free(PhysicalTokenBlock& block) {
     }
     block.decr_ref_count();
     if (block.ref_count == 0) {
-        printf("put block number: %d back to free_blocks\n", block.get_block_number());
+        // printf("put block number: %d back to free_blocks\n", block.get_block_number());
         free_blocks.push_back(block);
     }else{
         // in current implementation this should not be the case
@@ -128,7 +128,6 @@ int BlockAllocator::get_num_free_blocks() const {
 PageManager::PageManager(int block_size, int num_total_blocks)
     : block_size(block_size), num_total_blocks(num_total_blocks),
       block_allocator(block_size, num_total_blocks) {
-        printf("page manager init with block_size: %d, num_total_blocks: %d\n", block_size, num_total_blocks);
       }
 
 //return the physical number of this block
@@ -138,7 +137,6 @@ int PageManager::allocate_one_block(const RequestGuid& request_guid) {
     PhysicalTokenBlock block = block_allocator.allocate();
     block_table.push_back(block);
     block_tables[request_guid] = block_table;
-    printf("request_guid: %d, block_number: %d\n", request_guid, block.get_block_number());
     return block.get_block_number();
 }
 
@@ -184,7 +182,6 @@ std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request
     std::vector<int> indices;
     const auto& it = block_tables.find(request_guid);
     if (it == block_tables.end()) {
-        printf("page manager not found request_guid: %d\n", request_guid);
         return indices;
     }
     const auto& block_table = it->second;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index acfa25cde..a0f805b69 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -329,8 +329,8 @@ void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) {
   spec_infer_old_version = spec_infer_old_version_;
 }
 
-void RequestManager::set_greedy_scheduler(bool greedy_scheduler_) {
-  greedy_scheduler = greedy_scheduler_;
+void RequestManager::set_greedy_schedule(bool greedy_scheduler_) {
+  greedy_schedule = greedy_scheduler_;
 }
 
 void RequestManager::set_equal_schedule(bool equal_schedule_) {
@@ -341,8 +341,8 @@ bool RequestManager::get_spec_infer_old_version() {
   return spec_infer_old_version;
 }
 
-bool RequestManager::get_greedy_scheduler() {
-  return greedy_scheduler;
+bool RequestManager::get_greedy_schedule() {
+  return greedy_schedule;
 }
 
 bool RequestManager::get_equal_schedule() {
@@ -683,7 +683,6 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 
   // page attention: free the pages
   PageManager *page_manager = PageManager::get_page_manager();
-  printf("free request %d\n", guid);
   page_manager->free_request(guid);
 
   // Find the sos and eos in the sequence
@@ -1148,14 +1147,11 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     }
     //update related page info in batch config
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
-    printf("request: %d has %d kv pages after prefilling\n", request->guid, bc.requestsInfo[request_index].num_kv_pages);
-    // WARNING: it is possible that it has no tokens allocated!! but not allowed for flashinfer
     if (bc.requestsInfo[request_index].num_kv_pages == 0) {
-      // turn this request into not available
+      // turn this request into not available for one round
       bc.request_available[request_index] = false;
     }
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
-    printf("request: %d has %d kv last page len after prefilling\n", request->guid, bc.requestsInfo[request_index].kv_last_page_len);
     bc.requestsInfo[request_index].request_guid = request->guid;
   }
   bc.num_tokens = num_tokens;
@@ -1164,7 +1160,6 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     std::cout << "prepare_llm_prefilling_batch NEW batchconfig:" << std::endl;
     bc.print();
   }
-  printf("there are %d requests in the batch in prefilling stage\n", bc.num_available_requests);
   return bc;
 }
 
@@ -1550,7 +1545,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    //page attention: before commit token, reset the pages assigned by cleaning all the tokens
+    //before commit token, reset the pages assigned by cleaning all the tokens
     std::vector<int> block_table_before_commit = page_manager->get_block_table_indices(guid);
     // also need to reset the pages
     reset_block_table(request);
@@ -1576,17 +1571,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
       
-      // assert(request.page_last_committed < request.blocks.size());
-      printf("in verify: page_last_committed: %d, request->blocks.size(): %d\n", request.page_last_committed, request.blocks.size());
       int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index;
-      if (idx_from_logical < 0) {
-        printf("idx_from_logical: %d, from_index: %d, first_token_offset_in_batch: %d\n", idx_from_logical, committed_token.from_index, request.first_token_offset_in_batch);
-      }
       assert(idx_from_logical >= 0);
       assert(idx_from_logical / kPagesize < block_table_before_commit.size());
       int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
-      printf("id to physical: %d, from physical: %d\n", idx_to_physical, idx_from_physical);
 
 
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
@@ -1615,15 +1604,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           token_tree_index++;
 
           // Append the token to the block
-          // printf("in verify spec tree: page_last_committed: %d, request->blocks.size(): %d ", request.page_last_committed, request.blocks.size());
-          // printf("in verify spec tree: last page len: %d\n", get_len_last_block(request));
-          // assert(request.page_last_committed < request.blocks.size());
           append_token_to_block(request, tree_node->id, false);
         }
       }
       layer_index++;
     }
-    printf("there are %d tokens in the token tree\n", token_tree_index);
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
@@ -1973,6 +1958,7 @@ int RequestManager::get_len_last_block(Request &request) const {
   return request.blocks.back().get_num_tokens();
 }
 
+// get the index of the last token in the request
 int RequestManager::get_idx_last_logical_token(Request &request) const {
   if (request.blocks.empty()) {
     printf("Error: request.blocks is empty\n");
@@ -1987,8 +1973,6 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   PageManager *page_manager = PageManager::get_page_manager();
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
   if (request.blocks.size() != block_table_indices.size()) {
-    printf("page manager get block table indices: %d, request.blocks.size(): %d\n", page_manager->get_block_table_indices(request.guid).size(), request.blocks.size());
-    printf("request.blocks.size(): %d, block_table_indices.size(): %d\n", request.blocks.size(), block_table_indices.size());
     assert(request.blocks.size() == block_table_indices.size());
   }
   return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
@@ -2007,25 +1991,18 @@ void RequestManager::_append_block_to_request(
   request.blocks.push_back(block);
   page_manager->allocate_one_block(request.guid);
   std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
-  // for (int i = 0; i < block_table_indices.size(); i++) {
-  //   printf("block table indices: %d\n", block_table_indices[i]);
-  // }
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
     request.page_last_committed++;
     int size_blocks = request.blocks.size();
-    if (request.page_last_committed >= size_blocks) {
-      printf("request page_last_committed: %d, size_blocks) {: %d\n", request.page_last_committed, size_blocks);
-      assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-    }
+    assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
   }
 }
 
 //this function is used for appending a token to the last logical block and also the last physical block
 //it will return the physical position of this token
 int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
-  // assert(request.page_last_committed < request.blocks.size());
   PageManager *page_manager = PageManager::get_page_manager();
   if (request.blocks.empty() ||
       request.blocks.back().is_full()) {
@@ -2058,16 +2035,8 @@ void RequestManager::reset_block_table(Request &request){
     request.blocks.erase(request.blocks.begin() + request.page_last_committed + 1, request.blocks.end());
   }
   request.blocks.back().reset_num_spec_tokens();
-  printf("after reset, block now has %d tokens\n", request.blocks.back().get_num_tokens());
-  printf("number of pages allocated: %d\n", page_manager->get_block_table_indices(request.guid).size());
-  // printf("number of blocks: %d\n", request.blocks.size()); 
-  // printf("num spec tokens: %d\n", request.blocks.back().get_num_spec_tokens());
-  // printf("num committed tokens: %d\n", request.blocks.back().get_num_commit_tokens());
   // the indices of block table should be the same as the number of blocks
   std::vector<int> block_table = page_manager->get_block_table_indices(request.guid);
-  // for (int i = 0; i < request.blocks.size(); i++) {
-  //   printf("block table indices: %d\n", block_table[i]);
-  // }
 
   assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
   return;
@@ -2469,17 +2438,13 @@ void RequestManager::start_background_server(FFModel *model) {
   background_server_handler = runtime->execute_task(ctx, launcher);
   // Register callbacks for normal exit
   {
-    printf("called exit\n");
     int ret = std::atexit(RequestManager::terminate_background_server_at_exit);
-    printf("return from exit\n");
     assert(ret == 0); // make sure the callback is successfully registered
   }
   // Register callbacks for termination
   {
-    printf("called terminate\n");
     std::set_terminate([]() {
       RequestManager::terminate_background_server_at_exit();
-      printf("return from terminate\n");
       printStackTrace();
       std::abort();
     });
@@ -3395,7 +3360,7 @@ void RequestManager::add_tokens_toward_goodput_per_request(int budget,
   Request &request = all_requests[guid];
   assert(request.status == Request::RUNNING);
   if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
-    continue;
+    return;
   }
 
   auto &pq = request.token_tree_nodes_acc_prob_pair_pq;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 24f4a524f..16876c124 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -91,8 +91,8 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
   q_indptr_h[0] = 0;
   kv_indptr_h[0] = 0;
   qk_indptr_h[0] = 0;
-  int cnt_1 = 0, q_lens = 0, qk_lens = 0;
-  int indices_offset = 0, indices_lens = 0, kv_len = 0;
+  int q_lens = 0, qk_lens = 0;
+  int indices_offset = 0, indices_lens = 0;
   for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
     if (batch_config->request_available[req_idx]) {
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
@@ -110,10 +110,6 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       assert(batch_config->requestsInfo[req_idx].num_kv_pages == (kv_len + kPagesize - 1) / kPagesize);
       assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= kPagesize);
       std::vector<int32_t> kv_indices = pm -> get_block_table_indices(batch_config->requestsInfo[req_idx].request_guid);
-      // printf("request_guid: %d\n", batch_config->requestsInfo[req_idx].request_guid);
-      // printf("kv_indices.size() = %d, kv_len = %d\n", kv_indices.size(), kv_len);
-      // printf("kv last page len = %d\n", batch_config->requestsInfo[req_idx].kv_last_page_len);
-      // printf("num_kv_pages = %d\n", batch_config->requestsInfo[req_idx].num_kv_pages);
       assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
       for (int i = indices_offset; i < indices_lens; i++) {
         kv_indices_h[i] = kv_indices[i - indices_offset];
@@ -122,14 +118,6 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       kv_last_page_len_h[indptr_idx] = batch_config->requestsInfo[req_idx].kv_last_page_len;
       indptr_idx++;
     }
-    // }else{
-    //   q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx];
-    //   q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx];
-    //   kv_indptr_h[indptr_idx + 1] = kv_indptr_h[indptr_idx];
-    //   qk_indptr_h[indptr_idx + 1] = 0;
-    //   kv_last_page_len_h[indptr_idx] = 0;
-    //   indptr_idx++;
-    // }
   }
 
   // do the copy
@@ -405,12 +393,6 @@ void RequestManager::load_batch_config_task(
             handle.incr_attention_metadata->kv_indices,
             handle.incr_attention_metadata->kv_last_page_len,
             handle.incr_attention_metadata->qk_indptr);
-
-            // check on error
-            cudaError_t error = cudaGetLastError();
-            if (error != cudaSuccess) {
-              printf("CUDA error in prepare_inference_params_kernel: %s\n", cudaGetErrorString(error));
-            }
       }
 
       // prepare attention forward handler
@@ -637,8 +619,6 @@ void RequestManager::load_batch_config_task(
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     PageManager *pm = PageManager::get_page_manager();
-    // hardcode request 
-    // printf("request has allocated %d pages\n", pm -> get_block_table_indices(1000000).size());
     static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS];
     static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
@@ -666,20 +646,7 @@ void RequestManager::load_batch_config_task(
             round_up_pages(BatchConfig::max_sequence_length() +
                            BatchConfig::max_spec_tree_token_num());
 
-        int parallelism = batch_size;
-        // prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
-        //                                   min(CUDA_NUM_THREADS, parallelism),
-        //                                   0,
-        //                                   stream>>>(
-        //     batch_size,
-        //     request_infos,
-        //     request_available,
-        //     max_num_pages,
-        //     handle.tree_verify_attention_metadata->q_indptr,
-        //     handle.tree_verify_attention_metadata->kv_indptr,
-        //     handle.tree_verify_attention_metadata->kv_indices,
-        //     handle.tree_verify_attention_metadata->kv_last_page_len,
-        //     handle.tree_verify_attention_metadata->qk_indptr);
+        // int parallelism = batch_size;
         prepare_inference_params_kernel_h(batch_config,
                                           pm,
                                           handle,
@@ -747,26 +714,6 @@ void RequestManager::load_batch_config_task(
                   ->prompt_handler_collections[batch_size]);
         }
 
-        // static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-        //     kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
-        // q_indptr_h[0] = 0;
-        // kv_indptr_h[0] = 0;
-        // for (int req_idx = 0, indptr_idx = 0;
-        //      req_idx < batch_config->max_requests_per_batch();
-        //      req_idx++) {
-        //   if (batch_config->request_available[req_idx]) {
-        //     int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-        //     int kv_len =
-        //         batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-        //         batch_config->requestsInfo[req_idx]
-        //             .first_token_index_in_request;
-        //     q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-        //     kv_indptr_h[indptr_idx + 1] =
-        //         kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
-        //     indptr_idx++;
-        //   }
-        // }
-
         handler->SetCUDAStream(stream);
         handler->BeginForward<half, int32_t>(
             static_cast<void *>(
@@ -782,12 +729,6 @@ void RequestManager::load_batch_config_task(
             handle.tree_verify_attention_metadata->num_kv_heads(),
             handle.tree_verify_attention_metadata->head_dim(),
             kPagesize);
-
-            cudaError_t syncErr = cudaDeviceSynchronize();
-            if (syncErr != cudaSuccess) {
-              printf("Kernel execution error: %s\n", cudaGetErrorString(syncErr));
-              assert(false);
-            }
       }
     }
   }

From b1793fb50447b8c6da0d945721be3146a202b1b8 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Sun, 20 Oct 2024 21:16:41 -0700
Subject: [PATCH 568/667] add additional flag max-kv-cache-size

---
 include/flexflow/batch_config.h              |  1 +
 include/flexflow/flexflow_c.h                |  3 +++
 include/flexflow/request_manager.h           |  5 +++++
 inference/incr_decoding/incr_decoding.cc     |  8 ++++++++
 inference/spec_infer/spec_infer.cc           |  8 ++++++++
 inference/trace_generator/trace_generator.cc |  7 +++++++
 python/flexflow/core/flexflow_cffi.py        |  4 ++++
 src/c/flexflow_c.cc                          |  7 +++++++
 src/runtime/batch_config.cc                  |  4 ++++
 src/runtime/request_manager.cc               | 10 ++++++++++
 10 files changed, 57 insertions(+)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index eb600d726..a98f8f5ca 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -76,6 +76,7 @@ class BatchConfig {
   static int max_spec_tree_token_num();
   static int max_sequence_length();
   static int max_output_length();
+  static int max_kv_cache_size();
   static int get_max_tree_depth();
   friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
   void print() const;
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 1da5f61d6..fbd92d3f4 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -989,6 +989,9 @@ void flexflow_request_manager_set_max_sequence_length(
 void flexflow_request_manager_set_max_output_length(
     flexflow_request_manager_t handle_, int max_output_length);
 
+void flexflow_request_manager_set_max_kv_cache_size(
+    flexflow_request_manager_t handle_, int max_kv_cache_size);
+
 void flexflow_request_manager_register_tokenizer(
     flexflow_request_manager_t handle_,
     enum ModelType model_type,
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 53efa8e74..a0cf2eb7b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -277,8 +277,12 @@ class RequestManager {
   int get_max_spec_tree_token_num();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
+  void set_max_kv_cache_size(int max_kv_cache_size);
+  int get_max_kv_cache_size();
   void set_max_output_length(int max_output_length);
   int get_max_output_length();
+  void set_max_kv_cache_size(int max_kv_cache_size);
+  int get_max_kv_cache_size();
   void set_decoding_mode(DecodingMode mode);
   void set_verbose(bool verbose_);
   int get_k();
@@ -393,6 +397,7 @@ class RequestManager {
   int max_spec_tree_token_num;
   int max_sequence_length;
   int max_output_length;
+  int max_kv_cache_size;
   int max_tree_depth;
   int max_tree_width;
   int k;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index c342a7b70..3b24611b4 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -51,6 +51,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &max_output_length,
+                      int &max_kv_cache_size,
                       int &sampling_seed,
                       bool &streaming_cache,
                       bool &slo_attainment_early_termination,
@@ -133,6 +134,10 @@ void parse_input_args(char **argv,
       max_output_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-kv-cache-size")) {
+      max_kv_cache_size = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--sampling-seed")) {
       sampling_seed = std::stoi(argv[++i]);
       continue;
@@ -199,6 +204,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
   int max_output_length = 512;
+  int max_kv_cache_size = -1; //if -1, then use the default value
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
@@ -228,6 +234,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_prefilling_batch,
                    max_sequence_length,
                    max_output_length,
+                   max_kv_cache_size,
                    sampling_seed,
                    streaming_cache,
                    slo_attainment_early_termination,
@@ -305,6 +312,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_output_length(max_output_length);
+  rm->set_max_kv_cache_size(max_kv_cache_size);
   rm->set_decoding_mode(decoding_mode);
   rm->set_slo_violation_early_termination(slo_attainment_early_termination);
   rm->set_baseline_latency(baseline_latency_ms);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 7b6bad7c4..df7b171a2 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -68,6 +68,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &max_output_length,
+                      int &max_kv_cache_size,
                       int &max_tree_width,
                       int &max_tree_depth,
                       int &expansion_degree,
@@ -155,6 +156,10 @@ void parse_input_args(char **argv,
       max_output_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-kv-cache-size")) {
+      max_kv_cache_size = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--max-tree-width")) {
       max_tree_width = std::stoi(argv[++i]);
       continue;
@@ -378,6 +383,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 512;
   int max_output_length = 512;
+  int max_kv_cache_size = -1; // if -1, then use the default value
   int expansion_degree = 3;
   int max_tree_depth = 8;
   int max_tree_width = 16;
@@ -412,6 +418,7 @@ void FlexFlow::top_level_task(Task const *task,
                    max_tokens_per_prefilling_batch,
                    max_sequence_length,
                    max_output_length,
+                   max_kv_cache_size,
                    max_tree_width,
                    max_tree_depth,
                    expansion_degree,
@@ -460,6 +467,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_output_length(max_output_length);
+  rm->set_max_kv_cache_size(max_kv_cache_size);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 853487cc4..3783b30f2 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -93,6 +93,7 @@ void parse_input_args(char **argv,
                       bool &verbose,
                       int &max_sequence_length,
                       int &max_output_length,
+                      int &max_kv_cache_size,
                       double &scaling_factor) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -156,6 +157,10 @@ void parse_input_args(char **argv,
       max_output_length = std::stoi(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--max-kv-cache-size")) {
+      max_kv_cache_size = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--scaling-factor")) {
       scaling_factor = std::stod(argv[++i]);
       continue;
@@ -315,6 +320,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool verbose = false;
   int max_sequence_length = 256;
   int max_output_length = 512;
+  int max_kv_cache_size = -1;
   double scaling_factor = 1.0;
 
   int max_requests_per_batch = 8;
@@ -391,6 +397,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tokens_per_prefilling_batch(max_tokens_per_prefilling_batch);
   rm->set_max_sequence_length(max_sequence_length);
   rm->set_max_output_length(max_output_length);
+  rm->set_max_kv_cache_size(max_kv_cache_size);
   rm->set_max_tree_depth(max_tree_depth);
   rm->set_max_tree_width(max_tree_width);
   rm->set_verbose(verbose);
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 49e689e06..29eeff05f 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -4215,6 +4215,10 @@ def set_max_sequence_length(self, max_length):
     def set_max_output_length(self, max_length):
         return ffc().flexflow_request_manager_set_max_output_length(
             self.handle, max_length)
+    
+    def set_max_kv_cache_size(self, max_size):
+        return ffc().flexflow_request_manager_set_max_kv_cache_size(
+            self.handle, max_size)
 
     def start_server(self, model):
         return ffc().flexflow_request_manager_start_background_server(
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 882749fa8..f7acdd54c 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2622,6 +2622,13 @@ void flexflow_request_manager_set_max_output_length(
   DEBUG_PRINT("[RequestManager] set max_output_length %d", max_output_length);
 }
 
+void flexflow_request_manager_set_max_kv_cache_size(
+    flexflow_request_manager_t handle_, int max_kv_cache_size) {
+  RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
+  handle->set_max_kv_cache_size(max_kv_cache_size);
+  DEBUG_PRINT("[RequestManager] set max_kv_cache_size %d", max_kv_cache_size);
+}
+
 void flexflow_request_manager_register_tokenizer(
     flexflow_request_manager_t handle_,
     enum ModelType model_type,
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 46f8a77c1..45fa14d36 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -108,6 +108,10 @@ int BatchConfig::max_output_length() {
   return RequestManager::get_request_manager()->get_max_output_length();
 }
 
+int BatchConfig::max_kv_cache_size() {
+  return RequestManager::get_request_manager()->get_max_kv_cache_size();
+}
+
 int BatchConfig::max_spec_tree_token_num() {
   return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a0f805b69..c5689c50c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -130,6 +130,7 @@ RequestManager::RequestManager()
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
   max_output_length = -1;
+  max_kv_cache_size = -1;
   max_tree_depth = -1;
   max_tree_width = -1;
   k = -1;
@@ -211,6 +212,15 @@ int RequestManager::get_max_output_length() {
   return max_output_length;
 }
 
+void RequestManager::set_max_kv_cache_size(int max_kv_cache_size) {
+  this->max_kv_cache_size = max_kv_cache_size;
+}
+
+int RequestManager::get_max_kv_cache_size() {
+  return max_kv_cache_size;
+}
+
+
 void RequestManager::set_decoding_mode(DecodingMode mode) {
   assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING);
   decoding_mode = mode;

From 26cbf6a0da0f3e5fed228fc109b2d07a848b5a76 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 21 Oct 2024 13:55:05 -0700
Subject: [PATCH 569/667] chore: typo

---
 src/runtime/request_manager.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0b3b8aaa2..130d10abb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -320,8 +320,8 @@ void RequestManager::set_spec_infer_old_version(bool spec_infer_old_version_) {
   spec_infer_old_version = spec_infer_old_version_;
 }
 
-void RequestManager::set_greedy_scheduler(bool greedy_scheduler_) {
-  greedy_scheduler = greedy_scheduler_;
+void RequestManager::set_greedy_schedule(bool greedy_schedule_) {
+  greedy_schedule = greedy_schedule_;
 }
 
 void RequestManager::set_equal_schedule(bool equal_schedule_) {
@@ -332,8 +332,8 @@ bool RequestManager::get_spec_infer_old_version() {
   return spec_infer_old_version;
 }
 
-bool RequestManager::get_greedy_scheduler() {
-  return greedy_scheduler;
+bool RequestManager::get_greedy_schedule() {
+  return greedy_schedule;
 }
 
 bool RequestManager::get_equal_schedule() {
@@ -3178,7 +3178,7 @@ void RequestManager::add_tokens_toward_goodput_per_request(int budget,
   Request &request = all_requests[guid];
   assert(request.status == Request::RUNNING);
   if (request.token_tree_nodes_acc_prob_pair_pq.empty()) {
-    continue;
+    return;
   }
 
   auto &pq = request.token_tree_nodes_acc_prob_pair_pq;
@@ -3191,7 +3191,7 @@ void RequestManager::add_tokens_toward_goodput_per_request(int budget,
     budget--;
   }
 
-  // Clear the priority queue in each requests
+  // Clear the priority queue in the request
   std::vector<std::pair<std::shared_ptr<TokenTreeNode>, double>>
       _prealloc_vector;
   _prealloc_vector.reserve(BatchConfig::MAX_SPEC_TREE_TOKEN_NUM);

From 3ec91d9902cee1ae13e4482110a434b9135e430a Mon Sep 17 00:00:00 2001
From: zhihao <email>
Date: Tue, 15 Oct 2024 04:06:17 +0000
Subject: [PATCH 570/667] fix tokenizer conversion

---
 python/flexflow/serve/serve.py | 41 ++++++++++++++--------------------
 src/runtime/request_manager.cc | 21 ++++++++++-------
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index ac622b333..e76584eb4 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -27,11 +27,12 @@
     MPTConfig,
 )
 from flexflow.core import *
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM
+from peft import PeftModel, PeftConfig, LoraConfig
 from huggingface_hub import HfApi
 import sys, torch, shutil, hashlib
 from typing import Union, List
-
+from huggingface_hub import snapshot_download
 
 class GenerationConfig:
     """A class to store the sampling configs."""
@@ -261,29 +262,21 @@ def download_hf_tokenizer_if_needed(self):
         )
 
         if ff_revision != latest_revision:
-            if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
-                # Local model
-                print(
-                    f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..."
-                )
-            else:
-                # Remote model
-                print(
-                    f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..."
-                )
-            # Download tokenizer from HuggingFace, or load it from the local folder
-            if self.model_type == ModelType.LLAMA:
-                hf_tokenizer = LlamaTokenizer.from_pretrained(
-                    self.model_name, use_fast=True
-                )
+            print(
+                f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..."
+            )
+            # Load/download the tokenizer files
+            target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]
+            if os.path.exists(self.model_name):
+                hf_tokenizer_path = self.model_name
             else:
-                hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            # Print log message to notify user download of tokenizer has finished
-            if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):
-                print("Done downloading tokenizer. Saving it now...")
-            # Save tokenizer
-            hf_tokenizer.save_pretrained(self.tokenizer_path)
-            print("Done saving HF tokenizer.")
+                hf_tokenizer_path = snapshot_download(repo_id=self.model_name, allow_patterns=target_tokenizer_files)
+            for file in target_tokenizer_files:
+                src_path = os.path.join(hf_tokenizer_path, file)
+                dst_path = os.path.join(self.tokenizer_path, file)
+                if os.path.exists(src_path):
+                    shutil.copy(src_path, dst_path)
+            print("Done updating HF tokenizer.")
             # Save new revision hash to file
             with open(ff_revision_file, "w+") as f:
                 f.write(latest_revision)
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 130d10abb..779b3268f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -388,14 +388,19 @@ void RequestManager::register_tokenizer(ModelType type,
   std::string tokenizer_folder =
       (!path.empty() && path.back() != '/') ? path + '/' : path;
   if (model_type == ModelType::LLAMA) {
-    bool path_to_file = !path.empty() &&
-                        (path.size() >= strlen("tokenizer.model")) &&
-                        path.find("tokenizer.model") ==
-                            (path.size() - strlen("tokenizer.model"));
-    std::string tokenizer_filepath =
-        path_to_file ? path : tokenizer_folder + "tokenizer.model";
-    this->tokenizer_ =
-        Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath));
+    std::filesystem::path tokenizer_json_path;
+    if (std::filesystem::is_directory(tokenizer_folder)) {
+      tokenizer_json_path =
+          std::filesystem::path(tokenizer_folder) / "tokenizer.json";
+    } else {
+      tokenizer_json_path = tokenizer_folder;
+    }
+    if (!std::filesystem::exists(tokenizer_json_path)) {
+      std::cerr << "Failed to open file: " << tokenizer_json_path << std::endl;
+      assert(false);
+    }
+    this->tokenizer_ = Tokenizer::FromBlobJSON(
+        LoadBytesFromFile(tokenizer_json_path.string()));
   } else if (model_type == ModelType::OPT) {
     std::string vocab_file = tokenizer_folder + "vocab.json";
     std::string merges_file = tokenizer_folder + "merges.txt";

From 3f0383e1af7b73195deda0833cbca8d92f5b697f Mon Sep 17 00:00:00 2001
From: zhihao <email>
Date: Tue, 15 Oct 2024 04:28:39 +0000
Subject: [PATCH 571/667] update

---
 python/flexflow/serve/serve.py |  2 +-
 src/runtime/request_manager.cc | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index e76584eb4..5f8b07915 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -266,7 +266,7 @@ def download_hf_tokenizer_if_needed(self):
                 f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..."
             )
             # Load/download the tokenizer files
-            target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]
+            target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt"]
             if os.path.exists(self.model_name):
                 hf_tokenizer_path = self.model_name
             else:
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 779b3268f..2b20b2b2e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -388,19 +388,27 @@ void RequestManager::register_tokenizer(ModelType type,
   std::string tokenizer_folder =
       (!path.empty() && path.back() != '/') ? path + '/' : path;
   if (model_type == ModelType::LLAMA) {
+    // try with tokenizer.json first
     std::filesystem::path tokenizer_json_path;
     if (std::filesystem::is_directory(tokenizer_folder)) {
-      tokenizer_json_path =
-          std::filesystem::path(tokenizer_folder) / "tokenizer.json";
+      tokenizer_json_path = std::filesystem::path(tokenizer_folder) / "tokenizer.json";
     } else {
       tokenizer_json_path = tokenizer_folder;
     }
-    if (!std::filesystem::exists(tokenizer_json_path)) {
-      std::cerr << "Failed to open file: " << tokenizer_json_path << std::endl;
-      assert(false);
+    if (std::filesystem::exists(tokenizer_json_path)) {
+      // load from tokenizer.json
+      this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(tokenizer_json_path.string()));
+    } else {
+      // load from tokenizer.model
+      std::filesystem::path tokenizer_model_path =
+          tokenizer_folder / "tokenizer.model";
+      if (!std::filesystem::exists(tokenizer_model_path)) {
+        std::cerr << "Failed to open file: " << tokenizer_model_path
+                  << std::endl;
+        assert(false);
+      }
+      this->tokenizer_ = Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_model_path.string()));
     }
-    this->tokenizer_ = Tokenizer::FromBlobJSON(
-        LoadBytesFromFile(tokenizer_json_path.string()));
   } else if (model_type == ModelType::OPT) {
     std::string vocab_file = tokenizer_folder + "vocab.json";
     std::string merges_file = tokenizer_folder + "merges.txt";

From 3f590ae2cd5b5b7326620aaa8abdfafffab089bc Mon Sep 17 00:00:00 2001
From: zhihao <email>
Date: Tue, 15 Oct 2024 13:54:40 +0000
Subject: [PATCH 572/667] update

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c151cdfbc..a5374939b 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -415,6 +415,7 @@ class RequestManager {
   ModelType model_type;
   int bos_token_id;
   int eos_token_id;
+  bool old_llama_tokenizer = false;
   std::string output_filepath;
   std::queue<Request> pending_request_queue;
   std::unordered_map<RequestGuid, Request> all_requests;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2b20b2b2e..be5454a06 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -391,23 +391,32 @@ void RequestManager::register_tokenizer(ModelType type,
     // try with tokenizer.json first
     std::filesystem::path tokenizer_json_path;
     if (std::filesystem::is_directory(tokenizer_folder)) {
-      tokenizer_json_path = std::filesystem::path(tokenizer_folder) / "tokenizer.json";
+      tokenizer_json_path =
+          std::filesystem::path(tokenizer_folder) / "tokenizer.json";
     } else {
       tokenizer_json_path = tokenizer_folder;
     }
     if (std::filesystem::exists(tokenizer_json_path)) {
+      old_llama_tokenizer = true;
       // load from tokenizer.json
-      this->tokenizer_ = Tokenizer::FromBlobJSON(LoadBytesFromFile(tokenizer_json_path.string()));
+      this->tokenizer_ = Tokenizer::FromBlobJSON(
+          LoadBytesFromFile(tokenizer_json_path.string()));
     } else {
       // load from tokenizer.model
-      std::filesystem::path tokenizer_model_path =
-          tokenizer_folder / "tokenizer.model";
+      std::filesystem::path tokenizer_model_path;
+      if (std::filesystem::is_directory(tokenizer_folder)) {
+        tokenizer_model_path =
+            std::filesystem::path(tokenizer_folder) / "tokenizer.model";
+      } else {
+        tokenizer_model_path = tokenizer_folder;
+      }
       if (!std::filesystem::exists(tokenizer_model_path)) {
         std::cerr << "Failed to open file: " << tokenizer_model_path
                   << std::endl;
         assert(false);
       }
-      this->tokenizer_ = Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_model_path.string()));
+      this->tokenizer_ = Tokenizer::FromBlobSentencePiece(
+          LoadBytesFromFile(tokenizer_model_path.string()));
     }
   } else if (model_type == ModelType::OPT) {
     std::string vocab_file = tokenizer_folder + "vocab.json";

From 14eb152bcfe0fd1f689d8183f9f7cc894a34fcbc Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 14:57:52 +0000
Subject: [PATCH 573/667] update

---
 include/flexflow/batch_config.h    |  2 +-
 include/flexflow/inference.h       |  6 +++--
 include/flexflow/request_manager.h |  1 +
 inference/spec_infer/spec_infer.cc |  2 +-
 src/runtime/request_manager.cc     | 37 +++++++++++++++---------------
 5 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 4589f91f3..980a02e42 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -87,7 +87,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
-  inline static int const MAX_NUM_TOKENS = 1024;
+  inline static int const MAX_NUM_TOKENS = 2048;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
   inline static int const MAX_TREE_DEPTH = 8;
   inline static int const MAX_TREE_WIDTH = 16;
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 5a6dd8f2b..7277c7e2f 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -45,14 +45,16 @@ struct GenerationConfig {
 
 struct GenerationRequest {
   std::string prompt;
+  bool add_special_tokens = true;
   double slo_ratio;
   double emission_time_ms;
 
   GenerationRequest(std::string const &prompt_,
                     double slo_ratio_,
-                    double emission_time_ms_)
+                    double emission_time_ms_,
+                    bool add_special_tokens_ = true)
       : prompt(prompt_), slo_ratio(slo_ratio_),
-        emission_time_ms(emission_time_ms_) {}
+        emission_time_ms(emission_time_ms_), add_special_tokens(add_special_tokens_) {}
 };
 
 struct GenerationResult {
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index a5374939b..d5ebcf4c8 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -140,6 +140,7 @@ struct Request {
   int ssm_prefill_len = 0;
   int llm_prefill_len = 0;
   bool attained = true;
+  bool add_special_tokens = true;
 
   int first_token_offset_in_batch = 0;
   int num_tokens_in_batch = 0;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 7b6bad7c4..35cff67ce 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -613,7 +613,7 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0));
+        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, false));
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index be5454a06..be81cd7a2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -385,8 +385,8 @@ void RequestManager::register_tokenizer(ModelType type,
   this->model_type = type;
   this->bos_token_id = bos_token_id;
   this->eos_token_id = eos_token_id;
-  std::string tokenizer_folder =
-      (!path.empty() && path.back() != '/') ? path + '/' : path;
+  std::filesystem::path tokenizer_folder(path);
+
   if (model_type == ModelType::LLAMA) {
     // try with tokenizer.json first
     std::filesystem::path tokenizer_json_path;
@@ -397,7 +397,6 @@ void RequestManager::register_tokenizer(ModelType type,
       tokenizer_json_path = tokenizer_folder;
     }
     if (std::filesystem::exists(tokenizer_json_path)) {
-      old_llama_tokenizer = true;
       // load from tokenizer.json
       this->tokenizer_ = Tokenizer::FromBlobJSON(
           LoadBytesFromFile(tokenizer_json_path.string()));
@@ -415,25 +414,23 @@ void RequestManager::register_tokenizer(ModelType type,
                   << std::endl;
         assert(false);
       }
+      old_llama_tokenizer = true;
       this->tokenizer_ = Tokenizer::FromBlobSentencePiece(
           LoadBytesFromFile(tokenizer_model_path.string()));
     }
   } else if (model_type == ModelType::OPT) {
-    std::string vocab_file = tokenizer_folder + "vocab.json";
-    std::string merges_file = tokenizer_folder + "merges.txt";
-    std::string added_tokens_file =
-        tokenizer_folder + "special_tokens_map.json";
-    std::filesystem::path path1(vocab_file);
-    std::filesystem::path path2(merges_file);
-    std::filesystem::path path3(added_tokens_file);
-    assert(std::filesystem::exists(path1) &&
+    std::filesystem::path vocab_file = tokenizer_folder / "vocab.json";
+    std::filesystem::path merges_file = tokenizer_folder / "merges.txt";
+    std::filesystem::path added_tokens_file =
+        tokenizer_folder / "special_tokens_map.json";
+    assert(std::filesystem::exists(vocab_file) &&
            "Vocab file vocab.json does not exist at the specified path");
-    assert(std::filesystem::exists(path2) &&
+    assert(std::filesystem::exists(merges_file) &&
            "Merge file merges.txt does not exist at the specified path");
     // opt_tokenizer = new OptTokenizer(vocab_file, merges_file);
-    std::string vocab = LoadBytesFromFile(path1.string());
-    std::string merges = LoadBytesFromFile(path2.string());
-    std::string added_tokens = LoadBytesFromFile(path3.string());
+    std::string vocab = LoadBytesFromFile(vocab_file.string());
+    std::string merges = LoadBytesFromFile(merges_file.string());
+    std::string added_tokens = LoadBytesFromFile(added_tokens_file.string());
 
     this->tokenizer_ =
         Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens);
@@ -477,7 +474,8 @@ RequestManager::RequestGuid
   Request request;
   request.status = Request::PENDING;
   request.guid = next_available_guid++;
-  if (bos_token_id >= 0 && model_type != ModelType::FALCON) {
+  request.add_special_tokens = req.add_special_tokens;
+  if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
@@ -701,8 +699,9 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   } else {
     eos_it = request.tokens.end();
   }
-  std::string output =
-      this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
+  // std::string output =
+  //     this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
+  std::string output = this->tokenizer_->Decode(request.tokens);
 
   {
     std::lock_guard<std::mutex> const lock(request_result_mutex);
@@ -752,7 +751,7 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
           << std::endl;
     }
-    *os << "<boq>" << output << "<eoq>" << std::endl << std::endl;
+    *os << output << std::endl << std::endl;
 
     if (!output_filepath.empty()) {
       output_file.close();

From 13615f407da065c8757df15d6ea33aaade9458ae Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 15:51:45 +0000
Subject: [PATCH 574/667] add special tokens

---
 inference/incr_decoding/incr_decoding.cc | 15 +++++++++++----
 inference/spec_infer/spec_infer.cc       | 15 +++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index c342a7b70..f525d2408 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -58,7 +58,8 @@ void parse_input_args(char **argv,
                       int &ssm_spec_latency_ms,
                       int &llm_verify_latency_ms,
                       double &request_per_second,
-                      std::string &emission_file_path) {
+                      std::string &emission_file_path,
+                      bool &add_special_tokens) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -165,6 +166,10 @@ void parse_input_args(char **argv,
       emission_file_path = std::string(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--no-special-tokens")) {
+      add_special_tokens = false;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -208,6 +213,7 @@ void FlexFlow::top_level_task(Task const *task,
   int ssm_spec_latency_ms = 20;
   int llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
+  bool add_special_tokens = true;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -235,7 +241,8 @@ void FlexFlow::top_level_task(Task const *task,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
                    request_per_second,
-                   emission_file_path);
+                   emission_file_path,
+                   add_special_tokens);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -392,7 +399,7 @@ void FlexFlow::top_level_task(Task const *task,
       }
       for (size_t i = 1; i < prompt_json.size(); ++i) {
         requests.push_back(GenerationRequest(
-            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0, add_special_tokens));
       }
       PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
       // ConstantEmissionMachine emission_machine(-1, slo_ratios);
@@ -407,7 +414,7 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0));
+        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 35cff67ce..4318bab5f 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -83,7 +83,8 @@ void parse_input_args(char **argv,
                       bool &spec_infer_old_version,
                       bool &greedy_schedule,
                       bool &equal_schedule,
-                      std::string &emission_file_path) {
+                      std::string &emission_file_path,
+                      bool &add_special_tokens) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -220,6 +221,10 @@ void parse_input_args(char **argv,
       emission_file_path = std::string(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--no-special-tokens")) {
+      add_special_tokens = false;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -395,6 +400,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool spec_infer_old_version = false;
   bool greedy_schedule = false;
   bool equal_schedule = false;
+  bool add_special_tokens = true;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -427,7 +433,8 @@ void FlexFlow::top_level_task(Task const *task,
                    spec_infer_old_version,
                    greedy_schedule,
                    equal_schedule,
-                   emission_file_path);
+                   emission_file_path,
+                   add_special_tokens);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -598,7 +605,7 @@ void FlexFlow::top_level_task(Task const *task,
       }
       for (size_t i = 1; i < prompt_json.size(); ++i) {
         requests.push_back(GenerationRequest(
-            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0));
+            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0, add_special_tokens));
       }
       PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
       //   ConstantEmissionMachine emission_machine(-1, slo_ratios);
@@ -613,7 +620,7 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, false));
+        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }

From 5a0c1ca31dfeb4a3a09388bb33677277610fc5dc Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Sun, 29 Sep 2024 06:28:22 +0000
Subject: [PATCH 575/667] Update LLAMA tokenizer (#1524)

* fix tokenizer conversion

* update

* update

* update

* fix

* fix

* lint

* simplify api

* fix

* fix

* fix

* update to 12.1 (#1512)

* fix deadlock?

* remove barrier where not strictly needed

---------

Co-authored-by: zhihao <email>
---
 .gitignore                                    |   2 +
 include/flexflow/flexflow_c.h                 |  36 +++++
 include/flexflow/inference.h                  |  39 +++++-
 include/flexflow/layer.h                      |   3 +
 include/flexflow/model.h                      |  83 ++++++------
 include/flexflow/operator.h                   |  16 ++-
 .../ops/inc_multihead_self_attention.h        |  12 +-
 .../ops/inc_multihead_self_attention_params.h |   6 +-
 .../ops/spec_inc_multihead_self_attention.h   |   8 +-
 ...spec_inc_multihead_self_attention_params.h |   4 +-
 .../ops/tree_inc_multihead_self_attention.h   |   8 +-
 ...tree_inc_multihead_self_attention_params.h |   5 +-
 inference/models/falcon.cc                    |   6 +-
 inference/models/falcon.h                     |  24 +++-
 inference/models/llama.cc                     |   6 +-
 inference/models/llama.h                      |  24 +++-
 inference/models/mpt.cc                       |   6 +-
 inference/models/mpt.h                        |   2 +
 inference/models/opt.cc                       |  12 +-
 inference/models/opt.h                        |   5 +-
 inference/models/starcoder.cc                 |   2 +-
 inference/models/starcoder.h                  |   2 +
 python/flexflow/core/flexflow_cffi.py         | 124 ++++++++++++++----
 python/flexflow/serve/models/falcon.py        |  22 ++--
 python/flexflow/serve/models/llama.py         |  22 ++--
 python/flexflow/serve/models/mpt.py           |  12 +-
 python/flexflow/serve/models/opt.py           |  12 +-
 python/flexflow/serve/models/starcoder.py     |  10 +-
 src/c/flexflow_c.cc                           |  90 ++++++++++++-
 src/ops/inc_multihead_self_attention.cc       |  69 +++++++---
 src/ops/inc_multihead_self_attention.cpp      | 123 ++++++++---------
 src/ops/inc_multihead_self_attention.cu       |  10 +-
 src/ops/spec_inc_multihead_self_attention.cc  |  69 +++++++---
 src/ops/spec_inc_multihead_self_attention.cpp |   2 +-
 src/ops/spec_inc_multihead_self_attention.cu  |   4 +-
 src/ops/tree_inc_multihead_self_attention.cc  |  71 +++++++---
 src/ops/tree_inc_multihead_self_attention.cpp |   2 +-
 src/ops/tree_inc_multihead_self_attention.cu  |   4 +-
 src/runtime/graph.cc                          |  84 ++++++++++--
 src/runtime/layer.cc                          |  17 +++
 40 files changed, 769 insertions(+), 289 deletions(-)

diff --git a/.gitignore b/.gitignore
index 34ecb8e0d..f21d30b2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -190,3 +190,5 @@ python/flexflow/version.txt
 
 inference_tensors
 tests/inference/python_test_configs/*.json
+
+core.*
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 1da5f61d6..9423d7b4c 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -445,6 +445,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -466,6 +472,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -487,6 +499,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -508,6 +526,12 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -530,6 +554,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -552,6 +582,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 7277c7e2f..8450f610d 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -160,8 +160,43 @@ class TraceEmissionMachine : public EmissionMachine {
   double sample_slo_ratio() override;
 };
 
-#include <string>
-#include <vector>
+struct RotaryEmbeddingMeta {
+  bool apply_rotary_embedding = false;
+  float rope_theta = 10000.0f;
+  std::string rope_type = "default";
+  float factor = 8.0f;
+  float low_freq_factor = 1.0f;
+  float high_freq_factor = 4.0f;
+  int original_max_position_embeddings = 8192;
+
+  RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false,
+                      float rope_theta_ = 10000.0f,
+                      std::string rope_type_ = "default",
+                      float factor_ = 8.0f,
+                      float low_freq_factor_ = 1.0f,
+                      float high_freq_factor_ = 4.0f,
+                      int original_max_position_embeddings_ = 8192)
+      : apply_rotary_embedding(apply_rotary_embedding_),
+        rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_),
+        low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_),
+        original_max_position_embeddings(original_max_position_embeddings_) {}
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  RotaryEmbeddingMeta const &meta) {
+    os << std::boolalpha // To print bool as true/false instead of 1/0
+       << "RotaryEmbeddingMeta {\n"
+       << "  apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n"
+       << "  rope_theta: " << meta.rope_theta << ",\n"
+       << "  rope_type: \"" << meta.rope_type << "\",\n"
+       << "  factor: " << meta.factor << ",\n"
+       << "  low_freq_factor: " << meta.low_freq_factor << ",\n"
+       << "  high_freq_factor: " << meta.high_freq_factor << ",\n"
+       << "  original_max_position_embeddings: "
+       << meta.original_max_position_embeddings << "\n"
+       << "}";
+    return os;
+  }
+};
 
 std::string join_path(std::vector<std::string> const &paths);
 
diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h
index 69a57e4e1..9d9045a44 100644
--- a/include/flexflow/layer.h
+++ b/include/flexflow/layer.h
@@ -32,11 +32,13 @@ class Layer {
   void add_float_property(std::string const &key, float value);
   void add_int_vector_property(std::string const &key,
                                std::vector<int> const &value);
+  void add_string_property(std::string const &key, std::string const &value);
   void add_initializer(std::string const &key, Initializer *initializer);
   bool get_int_property(std::string const &key, long long &value) const;
   bool get_float_property(std::string const &key, float &value) const;
   bool get_int_vector_property(std::string const &key,
                                std::vector<int> &value) const;
+  bool get_string_property(std::string const &key, std::string &value) const;
   bool get_initializer(std::string const &key, Initializer *&initializer) const;
   Tensor get_parameter(int index);
   void print();
@@ -59,6 +61,7 @@ class Layer {
   std::unordered_map<std::string, float> float_properties;
   std::unordered_map<std::string, Initializer *> initializers;
   std::unordered_map<std::string, std::vector<int>> int_vector_properties;
+  std::unordered_map<std::string, std::string> string_properties;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index e7974756b..59477ed00 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -709,43 +709,44 @@ class FFModel {
                              DataType data_type = DT_NONE,
                              Initializer *kernel_initializer = NULL,
                              char const *name = NULL);
-  Tensor inc_multihead_self_attention(Tensor const input,
-                                      int embed_dim,
-                                      int num_heads,
-                                      int kdim = 0,
-                                      int vdim = 0,
-                                      float dropout = 0.0f,
-                                      bool bias = false,
-                                      bool add_bias_kv = false,
-                                      bool add_zero_attn = false,
-                                      DataType data_type = DT_NONE,
-                                      Initializer *kernel_initializer = NULL,
-                                      bool apply_rotary_embedding = false,
-                                      bool scaling_query = false,
-                                      float scaling_factor = 1.0f,
-                                      bool qk_prod_scaling = true,
-                                      bool position_bias = false,
-                                      bool streaming_cache = false,
-                                      char const *name = NULL);
-  Tensor
-      spec_inc_multihead_self_attention(Tensor const input,
-                                        int embed_dim,
-                                        int num_heads,
-                                        int kdim = 0,
-                                        int vdim = 0,
-                                        float dropout = 0.0f,
-                                        bool bias = false,
-                                        bool add_bias_kv = false,
-                                        bool add_zero_attn = false,
-                                        DataType data_type = DT_NONE,
-                                        Initializer *kernel_initializer = NULL,
-                                        bool apply_rotary_embedding = false,
-                                        bool scaling_query = false,
-                                        float scaling_factor = 1.0f,
-                                        bool qk_prod_scaling = true,
-                                        bool position_bias = false,
-                                        bool streaming_cache = false,
-                                        char const *name = NULL);
+  Tensor inc_multihead_self_attention(
+      const Tensor input,
+      int embed_dim,
+      int num_heads,
+      int kdim = 0,
+      int vdim = 0,
+      float dropout = 0.0f,
+      bool bias = false,
+      bool add_bias_kv = false,
+      bool add_zero_attn = false,
+      DataType data_type = DT_NONE,
+      Initializer *kernel_initializer = NULL,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
+      bool scaling_query = false,
+      float scaling_factor = 1.0f,
+      bool qk_prod_scaling = true,
+      bool position_bias = false,
+      bool streaming_cache = false,
+      char const *name = NULL);
+  Tensor spec_inc_multihead_self_attention(
+      const Tensor input,
+      int embed_dim,
+      int num_heads,
+      int kdim = 0,
+      int vdim = 0,
+      float dropout = 0.0f,
+      bool bias = false,
+      bool add_bias_kv = false,
+      bool add_zero_attn = false,
+      DataType data_type = DT_NONE,
+      Initializer *kernel_initializer = NULL,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
+      bool scaling_query = false,
+      float scaling_factor = 1.0f,
+      bool qk_prod_scaling = true,
+      bool position_bias = false,
+      bool streaming_cache = false,
+      char const *name = NULL);
   Tensor inc_multihead_self_attention_verify(
       Tensor const input,
       int embed_dim,
@@ -758,7 +759,7 @@ class FFModel {
       bool add_zero_attn = false,
       DataType data_type = DT_NONE,
       Initializer *kernel_initializer = NULL,
-      bool apply_rotary_embedding = false,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
       bool scaling_query = false,
       float scaling_factor = 1.0f,
       bool qk_prod_scaling = true,
@@ -776,7 +777,7 @@ class FFModel {
                                    bool add_zero_attn = false,
                                    DataType data_type = DT_NONE,
                                    Initializer *kernel_initializer = NULL,
-                                   bool apply_rotary_embedding = false,
+                                   RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
                                    bool scaling_query = false,
                                    float scaling_factor = 1.0f,
                                    bool qk_prod_scaling = true,
@@ -796,7 +797,7 @@ class FFModel {
                                          bool add_zero_attn = false,
                                          DataType data_type = DT_NONE,
                                          Initializer *kernel_initializer = NULL,
-                                         bool apply_rotary_embedding = false,
+                                         RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
                                          bool scaling_query = false,
                                          float scaling_factor = 1.0f,
                                          bool qk_prod_scaling = true,
@@ -816,7 +817,7 @@ class FFModel {
       bool add_zero_attn = false,
       DataType data_type = DT_NONE,
       Initializer *kernel_initializer = NULL,
-      bool apply_rotary_embedding = false,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
       bool scaling_query = false,
       float scaling_factor = 1.0f,
       bool qk_prod_scaling = true,
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 311699d92..34387b87b 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -304,8 +304,20 @@ class Op {
         assert(false && "Tensor data type not supported");
       }
     }
-    // only dump the weights once
-    if (m->decoding_step == 0) {
+
+    // only dump the weights in the forward pass, at the first step
+    // note that we do not save the weight gradients, since we only support
+    // finetuning LoRA weights, which are not FF tensors.
+    // Set FF_DEBG_NO_WEIGHTS=1 or to FF_DEBG_NO_WEIGHTS=true to disable saving
+    // weights
+    bool do_not_save_weights =
+        (std::getenv("FF_DEBG_NO_WEIGHTS") &&
+         (std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" ||
+          std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true"));
+    if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) {
+      fs::path dst_filepath_weights =
+          get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
+          layername;
       for (int i = 0; i < weight_tensors.size(); i++) {
         std::string filename = base_filepath + "_weight_" + std::to_string(i);
         if (weight_tensors[i].data_type == DT_FLOAT) {
diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h
index 8db1c072d..8bc3b15a3 100644
--- a/include/flexflow/ops/inc_multihead_self_attention.h
+++ b/include/flexflow/ops/inc_multihead_self_attention.h
@@ -40,7 +40,7 @@ class IncMultiHeadSelfAttention : public Op {
                             bool _qkv_bias,
                             bool _final_bias,
                             bool _add_zero_attn,
-                            bool _apply_rotary_embedding,
+                            RotaryEmbeddingMeta _rotary_embedding_meta,
                             bool _scaling_query,
                             float _scaling_factor,
                             bool _qk_prod_scaling,
@@ -63,7 +63,7 @@ class IncMultiHeadSelfAttention : public Op {
                             bool _qkv_bias,
                             bool _final_bias,
                             bool _add_zero_attn,
-                            bool _apply_rotary_embedding,
+                            RotaryEmbeddingMeta _rotary_embedding_meta,
                             bool _scaling_query,
                             float _scaling_factor,
                             bool _qk_prod_scaling,
@@ -129,8 +129,8 @@ class IncMultiHeadSelfAttention : public Op {
   int num_q_heads, num_kv_heads, tensor_parallelism_degree;
   float dropout, scaling_factor;
   bool qkv_bias;
-  bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
-      qk_prod_scaling, position_bias;
+  bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
@@ -153,7 +153,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
                                 int _qk_dim,
                                 int _v_dim,
                                 int _o_dim,
-                                bool _apply_rotary_embedding,
+                                RotaryEmbeddingMeta _rotary_embedding_meta,
                                 bool _qkv_bias,
                                 bool _scaling_query,
                                 bool _qk_prod_scaling,
@@ -180,7 +180,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
   int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
       local_hidden_size;
   bool *has_load_weights;
-  bool *apply_rotary_embedding;
+  RotaryEmbeddingMeta *rotary_embedding_meta;
   bool *qkv_bias;
   bool *final_bias;
   bool *scaling_query;
diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h
index 7c259a0a9..809c4f19e 100644
--- a/include/flexflow/ops/inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -3,6 +3,7 @@
 
 #include "flexflow/ffconst.h"
 #include "flexflow/fftype.h"
+#include "flexflow/inference.h"
 #include "flexflow/parallel_tensor.h"
 
 namespace FlexFlow {
@@ -12,8 +13,9 @@ struct IncMultiHeadSelfAttentionParams {
   int embed_dim, num_q_heads, kdim, vdim, num_kv_heads,
       tensor_parallelism_degree;
   float dropout, scaling_factor;
-  bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-      scaling_query, qk_prod_scaling, position_bias;
+  bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
+      position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   DataType quantization_type;
   bool offload, streaming_cache;
   char name[MAX_OPNAME];
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index b08e161c5..625cc9ee2 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -36,7 +36,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _qkv_bias,
                                 bool _final_bias,
                                 bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
+                                RotaryEmbeddingMeta _rotary_embedding_meta,
                                 bool _scaling_query,
                                 float _scaling_factor,
                                 bool _qk_prod_scaling,
@@ -56,7 +56,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _qkv_bias,
                                 bool _final_bias,
                                 bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
+                                RotaryEmbeddingMeta _rotary_embedding_meta,
                                 bool _scaling_query,
                                 float _scaling_factor,
                                 bool _qk_prod_scaling,
@@ -122,8 +122,8 @@ class SpecIncMultiHeadSelfAttention : public Op {
   int num_q_heads, num_kv_heads, tensor_parallelism_degree;
   float dropout, scaling_factor;
   bool qkv_bias;
-  bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
-      qk_prod_scaling, position_bias;
+  bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   bool streaming_cache;
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
index 2def2a51c..f79b3c6aa 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -11,8 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams {
   LayerID layer_guid;
   int embed_dim, num_q_heads, num_kv_heads, kdim, vdim;
   float dropout, scaling_factor;
-  bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-      scaling_query, qk_prod_scaling, position_bias;
+  bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   bool streaming_cache;
   char name[MAX_OPNAME];
   bool is_valid(ParallelTensorShape const &) const;
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h
index 6126183d1..3edf4dbd7 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h
@@ -36,7 +36,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                 bool _qkv_bias,
                                 bool _final_bias,
                                 bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
+                                RotaryEmbeddingMeta _rotary_embedding_meta,
                                 bool _scaling_query,
                                 float _scaling_factor,
                                 bool _qk_prod_scaling,
@@ -58,7 +58,7 @@ class TreeIncMultiHeadSelfAttention : public Op {
                                 bool _qkv_bias,
                                 bool _final_bias,
                                 bool _add_zero_attn,
-                                bool _apply_rotary_embedding,
+                                RotaryEmbeddingMeta _rotary_embedding_meta,
                                 bool _scaling_query,
                                 float _scaling_factor,
                                 bool _qk_prod_scaling,
@@ -124,8 +124,8 @@ class TreeIncMultiHeadSelfAttention : public Op {
   int num_q_heads, num_kv_heads, tensor_parallelism_degree;
   float dropout, scaling_factor;
   bool qkv_bias;
-  bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
-      qk_prod_scaling, position_bias;
+  bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   int hidden_size, qk_dim, v_dim, o_dim;
   int qoSeqLength, kvSeqLength;
   DataType quantization_type;
diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h
index d1a51b8b8..3906210d4 100644
--- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h
@@ -12,8 +12,9 @@ struct TreeIncMultiHeadSelfAttentionParams {
   int embed_dim, num_q_heads, kdim, vdim, num_kv_heads,
       tensor_parallelism_degree;
   float dropout, scaling_factor;
-  bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-      scaling_query, qk_prod_scaling, position_bias;
+  bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
+      position_bias;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   DataType quantization_type;
   bool offload;
   char name[MAX_OPNAME];
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 4bd71421d..d6b6e6a14 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -112,7 +112,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
+            falcon_config.rotary_embedding_meta,
             false,   /*scaling query*/
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
@@ -138,7 +138,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
+            falcon_config.rotary_embedding_meta,
             false,   /*scaling query*/
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
@@ -163,7 +163,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
+            falcon_config.rotary_embedding_meta,
             false,   /*scaling query*/
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
diff --git a/inference/models/falcon.h b/inference/models/falcon.h
index e7aa4fecf..393462633 100644
--- a/inference/models/falcon.h
+++ b/inference/models/falcon.h
@@ -50,6 +50,26 @@ class FALCON {
                         : model_config["num_hidden_layers"];
           parallel_attn = model_config["parallel_attn"];
           vocab_size = model_config["vocab_size"];
+          rotary_embedding_meta.apply_rotary_embedding = true;
+          if (model_config.find("rope_theta") != model_config.end()) {
+            rotary_embedding_meta.rope_theta = model_config["rope_theta"];
+          } else {
+            rotary_embedding_meta.rope_theta = 10000.0f;
+          }
+          if (model_config.find("scaling_factor") != model_config.end() &&
+              !model_config["scaling_factor"].is_null()) {
+            rotary_embedding_meta.rope_type =
+                model_config["scaling_factor"]["rope_type"];
+            rotary_embedding_meta.factor =
+                model_config["scaling_factor"]["factor"];
+            rotary_embedding_meta.low_freq_factor =
+                model_config["scaling_factor"]["low_freq_factor"];
+            rotary_embedding_meta.high_freq_factor =
+                model_config["scaling_factor"]["high_freq_factor"];
+            rotary_embedding_meta.original_max_position_embeddings =
+                model_config["scaling_factor"]
+                            ["original_max_position_embeddings"];
+          }
         } catch (json::exception const &e) {
           std::cerr << "Error parsing JSON file: " << e.what() << std::endl;
           assert(false);
@@ -75,7 +95,8 @@ class FALCON {
       std::cout << "\tn_layer: " << n_layer << std::endl;
       std::cout << "\tparallel_attn: " << parallel_attn << std::endl;
       std::cout << "\tvocab_size: " << vocab_size << std::endl;
-
+      std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta
+                << std::endl;
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
       std::cout << "\tk_of_arg_topk: " << k_of_arg_topk << std::endl;
@@ -84,6 +105,7 @@ class FALCON {
     bool bias, multi_query, parallel_attn;
     int hidden_size, n_head, n_head_kv, n_layer, vocab_size;
     float layer_norm_epsilon;
+    RotaryEmbeddingMeta rotary_embedding_meta;
     // int max_seq_len, max_num_tokens;
     int k_of_arg_topk;
   };
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 5a3c6ed00..a9a111a2f 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -109,7 +109,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
+            llama_config.rotary_embedding_meta,
             false,   /*scaling query*/
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
@@ -134,7 +134,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
-            true,    /*apply_rotary_embedding*/
+            llama_config.rotary_embedding_meta,
             false,   /*scaling query*/
             1.0f,    /*scaling factor*/
             true,    /*qk_prod_scaling*/
@@ -158,7 +158,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             false,           /*add_zero_attn*/
             DT_NONE,         /*data_type*/
             nullptr,         /*kernel_initializer*/
-            true,            /*apply_rotary_embedding*/
+            llama_config.rotary_embedding_meta,
             false,           /*scaling query*/
             1.0f,            /*scaling factor*/
             true,            /*qk_prod_scaling*/
diff --git a/inference/models/llama.h b/inference/models/llama.h
index a5b2c4a40..3f11ca96d 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -44,6 +44,26 @@ class LLAMA {
           hidden_size = model_config["hidden_size"];
           rms_norm_eps = model_config["rms_norm_eps"];
           intermediate_size = model_config["intermediate_size"];
+          rotary_embedding_meta.apply_rotary_embedding = true;
+          if (model_config.find("rope_theta") != model_config.end()) {
+            rotary_embedding_meta.rope_theta = model_config["rope_theta"];
+          } else {
+            rotary_embedding_meta.rope_theta = 10000.0f;
+          }
+          if (model_config.find("scaling_factor") != model_config.end() &&
+              !model_config["scaling_factor"].is_null()) {
+            rotary_embedding_meta.rope_type =
+                model_config["scaling_factor"]["rope_type"];
+            rotary_embedding_meta.factor =
+                model_config["scaling_factor"]["factor"];
+            rotary_embedding_meta.low_freq_factor =
+                model_config["scaling_factor"]["low_freq_factor"];
+            rotary_embedding_meta.high_freq_factor =
+                model_config["scaling_factor"]["high_freq_factor"];
+            rotary_embedding_meta.original_max_position_embeddings =
+                model_config["scaling_factor"]
+                            ["original_max_position_embeddings"];
+          }
         } catch (json::exception const &e) {
           std::cerr << "Error parsing LLAMA config from JSON file: " << e.what()
                     << std::endl;
@@ -68,7 +88,8 @@ class LLAMA {
       std::cout << "\thidden_size: " << hidden_size << std::endl;
       std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl;
       std::cout << "\tintermediate_size: " << intermediate_size << std::endl;
-
+      std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta
+                << std::endl;
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
       std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl;
@@ -79,6 +100,7 @@ class LLAMA {
     int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads,
         hidden_size, intermediate_size;
     float rms_norm_eps;
+    RotaryEmbeddingMeta rotary_embedding_meta;
   };
 
   static void create_llama_model(FFModel &ff,
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index d13fb6bae..fd49f2b84 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -108,7 +108,7 @@ void MPT::create_mpt_model(FFModel &ff,
             false,
             DT_NONE, /*data_type*/
             NULL,
-            false,
+            mpt_config.rotary_embedding_meta,
             /*scaling query*/ true,
             /*scaling factor*/
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
@@ -132,7 +132,7 @@ void MPT::create_mpt_model(FFModel &ff,
             false,
             DT_NONE, /*data_type*/
             NULL,
-            false,
+            mpt_config.rotary_embedding_meta,
             /*scaling query*/ true,
             /*scaling factor*/
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
@@ -156,7 +156,7 @@ void MPT::create_mpt_model(FFModel &ff,
             false,
             DT_NONE, /*data_type*/
             NULL,
-            false,
+            mpt_config.rotary_embedding_meta,
             /*scaling query*/ true,
             /*scaling factor*/
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
diff --git a/inference/models/mpt.h b/inference/models/mpt.h
index 8a42b0e2d..bd7a9410f 100644
--- a/inference/models/mpt.h
+++ b/inference/models/mpt.h
@@ -37,6 +37,7 @@ class MPT {
           n_heads = model_config["n_heads"];
           n_layers = model_config["n_layers"];
           vocab_size = model_config["vocab_size"];
+          rotary_embedding_meta.apply_rotary_embedding = false;
         } catch (json::exception const &e) {
           std::cerr << "Error parsing JSON file: " << e.what() << std::endl;
           assert(false);
@@ -62,6 +63,7 @@ class MPT {
     // int max_seq_len, max_num_tokens;
     int k_of_arg_topk;
     int hidden_size, n_heads, n_layers, vocab_size;
+    RotaryEmbeddingMeta rotary_embedding_meta;
   };
 
   static void create_mpt_model(FFModel &ff,
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 837c8de0c..4b7476ce3 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -116,8 +116,8 @@ void OPT::create_opt_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
-            false,   /*apply_rotary_embedding*/
-            true,    /*scaling query*/
+            opt_config.rotary_embedding_meta,
+            true, /*scaling query*/
             pow((opt_config.hidden_size / opt_config.num_attention_heads),
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
@@ -140,8 +140,8 @@ void OPT::create_opt_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
-            false,   /*apply_rotary_embedding*/
-            true,    /*scaling query*/
+            opt_config.rotary_embedding_meta,
+            true, /*scaling query*/
             pow((opt_config.hidden_size / opt_config.num_attention_heads),
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
@@ -164,8 +164,8 @@ void OPT::create_opt_model(FFModel &ff,
             false,   /*add_zero_attn*/
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
-            false,   /*apply_rotary_embedding*/
-            true,    /*scaling query*/
+            opt_config.rotary_embedding_meta,
+            true, /*scaling query*/
             pow((opt_config.hidden_size / opt_config.num_attention_heads),
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
diff --git a/inference/models/opt.h b/inference/models/opt.h
index bc142d7d0..90443e872 100644
--- a/inference/models/opt.h
+++ b/inference/models/opt.h
@@ -45,6 +45,7 @@ class OPT {
           num_hidden_layers = model_config["num_hidden_layers"];
           vocab_size = model_config["vocab_size"];
           word_embed_proj_dim = model_config["word_embed_proj_dim"];
+          rotary_embedding_meta.apply_rotary_embedding = false;
         } catch (json::exception const &e) {
           std::cerr << "Error parsing JSON file: " << e.what() << std::endl;
           assert(false);
@@ -77,7 +78,8 @@ class OPT {
       std::cout << "\tvocab_size: " << vocab_size << std::endl;
       std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim
                 << std::endl;
-
+      std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta
+                << std::endl;
       // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl;
       // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl;
       std::cout << "\tk_of_arg_topk : " << k_of_arg_topk << std::endl;
@@ -89,6 +91,7 @@ class OPT {
     float dropout;
     int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads,
         num_hidden_layers, vocab_size, word_embed_proj_dim;
+    RotaryEmbeddingMeta rotary_embedding_meta;
   };
 
   static void create_opt_model(FFModel &ff,
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index dbce90b7c..7a6e679df 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -120,7 +120,7 @@ void STARCODER::create_starcoder_model(
             false,                       /*add_zero_attn*/
             DT_NONE,                     /*data_type*/
             nullptr,                     /*kernel_initializer*/
-            false,                       /*apply_rotary_embedding*/
+            startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/
             false,                       /*scaling query*/
             1.0f,                        /*scaling factor*/
             true,                        /*qk_prod_scaling*/
diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h
index e56e0f098..7241acde3 100644
--- a/inference/models/starcoder.h
+++ b/inference/models/starcoder.h
@@ -41,6 +41,7 @@ class STARCODER {
           intermediate_size = model_config["n_inner"];
           dropout_p = model_config["attn_pdrop"];
           max_position_embeddings = model_config["n_positions"];
+          rotary_embedding_meta.apply_rotary_embedding = false;
         } catch (json::exception const &e) {
           std::cerr << "Error parsing STARCODER config from JSON file: "
                     << e.what() << std::endl;
@@ -63,6 +64,7 @@ class STARCODER {
     int num_hidden_layers, vocab_size, num_attention_heads, hidden_size,
         intermediate_size, max_position_embeddings;
     float layer_norm_epsilon, dropout_p;
+    RotaryEmbeddingMeta rotary_embedding_meta;
   };
 
   static void create_starcoder_model(FFModel &ff,
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index 49e689e06..cd39f8da0 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -1241,6 +1241,46 @@ def get_weights(self, ffmodel):
         assert ret_val == True
         return np_array
 
+# -----------------------------------------------------------------------
+# Request
+# -----------------------------------------------------------------------
+
+
+class Request:
+    """A class to record the metadata of an inference or finetuning request."""
+
+    def __init__(
+        self,
+        req_type: RequestType,
+        prompt: str = None,
+        max_sequence_length: int = 128,
+        peft_model_id: PEFTModelID = None,
+        dataset_filepath: str = None,
+        max_training_steps: int = 1,
+    ):
+        self.req_type = req_type
+        self.prompt = prompt
+        self.max_sequence_length = max_sequence_length
+        self.peft_model_id = peft_model_id
+        self.dataset_filepath = dataset_filepath
+        self.max_training_steps = max_training_steps
+
+
+# -----------------------------------------------------------------------
+# RotaryEmbeddingMeta
+# -----------------------------------------------------------------------
+
+
+@dataclass
+class RotaryEmbeddingMeta:
+    apply_rotary_embedding: bool = False
+    rope_theta: float = 10000.0
+    rope_type: str = "default"
+    factor: float = 8.0
+    low_freq_factor: float = 1.0
+    high_freq_factor: float = 4.0
+    original_max_position_embeddings: int = 8192
+
 
 # -----------------------------------------------------------------------
 # FFModel
@@ -2676,7 +2716,7 @@ def inc_multihead_self_attention(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -2720,8 +2760,8 @@ def inc_multihead_self_attention(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -2756,7 +2796,13 @@ def inc_multihead_self_attention(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
@@ -2779,7 +2825,7 @@ def spec_inc_multihead_self_attention(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -2824,8 +2870,8 @@ def spec_inc_multihead_self_attention(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -2860,7 +2906,13 @@ def spec_inc_multihead_self_attention(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
@@ -2884,7 +2936,7 @@ def inc_multihead_self_attention_verify(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -2928,8 +2980,8 @@ def inc_multihead_self_attention_verify(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -2964,7 +3016,13 @@ def inc_multihead_self_attention_verify(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
@@ -2988,7 +3046,7 @@ def groupquery_self_attention(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -3036,8 +3094,8 @@ def groupquery_self_attention(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -3073,7 +3131,13 @@ def groupquery_self_attention(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
@@ -3098,7 +3162,7 @@ def spec_inc_multiquery_self_attention(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -3145,8 +3209,8 @@ def spec_inc_multiquery_self_attention(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -3182,7 +3246,13 @@ def spec_inc_multiquery_self_attention(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
@@ -3206,7 +3276,7 @@ def inc_multiquery_self_attention_verify(
         add_zero_attn=False,
         data_type=DataType.DT_NONE,
         kernel_initializer=None,
-        apply_rotary_embedding=False,
+        rotary_embedding_meta=RotaryEmbeddingMeta(),
         scaling_query=False,
         scaling_factor=1.0,
         qk_prod_scaling=True,
@@ -3253,8 +3323,8 @@ def inc_multiquery_self_attention_verify(
         :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied.
         :type kernel_initializer: Initializer
 
-        :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False.
-        :type apply_rotary_embedding: bool
+        :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used.
+        :type rotary_embedding_meta: RotaryEmbeddingMeta
 
         :param scaling_query: Whether to apply scaling query. Default is False.
         :type scaling_query: bool
@@ -3290,7 +3360,13 @@ def inc_multiquery_self_attention_verify(
             add_zero_attn,
             c_data_type,
             kernel_init_handle,
-            apply_rotary_embedding,
+            rotary_embedding_meta.apply_rotary_embedding,
+            rotary_embedding_meta.rope_theta,
+            get_c_name(rotary_embedding_meta.rope_type),
+            rotary_embedding_meta.factor,
+            rotary_embedding_meta.low_freq_factor,
+            rotary_embedding_meta.high_freq_factor,
+            rotary_embedding_meta.original_max_position_embeddings,
             scaling_query,
             scaling_factor,
             qk_prod_scaling,
diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py
index 7d8091726..1b5491f3c 100644
--- a/python/flexflow/serve/models/falcon.py
+++ b/python/flexflow/serve/models/falcon.py
@@ -41,6 +41,17 @@ def __init__(self, hf_config):
         )
         self.parallel_attn = hf_config.parallel_attn
         self.vocab_size = hf_config.vocab_size
+        self.rotary_embedding_meta = RotaryEmbeddingMeta(
+            apply_rotary_embedding=True,
+            rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0,
+        )
+        if "rope_scaling" in hf_config.__dict__:
+            if hf_config.rope_scaling is not None:
+                self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"]
+                self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"]
+                self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"]
+                self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"]
+                self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"]
         # Standardized FlexFlow num heads fields below
         self.num_attention_heads = self.n_head
         self.num_key_value_heads = self.n_head_kv
@@ -54,8 +65,6 @@ def __init__(
         ffconfig,
         hf_config,
         data_type,
-        # max_batch_size=1,
-        # max_seq_length=256,
         max_tokens_per_batch,
         weights_filepath="",
         tokenizer_filepath="",
@@ -63,11 +72,8 @@ def __init__(
         self.mode = mode
         self.generation_config = generation_config
         self.ffconfig = ffconfig
-        # self.max_batch_size = max_batch_size
         self.data_type = data_type
         self.falcon_config = FalconConfig(hf_config)
-        # self.falcon_config.max_seq_length = max_seq_length
-        # self.falcon_config.max_num_tokens = max_tokens_per_batch
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
@@ -152,7 +158,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.falcon_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
@@ -169,7 +175,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.falcon_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
@@ -186,7 +192,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.falcon_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             else:
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index 503a4b40f..c8b5bfb11 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -19,8 +19,6 @@
 
 class LLAMAConfig:
     def __init__(self, hf_config):
-        # self.max_seq_len = 256
-        # self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
         self.max_spec_tree_token_num = 20
@@ -29,6 +27,17 @@ def __init__(self, hf_config):
         self.hidden_size = hf_config.hidden_size
         self.rms_norm_eps = hf_config.rms_norm_eps
         self.intermediate_size = hf_config.intermediate_size
+        self.rotary_embedding_meta = RotaryEmbeddingMeta(
+            apply_rotary_embedding=True,
+            rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0,
+        )
+        if "rope_scaling" in hf_config.__dict__:
+            if hf_config.rope_scaling is not None:
+                self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"]
+                self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"]
+                self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"]
+                self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"]
+                self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"]
         # Standardized FlexFlow num heads fields below
         self.num_attention_heads = hf_config.num_attention_heads
         self.num_key_value_heads = (
@@ -55,11 +64,8 @@ def __init__(
         self.mode = mode
         self.generation_config = generation_config
         self.ffconfig = ffconfig
-        # self.max_batch_size = max_batch_size
         self.data_type = data_type
         self.llama_config = LLAMAConfig(hf_config)
-        # self.llama_config.max_seq_length = max_seq_length
-        # self.llama_config.max_num_tokens = max_tokens_per_batch
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
@@ -144,7 +150,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.llama_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
@@ -163,7 +169,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.llama_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
@@ -182,7 +188,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    True,  # apply_rotary_embedding
+                    self.llama_config.rotary_embedding_meta,
                     name=f"layers_{i}_attention",
                 )
             else:
diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py
index 92867fd49..e7d2c1990 100644
--- a/python/flexflow/serve/models/mpt.py
+++ b/python/flexflow/serve/models/mpt.py
@@ -19,8 +19,6 @@
 
 class MPTConfig:
     def __init__(self, hf_config):
-        # self.max_seq_len = 256
-        # self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
         self.max_spec_tree_token_num = 20
@@ -28,6 +26,7 @@ def __init__(self, hf_config):
         self.n_heads = hf_config.n_heads
         self.n_layers = hf_config.n_layers
         self.vocab_size = hf_config.vocab_size
+        self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False)
         # Standardized FlexFlow num heads fields below
         self.num_attention_heads = hf_config.n_heads
         self.num_key_value_heads = hf_config.n_heads
@@ -50,11 +49,8 @@ def __init__(
         self.mode = mode
         self.generation_config = generation_config
         self.ffconfig = ffconfig
-        # self.max_batch_size = max_batch_size
         self.data_type = data_type
         self.mpt_config = MPTConfig(hf_config)
-        # self.mpt_config.max_seq_length = max_seq_length
-        # self.mpt_config.max_num_tokens = max_tokens_per_batch
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
@@ -142,7 +138,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.mpt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.mpt_config.hidden_size / self.mpt_config.n_heads)
                     ** (-0.5),  # scaling_factor
@@ -163,7 +159,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.mpt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.mpt_config.hidden_size / self.mpt_config.n_heads)
                     ** (-0.5),  # scaling_factor
@@ -184,7 +180,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.mpt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.mpt_config.hidden_size / self.mpt_config.n_heads)
                     ** (-0.5),  # scaling_factor
diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py
index b715f5f35..a121bf399 100644
--- a/python/flexflow/serve/models/opt.py
+++ b/python/flexflow/serve/models/opt.py
@@ -34,6 +34,7 @@ def __init__(self, hf_config):
         self.num_hidden_layers = hf_config.num_hidden_layers
         self.vocab_size = hf_config.vocab_size
         self.word_embed_proj_dim = hf_config.word_embed_proj_dim
+        self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False)
         # Standardized FlexFlow num heads fields below
         self.num_attention_heads = hf_config.num_attention_heads
         self.num_key_value_heads = hf_config.num_attention_heads
@@ -47,8 +48,6 @@ def __init__(
         ffconfig,
         hf_config,
         data_type,
-        # max_batch_size=1,
-        # max_seq_length=256,
         max_tokens_per_batch,
         weights_filepath="",
         tokenizer_filepath="",
@@ -56,11 +55,8 @@ def __init__(
         self.mode = mode
         self.generation_config = generation_config
         self.ffconfig = ffconfig
-        # self.max_batch_size = max_batch_size
         self.data_type = data_type
         self.opt_config = OPTConfig(hf_config)
-        # self.opt_config.max_seq_length = max_seq_length
-        # self.opt_config.max_num_tokens = max_tokens_per_batch
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
@@ -158,7 +154,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.opt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
@@ -178,7 +174,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.opt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
@@ -198,7 +194,7 @@ def build_model(self, max_tokens_per_batch):
                     False,  # add_zero_attn
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
-                    False,  # apply_rotary_embedding
+                    self.opt_config.rotary_embedding_meta,
                     True,  # scaling_query
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py
index dee5a5a2d..9272addb3 100644
--- a/python/flexflow/serve/models/starcoder.py
+++ b/python/flexflow/serve/models/starcoder.py
@@ -19,8 +19,6 @@
 
 class STARCODERConfig:
     def __init__(self, hf_config):
-        # self.max_seq_len = 256
-        # self.max_num_tokens = 64
         self.max_beam_width = 1
         self.max_beam_depth = 8
         self.max_spec_tree_token_num = 20
@@ -32,6 +30,7 @@ def __init__(self, hf_config):
         self.vocab_size = hf_config.vocab_size
         self.intermediate_size = hf_config.n_inner
         self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head
+        self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False)
         # Standardized FlexFlow num heads fields below
         self.num_attention_heads = hf_config.n_head
         self.num_key_value_heads = self.n_head_kv
@@ -45,8 +44,6 @@ def __init__(
         ffconfig,
         hf_config,
         data_type,
-        # max_batch_size=1,
-        # max_seq_length=256,
         max_tokens_per_batch,
         weights_filepath="",
         tokenizer_filepath="",
@@ -54,11 +51,8 @@ def __init__(
         self.mode = mode
         self.generation_config = generation_config
         self.ffconfig = ffconfig
-        # self.max_batch_size = max_batch_size
         self.data_type = data_type
         self.starcoder_config = STARCODERConfig(hf_config)
-        # self.starcoder_config.max_seq_length = max_seq_length
-        # self.starcoder_config.max_num_tokens = max_tokens_per_batch
         self.weights_filepath = weights_filepath
         self.tokenizer_filepath = tokenizer_filepath
         self.maxint = 2**31 - 1
@@ -158,7 +152,7 @@ def build_model(self, max_tokens_per_batch):
                 False,  # add_zero_attn
                 DataType.DT_NONE,  # data_type
                 None,  # kernel initializer
-                False,  # apply_rotary_embedding
+                self.starcoder_config.rotary_embedding_meta,
                 name=f"layers_{i}_attention",
             )
 
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 882749fa8..b81552043 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -1199,6 +1199,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1209,6 +1215,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor = handle->inc_multihead_self_attention(input,
                                                        embed_dim,
                                                        num_heads,
@@ -1220,7 +1233,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
                                                        add_zero_attn,
                                                        data_type,
                                                        kernel_initializer,
-                                                       apply_rotary_embedding,
+                                                       rotary_embedding_meta,
                                                        scaling_query,
                                                        scaling_factor,
                                                        qk_prod_scaling,
@@ -1244,6 +1257,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1254,6 +1273,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor =
       handle->spec_inc_multihead_self_attention(input,
                                                 embed_dim,
@@ -1266,7 +1292,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
                                                 add_zero_attn,
                                                 data_type,
                                                 kernel_initializer,
-                                                apply_rotary_embedding,
+                                                rotary_embedding_meta,
                                                 scaling_query,
                                                 scaling_factor,
                                                 qk_prod_scaling,
@@ -1290,6 +1316,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1299,6 +1331,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor =
       handle->inc_multihead_self_attention_verify(input,
                                                   embed_dim,
@@ -1311,7 +1350,7 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
                                                   add_zero_attn,
                                                   data_type,
                                                   kernel_initializer,
-                                                  apply_rotary_embedding,
+                                                  rotary_embedding_meta,
                                                   scaling_query,
                                                   scaling_factor,
                                                   qk_prod_scaling,
@@ -1335,6 +1374,12 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1345,6 +1390,13 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor = handle->groupquery_self_attention(input,
                                                     embed_dim,
                                                     num_q_heads,
@@ -1357,7 +1409,7 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
                                                     add_zero_attn,
                                                     data_type,
                                                     kernel_initializer,
-                                                    apply_rotary_embedding,
+                                                    rotary_embedding_meta,
                                                     scaling_query,
                                                     scaling_factor,
                                                     qk_prod_scaling,
@@ -1382,6 +1434,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1392,6 +1450,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor =
       handle->spec_inc_multiquery_self_attention(input,
                                                  embed_dim,
@@ -1405,7 +1470,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
                                                  add_zero_attn,
                                                  data_type,
                                                  kernel_initializer,
-                                                 apply_rotary_embedding,
+                                                 rotary_embedding_meta,
                                                  scaling_query,
                                                  scaling_factor,
                                                  qk_prod_scaling,
@@ -1430,6 +1495,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
     enum DataType data_type,
     flexflow_initializer_t kernel_initializer_,
     bool apply_rotary_embedding,
+    float rope_theta,
+    char const *rope_type,
+    float rope_factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -1439,6 +1510,13 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
   Tensor input = FFCObjectWrapper::unwrap(input_);
   Initializer *kernel_initializer =
       FFCObjectWrapper::unwrap(kernel_initializer_);
+  RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding,
+                                            rope_theta,
+                                            rope_type,
+                                            rope_factor,
+                                            low_freq_factor,
+                                            high_freq_factor,
+                                            original_max_position_embeddings);
   Tensor tensor =
       handle->inc_multiquery_self_attention_verify(input,
                                                    embed_dim,
@@ -1452,7 +1530,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
                                                    add_zero_attn,
                                                    data_type,
                                                    kernel_initializer,
-                                                   apply_rotary_embedding,
+                                                   rotary_embedding_meta,
                                                    scaling_query,
                                                    scaling_factor,
                                                    qk_prod_scaling,
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index d55473231..6a98d26f7 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -65,7 +65,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                              bool add_zero_attn,
                                              DataType data_type,
                                              Initializer *kernel_initializer,
-                                             bool apply_rotary_embedding,
+                                             RotaryEmbeddingMeta rotary_embedding_meta,
                                              bool scaling_query,
                                              float scaling_factor,
                                              bool qk_prod_scaling,
@@ -84,7 +84,7 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                    add_zero_attn,
                                    data_type,
                                    kernel_initializer,
-                                   apply_rotary_embedding,
+                                   rotary_embedding_meta,
                                    scaling_query,
                                    scaling_factor,
                                    qk_prod_scaling,
@@ -105,7 +105,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
                                           bool add_zero_attn,
                                           DataType data_type,
                                           Initializer *kernel_initializer,
-                                          bool apply_rotary_embedding,
+                                          RotaryEmbeddingMeta rotary_embedding_meta,,
                                           bool scaling_query,
                                           float scaling_factor,
                                           bool qk_prod_scaling,
@@ -200,7 +200,17 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
   li->add_int_property("final_bias", final_bias);
   li->add_int_property("add_zero_attn", add_zero_attn);
   li->add_float_property("dropout", dropout);
-  li->add_int_property("apply_rotary_embedding", apply_rotary_embedding);
+  li->add_int_property("apply_rotary_embedding",
+                       rotary_embedding_meta.apply_rotary_embedding);
+  li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  li->add_string_property("rope_type", rotary_embedding_meta.rope_type);
+  li->add_float_property("factor", rotary_embedding_meta.factor);
+  li->add_float_property("low_freq_factor",
+                         rotary_embedding_meta.low_freq_factor);
+  li->add_float_property("high_freq_factor",
+                         rotary_embedding_meta.high_freq_factor);
+  li->add_int_property("original_max_position_embeddings",
+                       rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
   li->add_int_property("qk_prod_scaling", qk_prod_scaling);
@@ -238,8 +248,18 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer(
   bool final_bias = (bool)value;
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
-  bool apply_rotary_embedding = (bool)value;
+  rotary_embedding_meta.apply_rotary_embedding = (bool)value;
+  layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  layer->get_string_property("rope_type", rotary_embedding_meta.rope_type);
+  layer->get_float_property("factor", rotary_embedding_meta.factor);
+  layer->get_float_property("low_freq_factor",
+                            rotary_embedding_meta.low_freq_factor);
+  layer->get_float_property("high_freq_factor",
+                            rotary_embedding_meta.high_freq_factor);
+  layer->get_int_property("original_max_position_embeddings", value);
+  rotary_embedding_meta.original_max_position_embeddings = (int)value;
   layer->get_int_property("scaling_query", value);
   bool scaling_query = (bool)value;
   float scaling_factor;
@@ -270,7 +290,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer(
                                        qkv_bias,
                                        final_bias,
                                        add_zero_attn,
-                                       apply_rotary_embedding,
+                                       rotary_embedding_meta,
                                        scaling_query,
                                        scaling_factor,
                                        qk_prod_scaling,
@@ -296,7 +316,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -319,7 +339,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -409,7 +429,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -433,7 +453,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -529,7 +549,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
                                 other.qkv_bias,
                                 other.final_bias,
                                 other.add_zero_attn,
-                                other.apply_rotary_embedding,
+                                other.rotary_embedding_meta,
                                 other.scaling_query,
                                 other.scaling_factor,
                                 other.qk_prod_scaling,
@@ -559,7 +579,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention(
                                 params.qkv_bias,
                                 params.final_bias,
                                 params.add_zero_attn,
-                                params.apply_rotary_embedding,
+                                params.rotary_embedding_meta,
                                 params.scaling_query,
                                 params.scaling_factor,
                                 params.qk_prod_scaling,
@@ -906,7 +926,19 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs,
          lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout &&
          lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias &&
          lhs.add_zero_attn == rhs.add_zero_attn &&
-         lhs.apply_rotary_embedding == rhs.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.apply_rotary_embedding ==
+             rhs.rotary_embedding_meta.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.rope_theta ==
+             rhs.rotary_embedding_meta.rope_theta &&
+         lhs.rotary_embedding_meta.rope_type ==
+             rhs.rotary_embedding_meta.rope_type &&
+         lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor &&
+         lhs.rotary_embedding_meta.low_freq_factor ==
+             rhs.rotary_embedding_meta.low_freq_factor &&
+         lhs.rotary_embedding_meta.high_freq_factor ==
+             rhs.rotary_embedding_meta.high_freq_factor &&
+         lhs.rotary_embedding_meta.original_max_position_embeddings ==
+             rhs.rotary_embedding_meta.original_max_position_embeddings &&
          lhs.scaling_query == rhs.scaling_query &&
          lhs.scaling_factor == rhs.scaling_factor &&
          lhs.qk_prod_scaling == rhs.qk_prod_scaling &&
@@ -925,7 +957,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const {
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
   params.add_zero_attn = this->add_zero_attn;
-  params.apply_rotary_embedding = this->apply_rotary_embedding;
+  params.rotary_embedding_meta = this->rotary_embedding_meta;
   params.scaling_query = this->scaling_query;
   params.scaling_factor = this->scaling_factor;
   params.qk_prod_scaling = this->qk_prod_scaling;
@@ -958,7 +990,14 @@ size_t hash<FlexFlow::IncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.qkv_bias);
   hash_combine(key, params.final_bias);
   hash_combine(key, params.add_zero_attn);
-  hash_combine(key, params.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.rope_theta);
+  hash_combine(key, params.rotary_embedding_meta.rope_type);
+  hash_combine(key, params.rotary_embedding_meta.factor);
+  hash_combine(key, params.rotary_embedding_meta.low_freq_factor);
+  hash_combine(key, params.rotary_embedding_meta.high_freq_factor);
+  hash_combine(key,
+               params.rotary_embedding_meta.original_max_position_embeddings);
   hash_combine(key, params.scaling_query);
   hash_combine(key, params.scaling_factor);
   hash_combine(key, params.qk_prod_scaling);
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index ed2caea7e..5e07fa214 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -21,6 +21,7 @@
 #include "flexflow/utils/hip_helper.h"
 #include <hip/hip_complex.h>
 #include <hip/hip_runtime.h>
+#include <math_constants.h>
 
 namespace FlexFlow {
 
@@ -124,57 +125,17 @@ __global__ void scaling_query_kernel(DT *input_ptr,
   }
 }
 
-template <typename DT>
-__global__ void
-    apply_rotary_embedding_native(DT *input_ptr,
-                                  hipFloatComplex *complex_input,
-                                  /* Reserved: BatchConfig Updated */
-                                  BatchConfig::PerTokenInfo const *tokenInfos,
-                                  int qProjSize,
-                                  int kProjSize,
-                                  int num_q_heads,
-                                  int num_tokens,
-                                  int num_kv_heads,
-                                  int q_block_size,
-                                  int k_block_size,
-                                  int q_array_size) {
-  CUDA_KERNEL_LOOP(
-      i,
-      num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) {
-    // create complex number
-    bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qProjSize : kProjSize;
-    int real_i = q_tensor ? i : i - q_array_size / 2;
-
-    int head_idx = real_i / (num_tokens * proj_size / 2);
-    int idx = real_i % (num_tokens * proj_size / 2);
-    int real_part_index = idx * 2 +
-                          head_idx * (q_tensor ? q_block_size : k_block_size) +
-                          (q_tensor ? 0 : q_array_size);
-
-    int complex_part_index = real_part_index + 1;
-
-    complex_input[i] = {input_ptr[real_part_index],
-                        input_ptr[complex_part_index]};
-
-    int token_idx =
-        (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
-    hipFloatComplex complex_pos = {cos(freq), sin(freq)};
-
-    complex_input[i] = hipCmulf(complex_input[i], complex_pos);
-    input_ptr[real_part_index] = complex_input[i].x;
-    input_ptr[complex_part_index] = complex_input[i].y;
-  }
-}
-
 template <typename DT>
 __global__ void
     apply_rotary_embedding_hf(DT *input_ptr,
                               hipFloatComplex *complex_input,
                               BatchConfig::PerTokenInfo const *tokenInfos,
+                              float rope_theta,
+                              bool llama3_rope,
+                              float factor,
+                              float low_freq_factor,
+                              float high_freq_factor,
+                              int original_max_position_embeddings,
                               int qProjSize,
                               int kProjSize,
                               int num_tokens,
@@ -209,7 +170,29 @@ __global__ void
 
     // float before_real = complex_input[i].x, before_complex =
     int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
+
+    float freq =
+        pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i
+
+    if (llama3_rope) {
+      float pi = CUDART_PI_F;
+      float wavelen = 2 * pi / freq;
+      float low_freq_wavelen =
+          original_max_position_embeddings / low_freq_factor;
+      float high_freq_wavelen =
+          original_max_position_embeddings / high_freq_factor;
+      if (wavelen < high_freq_wavelen) {
+      } else if (wavelen > low_freq_wavelen) {
+        freq = freq / factor;
+      } else {
+        assert(low_freq_wavelen != high_freq_wavelen);
+        float smooth =
+            (original_max_position_embeddings / wavelen - low_freq_factor) /
+            (high_freq_factor - low_freq_factor);
+        freq = ((1 - smooth) * freq / factor + smooth * freq);
+      }
+    }
+
     hipFloatComplex complex_pos = {cos(freq), sin(freq)};
 
     complex_input[i] = hipCmulf(complex_input[i], complex_pos);
@@ -335,22 +318,29 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                        m->scaling_factor,
                        m->local_hidden_size);
   }
-  if (*m->apply_rotary_embedding) {
+  if (m->rotary_embedding_meta->apply_rotary_embedding) {
     /*q&k*/
-    parallelism = num_tokens * m->local_hidden_size;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf<DT>),
-                       GET_BLOCKS(parallelism),
-                       min(CUDA_NUM_THREADS, parallelism),
-                       0,
-                       stream,
-                       output_ptr,
-                       m->complex_input,
-                       m->token_infos,
-                       m->qProjSize,
-                       m->kProjSize,
-                       num_tokens,
-                       q_array_size,
-                       m->local_hidden_size);
+    parallelism = num_tokens * m->hidden_size;
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(apply_rotary_embedding_hf),
+        GET_BLOCKS(parallelism),
+        min(CUDA_NUM_THREADS, parallelism),
+        0,
+        stream,
+        output_ptr,
+        m->complex_input,
+        m->token_infos,
+        m->rotary_embedding_meta->rope_theta,
+        (m->rotary_embedding_meta->rope_type == "llama3"),
+        m->rotary_embedding_meta->factor,
+        m->rotary_embedding_meta->low_freq_factor,
+        m->rotary_embedding_meta->high_freq_factor,
+        m->rotary_embedding_meta->original_max_position_embeddings,
+        m->qProjSize,
+        m->kProjSize,
+        num_tokens,
+        q_array_size,
+        m->hidden_size);
   }
 }
 
@@ -840,7 +830,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                     attn->kProjSize,
                                     attn->vProjSize,
                                     attn->oProjSize,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
@@ -868,7 +858,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     int _kProjSize,
     int _vProjSize,
     int _oProjSize,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _qkv_bias,
     bool _scaling_query,
     bool _qk_prod_scaling,
@@ -929,8 +919,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // has_load_weights = (bool *)calloc(1, sizeof(bool));
   //*has_load_weights = false;
-  apply_rotary_embedding = (bool *)calloc(1, sizeof(bool));
-  *apply_rotary_embedding = _apply_rotary_embedding;
+  rotary_embedding_meta =
+      (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta));
+  *rotary_embedding_meta = _rotary_embedding_meta;
   qkv_bias = (bool *)calloc(1, sizeof(bool));
   *qkv_bias = _qkv_bias;
   scaling_query = (bool *)calloc(1, sizeof(bool));
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 4e4f249ea..7472b61f0 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -23,6 +23,7 @@
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
 #include "flexflow/utils/cuda_helper.h"
+#include <math_constants.h>
 
 namespace FlexFlow {
 
@@ -373,7 +374,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                                     attn->qk_dim,
                                     attn->v_dim,
                                     attn->o_dim,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
@@ -399,7 +400,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     int _qk_dim,
     int _v_dim,
     int _o_dim,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _qkv_bias,
     bool _scaling_query,
     bool _qk_prod_scaling,
@@ -454,8 +455,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
 
   // has_load_weights = (bool *)calloc(1, sizeof(bool));
   //*has_load_weights = false;
-  apply_rotary_embedding = (bool *)calloc(1, sizeof(bool));
-  *apply_rotary_embedding = _apply_rotary_embedding;
+  rotary_embedding_meta =
+      (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta));
+  *rotary_embedding_meta = _rotary_embedding_meta;
   qkv_bias = (bool *)calloc(1, sizeof(bool));
   *qkv_bias = _qkv_bias;
   scaling_query = (bool *)calloc(1, sizeof(bool));
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index cfcf783e9..599fb9b5e 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -64,7 +64,7 @@ Tensor
                                                bool add_zero_attn,
                                                DataType data_type,
                                                Initializer *kernel_initializer,
-                                               bool apply_rotary_embedding,
+                                               RotaryEmbeddingMeta rotary_embedding_meta,
                                                bool scaling_query,
                                                float scaling_factor,
                                                bool qk_prod_scaling,
@@ -83,7 +83,7 @@ Tensor
                                             add_zero_attn,
                                             data_type,
                                             kernel_initializer,
-                                            apply_rotary_embedding,
+                                            rotary_embedding_meta,
                                             scaling_query,
                                             scaling_factor,
                                             qk_prod_scaling,
@@ -105,7 +105,7 @@ Tensor
                                                 bool add_zero_attn,
                                                 DataType data_type,
                                                 Initializer *kernel_initializer,
-                                                bool apply_rotary_embedding,
+                                                RotaryEmbeddingMeta rotary_embedding_meta,
                                                 bool scaling_query,
                                                 float scaling_factor,
                                                 bool qk_prod_scaling,
@@ -188,7 +188,17 @@ Tensor
   li->add_int_property("final_bias", final_bias);
   li->add_int_property("add_zero_attn", add_zero_attn);
   li->add_float_property("dropout", dropout);
-  li->add_int_property("apply_rotary_embedding", apply_rotary_embedding);
+  li->add_int_property("apply_rotary_embedding",
+                       rotary_embedding_meta.apply_rotary_embedding);
+  li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  li->add_string_property("rope_type", rotary_embedding_meta.rope_type);
+  li->add_float_property("factor", rotary_embedding_meta.factor);
+  li->add_float_property("low_freq_factor",
+                         rotary_embedding_meta.low_freq_factor);
+  li->add_float_property("high_freq_factor",
+                         rotary_embedding_meta.high_freq_factor);
+  li->add_int_property("original_max_position_embeddings",
+                       rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
   li->add_int_property("qk_prod_scaling", qk_prod_scaling);
@@ -223,8 +233,18 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   bool final_bias = (bool)value;
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
-  bool apply_rotary_embedding = (bool)value;
+  rotary_embedding_meta.apply_rotary_embedding = (bool)value;
+  layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  layer->get_string_property("rope_type", rotary_embedding_meta.rope_type);
+  layer->get_float_property("factor", rotary_embedding_meta.factor);
+  layer->get_float_property("low_freq_factor",
+                            rotary_embedding_meta.low_freq_factor);
+  layer->get_float_property("high_freq_factor",
+                            rotary_embedding_meta.high_freq_factor);
+  layer->get_int_property("original_max_position_embeddings", value);
+  rotary_embedding_meta.original_max_position_embeddings = (int)value;
   layer->get_int_property("scaling_query", value);
   bool scaling_query = (bool)value;
   float scaling_factor;
@@ -248,7 +268,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
                                            qkv_bias,
                                            final_bias,
                                            add_zero_attn,
-                                           apply_rotary_embedding,
+                                           rotary_embedding_meta,
                                            scaling_query,
                                            scaling_factor,
                                            qk_prod_scaling,
@@ -271,7 +291,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -291,7 +311,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -372,7 +392,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -393,7 +413,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -481,7 +501,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     other.qkv_bias,
                                     other.final_bias,
                                     other.add_zero_attn,
-                                    other.apply_rotary_embedding,
+                                    other.rotary_embedding_meta,
                                     other.scaling_query,
                                     other.scaling_factor,
                                     other.qk_prod_scaling,
@@ -508,7 +528,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     params.qkv_bias,
                                     params.final_bias,
                                     params.add_zero_attn,
-                                    params.apply_rotary_embedding,
+                                    params.rotary_embedding_meta,
                                     params.scaling_query,
                                     params.scaling_factor,
                                     params.qk_prod_scaling,
@@ -832,7 +852,19 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs,
          lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout &&
          lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias &&
          lhs.add_zero_attn == rhs.add_zero_attn &&
-         lhs.apply_rotary_embedding == rhs.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.apply_rotary_embedding ==
+             rhs.rotary_embedding_meta.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.rope_theta ==
+             rhs.rotary_embedding_meta.rope_theta &&
+         lhs.rotary_embedding_meta.rope_type ==
+             rhs.rotary_embedding_meta.rope_type &&
+         lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor &&
+         lhs.rotary_embedding_meta.low_freq_factor ==
+             rhs.rotary_embedding_meta.low_freq_factor &&
+         lhs.rotary_embedding_meta.high_freq_factor ==
+             rhs.rotary_embedding_meta.high_freq_factor &&
+         lhs.rotary_embedding_meta.original_max_position_embeddings ==
+             rhs.rotary_embedding_meta.original_max_position_embeddings &&
          lhs.scaling_query == rhs.scaling_query &&
          lhs.scaling_factor == rhs.scaling_factor &&
          lhs.qk_prod_scaling == rhs.qk_prod_scaling &&
@@ -853,7 +885,7 @@ SpecIncMultiHeadSelfAttentionParams
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
   params.add_zero_attn = this->add_zero_attn;
-  params.apply_rotary_embedding = this->apply_rotary_embedding;
+  params.rotary_embedding_meta = this->rotary_embedding_meta;
   params.scaling_query = this->scaling_query;
   params.scaling_factor = this->scaling_factor;
   params.qk_prod_scaling = this->qk_prod_scaling;
@@ -882,7 +914,14 @@ size_t hash<FlexFlow::SpecIncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.qkv_bias);
   hash_combine(key, params.final_bias);
   hash_combine(key, params.add_zero_attn);
-  hash_combine(key, params.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.rope_theta);
+  hash_combine(key, params.rotary_embedding_meta.rope_type);
+  hash_combine(key, params.rotary_embedding_meta.factor);
+  hash_combine(key, params.rotary_embedding_meta.low_freq_factor);
+  hash_combine(key, params.rotary_embedding_meta.high_freq_factor);
+  hash_combine(key,
+               params.rotary_embedding_meta.original_max_position_embeddings);
   hash_combine(key, params.scaling_query);
   hash_combine(key, params.scaling_factor);
   hash_combine(key, params.qk_prod_scaling);
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index 9cfea2f61..e797d40d3 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -596,7 +596,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                     attn->kProjSize,
                                     attn->vProjSize,
                                     attn->oProjSize,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 41bbabe00..0c37b6f80 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -314,7 +314,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  bool use_bias = *m->qkv_bias || *m->final_bias;
+  // bool use_bias = *m->qkv_bias || *m->final_bias;
 
   cudaEvent_t t_start, t_end;
   if (m->profiling) {
@@ -386,7 +386,7 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                     attn->qk_dim,
                                     attn->v_dim,
                                     attn->o_dim,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 331b2faf6..3bc0c2d82 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -66,7 +66,7 @@ Tensor FFModel::inc_multihead_self_attention_verify(
     bool add_zero_attn,
     DataType data_type,
     Initializer *kernel_initializer,
-    bool apply_rotary_embedding,
+    RotaryEmbeddingMeta rotary_embedding_meta,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -84,7 +84,7 @@ Tensor FFModel::inc_multihead_self_attention_verify(
                                               add_zero_attn,
                                               data_type,
                                               kernel_initializer,
-                                              apply_rotary_embedding,
+                                              rotary_embedding_meta,
                                               scaling_query,
                                               scaling_factor,
                                               qk_prod_scaling,
@@ -105,7 +105,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
     bool add_zero_attn,
     DataType data_type,
     Initializer *kernel_initializer,
-    bool apply_rotary_embedding,
+    RotaryEmbeddingMeta rotary_embedding_meta,
     bool scaling_query,
     float scaling_factor,
     bool qk_prod_scaling,
@@ -197,10 +197,19 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   li->add_int_property("final_bias", final_bias);
   li->add_int_property("add_zero_attn", add_zero_attn);
   li->add_float_property("dropout", dropout);
-  li->add_int_property("apply_rotary_embedding", apply_rotary_embedding);
+  li->add_int_property("apply_rotary_embedding",
+                       rotary_embedding_meta.apply_rotary_embedding);
+  li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  li->add_string_property("rope_type", rotary_embedding_meta.rope_type);
+  li->add_float_property("factor", rotary_embedding_meta.factor);
+  li->add_float_property("low_freq_factor",
+                         rotary_embedding_meta.low_freq_factor);
+  li->add_float_property("high_freq_factor",
+                         rotary_embedding_meta.high_freq_factor);
+  li->add_int_property("original_max_position_embeddings",
+                       rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
   li->add_float_property("scaling_factor", scaling_factor);
-  li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);
   li->add_int_property("offload", offload);
@@ -233,9 +242,18 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
   bool final_bias = (bool)value;
   layer->get_int_property("add_zero_attn", value);
   bool add_zero_attn = (bool)value;
+  RotaryEmbeddingMeta rotary_embedding_meta;
   layer->get_int_property("apply_rotary_embedding", value);
-  bool apply_rotary_embedding = (bool)value;
-  layer->get_int_property("scaling_query", value);
+  rotary_embedding_meta.apply_rotary_embedding = (bool)value;
+  layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta);
+  layer->get_string_property("rope_type", rotary_embedding_meta.rope_type);
+  layer->get_float_property("factor", rotary_embedding_meta.factor);
+  layer->get_float_property("low_freq_factor",
+                            rotary_embedding_meta.low_freq_factor);
+  layer->get_float_property("high_freq_factor",
+                            rotary_embedding_meta.high_freq_factor);
+  layer->get_int_property("original_max_position_embeddings", value);
+  rotary_embedding_meta.original_max_position_embeddings = (int)value;
   bool scaling_query = (bool)value;
   float scaling_factor;
   layer->get_float_property("scaling_factor", scaling_factor);
@@ -261,7 +279,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer(
                                            qkv_bias,
                                            final_bias,
                                            add_zero_attn,
-                                           apply_rotary_embedding,
+                                           rotary_embedding_meta,
                                            scaling_query,
                                            scaling_factor,
                                            qk_prod_scaling,
@@ -286,7 +304,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -308,7 +326,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -398,7 +416,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
     bool _qkv_bias,
     bool _final_bias,
     bool _add_zero_attn,
-    bool _apply_rotary_embedding,
+    RotaryEmbeddingMeta _rotary_embedding_meta,
     bool _scaling_query,
     float _scaling_factor,
     bool _qk_prod_scaling,
@@ -421,7 +439,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
       num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout),
       qkv_bias(_qkv_bias), final_bias(_final_bias),
       add_zero_attn(_add_zero_attn),
-      apply_rotary_embedding(_apply_rotary_embedding),
+      rotary_embedding_meta(_rotary_embedding_meta),
       hidden_size(_input->dims[0].size), qk_dim(_kdim), v_dim(_vdim),
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
@@ -515,7 +533,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
                                     other.qkv_bias,
                                     other.final_bias,
                                     other.add_zero_attn,
-                                    other.apply_rotary_embedding,
+                                    other.rotary_embedding_meta,
                                     other.scaling_query,
                                     other.scaling_factor,
                                     other.qk_prod_scaling,
@@ -544,7 +562,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention(
                                     params.qkv_bias,
                                     params.final_bias,
                                     params.add_zero_attn,
-                                    params.apply_rotary_embedding,
+                                    params.rotary_embedding_meta,
                                     params.scaling_query,
                                     params.scaling_factor,
                                     params.qk_prod_scaling,
@@ -891,7 +909,19 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs,
          lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout &&
          lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias &&
          lhs.add_zero_attn == rhs.add_zero_attn &&
-         lhs.apply_rotary_embedding == rhs.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.apply_rotary_embedding ==
+             rhs.rotary_embedding_meta.apply_rotary_embedding &&
+         lhs.rotary_embedding_meta.rope_theta ==
+             rhs.rotary_embedding_meta.rope_theta &&
+         lhs.rotary_embedding_meta.rope_type ==
+             rhs.rotary_embedding_meta.rope_type &&
+         lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor &&
+         lhs.rotary_embedding_meta.low_freq_factor ==
+             rhs.rotary_embedding_meta.low_freq_factor &&
+         lhs.rotary_embedding_meta.high_freq_factor ==
+             rhs.rotary_embedding_meta.high_freq_factor &&
+         lhs.rotary_embedding_meta.original_max_position_embeddings ==
+             rhs.rotary_embedding_meta.original_max_position_embeddings &&
          lhs.scaling_query == rhs.scaling_query &&
          lhs.scaling_factor == rhs.scaling_factor &&
          lhs.qk_prod_scaling == rhs.qk_prod_scaling &&
@@ -911,7 +941,7 @@ TreeIncMultiHeadSelfAttentionParams
   params.qkv_bias = this->qkv_bias;
   params.final_bias = this->final_bias;
   params.add_zero_attn = this->add_zero_attn;
-  params.apply_rotary_embedding = this->apply_rotary_embedding;
+  params.rotary_embedding_meta = this->rotary_embedding_meta;
   params.scaling_query = this->scaling_query;
   params.scaling_factor = this->scaling_factor;
   params.qk_prod_scaling = this->qk_prod_scaling;
@@ -939,7 +969,14 @@ size_t hash<FlexFlow::TreeIncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.qkv_bias);
   hash_combine(key, params.final_bias);
   hash_combine(key, params.add_zero_attn);
-  hash_combine(key, params.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding);
+  hash_combine(key, params.rotary_embedding_meta.rope_theta);
+  hash_combine(key, params.rotary_embedding_meta.rope_type);
+  hash_combine(key, params.rotary_embedding_meta.factor);
+  hash_combine(key, params.rotary_embedding_meta.low_freq_factor);
+  hash_combine(key, params.rotary_embedding_meta.high_freq_factor);
+  hash_combine(key,
+               params.rotary_embedding_meta.original_max_position_embeddings);
   hash_combine(key, params.scaling_query);
   hash_combine(key, params.scaling_factor);
   hash_combine(key, params.qk_prod_scaling);
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index ee37c425a..f748dafd6 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -607,7 +607,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                                     attn->kProjSize,
                                     attn->vProjSize,
                                     attn->oProjSize,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a2272e5f2..b5815c7b0 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -522,7 +522,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  bool use_bias = *m->qkv_bias || *m->final_bias;
+  // bool use_bias = *m->qkv_bias || *m->final_bias;
 
   //   int device;
   //   checkCUDA(cudaGetDevice(&device));
@@ -600,7 +600,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
                                     attn->qk_dim,
                                     attn->v_dim,
                                     attn->o_dim,
-                                    attn->apply_rotary_embedding,
+                                    attn->rotary_embedding_meta,
                                     attn->qkv_bias,
                                     attn->scaling_query,
                                     attn->qk_prod_scaling,
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 299330c9e..4ef9d620b 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2335,7 +2335,16 @@ GraphOptimalViewSerialized
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);
         sez.serialize(attn->add_zero_attn);
-        sez.serialize(attn->apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.rope_theta);
+        sez.serialize(attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(),
+                      attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.factor);
+        sez.serialize(attn->rotary_embedding_meta.low_freq_factor);
+        sez.serialize(attn->rotary_embedding_meta.high_freq_factor);
+        sez.serialize(
+            attn->rotary_embedding_meta.original_max_position_embeddings);
         sez.serialize(attn->scaling_query);
         sez.serialize(attn->scaling_factor);
         sez.serialize(attn->qk_prod_scaling);
@@ -2363,7 +2372,16 @@ GraphOptimalViewSerialized
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);
         sez.serialize(attn->add_zero_attn);
-        sez.serialize(attn->apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.rope_theta);
+        sez.serialize(attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(),
+                      attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.factor);
+        sez.serialize(attn->rotary_embedding_meta.low_freq_factor);
+        sez.serialize(attn->rotary_embedding_meta.high_freq_factor);
+        sez.serialize(
+            attn->rotary_embedding_meta.original_max_position_embeddings);
         sez.serialize(attn->scaling_query);
         sez.serialize(attn->scaling_factor);
         sez.serialize(attn->qk_prod_scaling);
@@ -2388,7 +2406,16 @@ GraphOptimalViewSerialized
         sez.serialize(attn->qkv_bias);
         sez.serialize(attn->final_bias);
         sez.serialize(attn->add_zero_attn);
-        sez.serialize(attn->apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding);
+        sez.serialize(attn->rotary_embedding_meta.rope_theta);
+        sez.serialize(attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(),
+                      attn->rotary_embedding_meta.rope_type.size());
+        sez.serialize(attn->rotary_embedding_meta.factor);
+        sez.serialize(attn->rotary_embedding_meta.low_freq_factor);
+        sez.serialize(attn->rotary_embedding_meta.high_freq_factor);
+        sez.serialize(
+            attn->rotary_embedding_meta.original_max_position_embeddings);
         sez.serialize(attn->scaling_query);
         sez.serialize(attn->scaling_factor);
         sez.serialize(attn->qk_prod_scaling);
@@ -2808,9 +2835,10 @@ void FFModel::deserialize_graph_optimal_view(
         int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads,
             tensor_parallelism_degree;
         float dropout, scaling_factor;
-        bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
+        bool qkv_bias, final_bias, add_zero_attn,
             scaling_query, qk_prod_scaling, offload, streaming_cache,
             position_bias;
+        RotaryEmbeddingMeta rotary_embedding_meta;
         DataType quantization_type;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);
@@ -2825,7 +2853,17 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(qkv_bias);
         dez.deserialize(final_bias);
         dez.deserialize(add_zero_attn);
-        dez.deserialize(apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.rope_theta);
+        size_t rope_type_len;
+        char rope_type[1024] = {0};
+        dez.deserialize(rope_type_len);
+        dez.deserialize(rope_type, rope_type_len);
+        rotary_embedding_meta.rope_type = std::string(rope_type);
+        dez.deserialize(rotary_embedding_meta.factor);
+        dez.deserialize(rotary_embedding_meta.low_freq_factor);
+        dez.deserialize(rotary_embedding_meta.high_freq_factor);
+        dez.deserialize(rotary_embedding_meta.original_max_position_embeddings);
         dez.deserialize(scaling_query);
         dez.deserialize(scaling_factor);
         dez.deserialize(qk_prod_scaling);
@@ -2850,7 +2888,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.final_bias = final_bias;
         params.add_zero_attn = add_zero_attn;
         params.layer_guid = layer_guid;
-        params.apply_rotary_embedding = apply_rotary_embedding;
+        params.rotary_embedding_meta = rotary_embedding_meta;
         params.scaling_query = scaling_query;
         params.scaling_factor = scaling_factor;
         params.qk_prod_scaling = qk_prod_scaling;
@@ -2870,6 +2908,7 @@ void FFModel::deserialize_graph_optimal_view(
         float dropout, scaling_factor;
         bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
             scaling_query, qk_prod_scaling, position_bias, streaming_cache;
+        RotaryEmbeddingMeta rotary_embedding_meta;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);
         dez.deserialize(transformer_layer_id);
@@ -2883,7 +2922,17 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(qkv_bias);
         dez.deserialize(final_bias);
         dez.deserialize(add_zero_attn);
-        dez.deserialize(apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.rope_theta);
+        size_t rope_type_len;
+        char rope_type[1024] = {0};
+        dez.deserialize(rope_type_len);
+        dez.deserialize(rope_type, rope_type_len);
+        rotary_embedding_meta.rope_type = std::string(rope_type);
+        dez.deserialize(rotary_embedding_meta.factor);
+        dez.deserialize(rotary_embedding_meta.low_freq_factor);
+        dez.deserialize(rotary_embedding_meta.high_freq_factor);
+        dez.deserialize(rotary_embedding_meta.original_max_position_embeddings);
         dez.deserialize(scaling_query);
         dez.deserialize(scaling_factor);
         dez.deserialize(qk_prod_scaling);
@@ -2905,7 +2954,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.final_bias = final_bias;
         params.add_zero_attn = add_zero_attn;
         params.layer_guid = layer_guid;
-        params.apply_rotary_embedding = apply_rotary_embedding;
+        params.rotary_embedding_meta = rotary_embedding_meta;
         params.scaling_query = scaling_query;
         params.scaling_factor = scaling_factor;
         params.qk_prod_scaling = qk_prod_scaling;
@@ -2922,8 +2971,9 @@ void FFModel::deserialize_graph_optimal_view(
         int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads,
             tensor_parallelism_degree;
         float dropout, scaling_factor;
-        bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
-            scaling_query, qk_prod_scaling, offload, position_bias;
+        bool qkv_bias, final_bias, add_zero_attn, scaling_query,
+            qk_prod_scaling, offload, position_bias;
+        RotaryEmbeddingMeta rotary_embedding_meta;
         DataType quantization_type;
         size_t id, transformer_layer_id, deserialized_model_id;
         dez.deserialize(id);
@@ -2938,7 +2988,17 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(qkv_bias);
         dez.deserialize(final_bias);
         dez.deserialize(add_zero_attn);
-        dez.deserialize(apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.apply_rotary_embedding);
+        dez.deserialize(rotary_embedding_meta.rope_theta);
+        size_t rope_type_len;
+        char rope_type[1024] = {0};
+        dez.deserialize(rope_type_len);
+        dez.deserialize(rope_type, rope_type_len);
+        rotary_embedding_meta.rope_type = std::string(rope_type);
+        dez.deserialize(rotary_embedding_meta.factor);
+        dez.deserialize(rotary_embedding_meta.low_freq_factor);
+        dez.deserialize(rotary_embedding_meta.high_freq_factor);
+        dez.deserialize(rotary_embedding_meta.original_max_position_embeddings);
         dez.deserialize(scaling_query);
         dez.deserialize(scaling_factor);
         dez.deserialize(qk_prod_scaling);
@@ -2962,7 +3022,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.final_bias = final_bias;
         params.add_zero_attn = add_zero_attn;
         params.layer_guid = layer_guid;
-        params.apply_rotary_embedding = apply_rotary_embedding;
+        params.rotary_embedding_meta = rotary_embedding_meta;
         params.scaling_query = scaling_query;
         params.scaling_factor = scaling_factor;
         params.qk_prod_scaling = qk_prod_scaling;
diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc
index 8f33f6db8..72e71688c 100644
--- a/src/runtime/layer.cc
+++ b/src/runtime/layer.cc
@@ -87,6 +87,11 @@ void Layer::add_int_vector_property(std::string const &key,
   int_vector_properties[key] = value;
 }
 
+void Layer::add_string_property(std::string const &key,
+                                std::string const &value) {
+  string_properties[key] = value;
+}
+
 void Layer::add_initializer(std::string const &key, Initializer *initializer) {
   initializers[key] = initializer;
 }
@@ -125,6 +130,18 @@ bool Layer::get_int_vector_property(std::string const &key,
   }
 }
 
+bool Layer::get_string_property(std::string const &key,
+                                std::string &value) const {
+  auto const &it = string_properties.find(key);
+  if (it == string_properties.end()) {
+    assert(false);
+    return false;
+  } else {
+    value = it->second;
+    return true;
+  }
+}
+
 bool Layer::get_initializer(std::string const &key,
                             Initializer *&initializer) const {
   auto const &it = initializers.find(key);

From f11bcf01e7d8bc49df311ffc9e4e2442f608e62f Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 18:58:43 +0000
Subject: [PATCH 576/667] rope

---
 .../inc_multihead_self_attention_kernels.cu   | 76 +++++++------------
 1 file changed, 28 insertions(+), 48 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index e65f2c060..fee1fa380 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -25,6 +25,7 @@
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
 #include "flexflow/utils/cuda_helper.h"
+#include <math_constants.h>
 
 namespace FlexFlow {
 
@@ -135,53 +136,6 @@ __global__ void scaling_query_kernel(DT *input_ptr,
   }
 }
 
-template <typename DT>
-__global__ void
-    apply_rotary_embedding_native(DT *input_ptr,
-                                  cuFloatComplex *complex_input,
-                                  BatchConfig::PerTokenInfo const *tokenInfos,
-                                  int qk_dim,
-                                  int num_q_heads,
-                                  int num_tokens,
-                                  int num_kv_heads,
-                                  int q_block_size,
-                                  int k_block_size,
-                                  int q_array_size) {
-  CUDA_KERNEL_LOOP(
-      i, num_tokens * (qk_dim * num_q_heads + qk_dim * num_kv_heads) / 2) {
-    // create complex number
-    bool q_tensor = i < (q_array_size / 2);
-    int proj_size = q_tensor ? qk_dim : qk_dim;
-    int real_i = q_tensor ? i : i - q_array_size / 2;
-
-    int head_idx = real_i / (num_tokens * proj_size / 2);
-    int idx = real_i % (num_tokens * proj_size / 2);
-    int real_part_index = idx * 2 +
-                          head_idx * (q_tensor ? q_block_size : k_block_size) +
-                          (q_tensor ? 0 : q_array_size);
-
-    int complex_part_index = real_part_index + 1;
-
-    complex_input[i] = {input_ptr[real_part_index],
-                        input_ptr[complex_part_index]};
-
-    int token_idx =
-        (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2);
-    size_t pos = tokenInfos[token_idx].abs_depth_in_request;
-
-    // float before_real = complex_input[i].x, before_complex =
-    // complex_input[i].y;
-
-    int pos_i = real_i % (proj_size / 2);
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size));
-    cuFloatComplex complex_pos = {cos(freq), sin(freq)};
-
-    complex_input[i] = cuCmulf(complex_input[i], complex_pos);
-    input_ptr[real_part_index] = complex_input[i].x;
-    input_ptr[complex_part_index] = complex_input[i].y;
-  }
-}
-
 template <typename DT>
 void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
                  BatchConfig const *bc,
@@ -303,6 +257,12 @@ template <typename DT>
 __global__ void apply_pos_encoding_to_tokens_in_batch_kernel(
     DT *input_ptr,
     BatchConfig::PerTokenInfo const *tokenInfos,
+    float rope_theta,
+    bool llama3_rope,
+    float factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     int qk_dim,
     int num_tokens,
     size_t q_array_size,
@@ -333,7 +293,27 @@ __global__ void apply_pos_encoding_to_tokens_in_batch_kernel(
 
     size_t pos = tokenInfos[token_idx].abs_depth_in_request;
 
-    float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size));
+    float freq = pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size));
+
+    if (llama3_rope) {
+      float pi = CUDART_PI_F;
+      float wavelen = 2 * pi / freq;
+      float low_freq_wavelen =
+          original_max_position_embeddings / low_freq_factor;
+      float high_freq_wavelen =
+          original_max_position_embeddings / high_freq_factor;
+      if (wavelen < high_freq_wavelen) {
+      } else if (wavelen > low_freq_wavelen) {
+        freq = freq / factor;
+      } else {
+        assert(low_freq_wavelen != high_freq_wavelen);
+        float smooth =
+            (original_max_position_embeddings / wavelen - low_freq_factor) /
+            (high_freq_factor - low_freq_factor);
+        freq = ((1 - smooth) * freq / factor + smooth * freq);
+      }
+    }
+
     cuFloatComplex complex_pos = {cos(freq), sin(freq)};
 
     cii = cuCmulf(cii, complex_pos);

From 674eed7a178c3b5d44aa6eeff718f5c453fdf768 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 19:19:03 +0000
Subject: [PATCH 577/667] fix

---
 include/flexflow/operator.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 34387b87b..311699d92 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -304,20 +304,8 @@ class Op {
         assert(false && "Tensor data type not supported");
       }
     }
-
-    // only dump the weights in the forward pass, at the first step
-    // note that we do not save the weight gradients, since we only support
-    // finetuning LoRA weights, which are not FF tensors.
-    // Set FF_DEBG_NO_WEIGHTS=1 or to FF_DEBG_NO_WEIGHTS=true to disable saving
-    // weights
-    bool do_not_save_weights =
-        (std::getenv("FF_DEBG_NO_WEIGHTS") &&
-         (std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" ||
-          std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true"));
-    if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) {
-      fs::path dst_filepath_weights =
-          get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
-          layername;
+    // only dump the weights once
+    if (m->decoding_step == 0) {
       for (int i = 0; i < weight_tensors.size(); i++) {
         std::string filename = base_filepath + "_weight_" + std::to_string(i);
         if (weight_tensors[i].data_type == DT_FLOAT) {

From 2dab7cb7776a8508e0b79a8771f743a98460f306 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 19:29:18 +0000
Subject: [PATCH 578/667] fix

---
 src/ops/inc_multihead_self_attention.cc                 | 2 +-
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 9 ++++++++-
 src/ops/spec_inc_multihead_self_attention.cu            | 2 +-
 src/ops/tree_inc_multihead_self_attention.cu            | 2 +-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 6a98d26f7..552548694 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -105,7 +105,7 @@ Tensor FFModel::groupquery_self_attention(const Tensor input,
                                           bool add_zero_attn,
                                           DataType data_type,
                                           Initializer *kernel_initializer,
-                                          RotaryEmbeddingMeta rotary_embedding_meta,,
+                                          RotaryEmbeddingMeta rotary_embedding_meta,
                                           bool scaling_query,
                                           float scaling_factor,
                                           bool qk_prod_scaling,
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index fee1fa380..b2115ab44 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -329,7 +329,7 @@ void apply_pos_encoding_to_tokens_in_batch(
     DT *output_ptr,
     cudaStream_t stream) {
   // apply rotary embedding if needed
-  if (!*m->apply_rotary_embedding) {
+  if (!m->rotary_embedding_meta->apply_rotary_embedding) {
     return;
   }
   int num_tokens = bc->num_active_tokens();
@@ -338,6 +338,7 @@ void apply_pos_encoding_to_tokens_in_batch(
   }
   int parallelism = num_tokens * m->local_hidden_size;
   size_t q_array_size = m->qk_dim * num_tokens * m->num_q_heads;
+  bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3");
   apply_pos_encoding_to_tokens_in_batch_kernel<<<GET_BLOCKS(parallelism),
                                                  min(CUDA_NUM_THREADS,
                                                      parallelism),
@@ -345,6 +346,12 @@ void apply_pos_encoding_to_tokens_in_batch(
                                                  stream>>>(
       output_ptr,
       m->token_infos,
+      m->rotary_embedding_meta->rope_theta,
+      llama3_rope,
+      m->rotary_embedding_meta->factor,
+      m->rotary_embedding_meta->low_freq_factor,
+      m->rotary_embedding_meta->high_freq_factor,
+      m->rotary_embedding_meta->original_max_position_embeddings,
       m->qk_dim,
       num_tokens,
       q_array_size,
diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu
index 0c37b6f80..6d7bf1364 100644
--- a/src/ops/spec_inc_multihead_self_attention.cu
+++ b/src/ops/spec_inc_multihead_self_attention.cu
@@ -314,7 +314,7 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  // bool use_bias = *m->qkv_bias || *m->final_bias;
+  bool use_bias = *m->qkv_bias || *m->final_bias;
 
   cudaEvent_t t_start, t_end;
   if (m->profiling) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index b5815c7b0..7266eb78c 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -522,7 +522,7 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper(
     GenericTensorAccessorR const &bias) {
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  // bool use_bias = *m->qkv_bias || *m->final_bias;
+  bool use_bias = *m->qkv_bias || *m->final_bias;
 
   //   int device;
   //   checkCUDA(cudaGetDevice(&device));

From 92199d03c26437439eae9ecc52677d97ac4368f7 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 19:30:43 +0000
Subject: [PATCH 579/667] linting

---
 include/flexflow/inference.h                  |  3 +-
 include/flexflow/model.h                      | 79 ++++++++++---------
 ...spec_inc_multihead_self_attention_params.h |  3 +-
 inference/incr_decoding/incr_decoding.cc      | 10 ++-
 inference/models/falcon.cc                    | 28 +++----
 inference/models/llama.cc                     | 28 +++----
 inference/models/starcoder.cc                 | 22 +++---
 inference/spec_infer/spec_infer.cc            | 10 ++-
 src/ops/inc_multihead_self_attention.cc       | 76 +++++++++---------
 src/ops/spec_inc_multihead_self_attention.cc  | 78 +++++++++---------
 src/runtime/graph.cc                          |  5 +-
 src/runtime/request_manager.cc                |  3 +-
 12 files changed, 179 insertions(+), 166 deletions(-)

diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 8450f610d..4b1120887 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -54,7 +54,8 @@ struct GenerationRequest {
                     double emission_time_ms_,
                     bool add_special_tokens_ = true)
       : prompt(prompt_), slo_ratio(slo_ratio_),
-        emission_time_ms(emission_time_ms_), add_special_tokens(add_special_tokens_) {}
+        emission_time_ms(emission_time_ms_),
+        add_special_tokens(add_special_tokens_) {}
 };
 
 struct GenerationResult {
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 59477ed00..32177a383 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -765,45 +765,46 @@ class FFModel {
       bool qk_prod_scaling = true,
       bool position_bias = false,
       char const *name = NULL);
-  Tensor groupquery_self_attention(Tensor const input,
-                                   int embed_dim,
-                                   int num_q_heads,
-                                   int num_kv_heads,
-                                   int kdim = 0,
-                                   int vdim = 0,
-                                   float dropout = 0.0f,
-                                   bool bias = false,
-                                   bool add_bias_kv = false,
-                                   bool add_zero_attn = false,
-                                   DataType data_type = DT_NONE,
-                                   Initializer *kernel_initializer = NULL,
-                                   RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
-                                   bool scaling_query = false,
-                                   float scaling_factor = 1.0f,
-                                   bool qk_prod_scaling = true,
-                                   bool position_bias = false,
-                                   bool streaming_cache = false,
-                                   char const *name = NULL);
-  Tensor
-      spec_inc_multiquery_self_attention(Tensor const input,
-                                         int embed_dim,
-                                         int num_q_heads,
-                                         int num_kv_heads,
-                                         int kdim = 0,
-                                         int vdim = 0,
-                                         float dropout = 0.0f,
-                                         bool bias = false,
-                                         bool add_bias_kv = false,
-                                         bool add_zero_attn = false,
-                                         DataType data_type = DT_NONE,
-                                         Initializer *kernel_initializer = NULL,
-                                         RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
-                                         bool scaling_query = false,
-                                         float scaling_factor = 1.0f,
-                                         bool qk_prod_scaling = true,
-                                         bool position_bias = false,
-                                         bool streaming_cache = false,
-                                         char const *name = NULL);
+  Tensor groupquery_self_attention(
+      Tensor const input,
+      int embed_dim,
+      int num_q_heads,
+      int num_kv_heads,
+      int kdim = 0,
+      int vdim = 0,
+      float dropout = 0.0f,
+      bool bias = false,
+      bool add_bias_kv = false,
+      bool add_zero_attn = false,
+      DataType data_type = DT_NONE,
+      Initializer *kernel_initializer = NULL,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
+      bool scaling_query = false,
+      float scaling_factor = 1.0f,
+      bool qk_prod_scaling = true,
+      bool position_bias = false,
+      bool streaming_cache = false,
+      char const *name = NULL);
+  Tensor spec_inc_multiquery_self_attention(
+      Tensor const input,
+      int embed_dim,
+      int num_q_heads,
+      int num_kv_heads,
+      int kdim = 0,
+      int vdim = 0,
+      float dropout = 0.0f,
+      bool bias = false,
+      bool add_bias_kv = false,
+      bool add_zero_attn = false,
+      DataType data_type = DT_NONE,
+      Initializer *kernel_initializer = NULL,
+      RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
+      bool scaling_query = false,
+      float scaling_factor = 1.0f,
+      bool qk_prod_scaling = true,
+      bool position_bias = false,
+      bool streaming_cache = false,
+      char const *name = NULL);
   Tensor inc_multiquery_self_attention_verify(
       Tensor const input,
       int embed_dim,
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
index f79b3c6aa..87f509831 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -11,7 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams {
   LayerID layer_guid;
   int embed_dim, num_q_heads, num_kv_heads, kdim, vdim;
   float dropout, scaling_factor;
-  bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
+  bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
+      position_bias;
   RotaryEmbeddingMeta rotary_embedding_meta;
   bool streaming_cache;
   char name[MAX_OPNAME];
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index f525d2408..80b9f3c6f 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -398,8 +398,11 @@ void FlexFlow::top_level_task(Task const *task,
         assert(false);
       }
       for (size_t i = 1; i < prompt_json.size(); ++i) {
-        requests.push_back(GenerationRequest(
-            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0, add_special_tokens));
+        requests.push_back(
+            GenerationRequest(prompt_json[i]["prompt"].get<std::string>(),
+                              -1.0,
+                              0,
+                              add_special_tokens));
       }
       PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
       // ConstantEmissionMachine emission_machine(-1, slo_ratios);
@@ -414,7 +417,8 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
+        requests.push_back(
+            GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index d6b6e6a14..35e7c7725 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -113,11 +113,11 @@ void FALCON::create_falcon_model(FFModel &ff,
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
             falcon_config.rotary_embedding_meta,
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
-            false,   /*streaming_cache*/
+            false, /*scaling query*/
+            1.0f,  /*scaling factor*/
+            true,  /*qk_prod_scaling*/
+            false, /*position_bias*/
+            false, /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
@@ -139,10 +139,10 @@ void FALCON::create_falcon_model(FFModel &ff,
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
             falcon_config.rotary_embedding_meta,
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
+            false, /*scaling query*/
+            1.0f,  /*scaling factor*/
+            true,  /*qk_prod_scaling*/
+            false, /*position_bias*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
@@ -164,11 +164,11 @@ void FALCON::create_falcon_model(FFModel &ff,
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
             falcon_config.rotary_embedding_meta,
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
-            false,   /*streaming_cache*/
+            false, /*scaling query*/
+            1.0f,  /*scaling factor*/
+            true,  /*qk_prod_scaling*/
+            false, /*position_bias*/
+            false, /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index a9a111a2f..0ec3542fa 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -110,10 +110,10 @@ void LLAMA::create_llama_model(FFModel &ff,
             DT_NONE, /*data_type*/
             NULL,    /*kernel_initializer*/
             llama_config.rotary_embedding_meta,
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
+            false, /*scaling query*/
+            1.0f,  /*scaling factor*/
+            true,  /*qk_prod_scaling*/
+            false, /*position_bias*/
             streaming_cache,
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
@@ -135,10 +135,10 @@ void LLAMA::create_llama_model(FFModel &ff,
             DT_NONE, /*data_type*/
             nullptr, /*kernel_initializer*/
             llama_config.rotary_embedding_meta,
-            false,   /*scaling query*/
-            1.0f,    /*scaling factor*/
-            true,    /*qk_prod_scaling*/
-            false,   /*position_bias*/
+            false, /*scaling query*/
+            1.0f,  /*scaling factor*/
+            true,  /*qk_prod_scaling*/
+            false, /*position_bias*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
@@ -152,12 +152,12 @@ void LLAMA::create_llama_model(FFModel &ff,
             llama_config.num_key_value_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
             llama_config.hidden_size / llama_config.num_attention_heads,
-            0.0f,            /*dropout*/
-            false,           /*qkv_bias*/
-            false,           /*final_bias*/
-            false,           /*add_zero_attn*/
-            DT_NONE,         /*data_type*/
-            nullptr,         /*kernel_initializer*/
+            0.0f,    /*dropout*/
+            false,   /*qkv_bias*/
+            false,   /*final_bias*/
+            false,   /*add_zero_attn*/
+            DT_NONE, /*data_type*/
+            nullptr, /*kernel_initializer*/
             llama_config.rotary_embedding_meta,
             false,           /*scaling query*/
             1.0f,            /*scaling factor*/
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 7a6e679df..555ef7920 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -114,18 +114,18 @@ void STARCODER::create_starcoder_model(
                 startcoder_config.num_attention_heads,
             startcoder_config.hidden_size /
                 startcoder_config.num_attention_heads,
-            startcoder_config.dropout_p, /*dropout*/
-            true,                        /*bias*/
-            false,                       /*add_bias_kv*/
-            false,                       /*add_zero_attn*/
-            DT_NONE,                     /*data_type*/
-            nullptr,                     /*kernel_initializer*/
+            startcoder_config.dropout_p,             /*dropout*/
+            true,                                    /*bias*/
+            false,                                   /*add_bias_kv*/
+            false,                                   /*add_zero_attn*/
+            DT_NONE,                                 /*data_type*/
+            nullptr,                                 /*kernel_initializer*/
             startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/
-            false,                       /*scaling query*/
-            1.0f,                        /*scaling factor*/
-            true,                        /*qk_prod_scaling*/
-            false,                       /*position_bias*/
-            false,                       /*streaming_cache*/
+            false,                                   /*scaling query*/
+            1.0f,                                    /*scaling factor*/
+            true,                                    /*qk_prod_scaling*/
+            false,                                   /*position_bias*/
+            false,                                   /*streaming_cache*/
             std::string("layers_" + std::to_string(i) + "_attention")
                 .c_str() /*name*/
         );
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 4318bab5f..78fa85ab2 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -604,8 +604,11 @@ void FlexFlow::top_level_task(Task const *task,
         assert(false);
       }
       for (size_t i = 1; i < prompt_json.size(); ++i) {
-        requests.push_back(GenerationRequest(
-            prompt_json[i]["prompt"].get<std::string>(), -1.0, 0, add_special_tokens));
+        requests.push_back(
+            GenerationRequest(prompt_json[i]["prompt"].get<std::string>(),
+                              -1.0,
+                              0,
+                              add_special_tokens));
       }
       PoissonEmissionMachine emission_machine(request_per_second, slo_ratios);
       //   ConstantEmissionMachine emission_machine(-1, slo_ratios);
@@ -620,7 +623,8 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
+        requests.push_back(
+            GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
         timestamps.push_back(trace.emission_time_ms);
         ratios.push_back(trace.slo_ratio);
       }
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index 552548694..bfcc7dc4c 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -54,24 +54,25 @@ bool IncMultiHeadSelfAttentionParams::is_valid(
   return is_valid;
 }
 
-Tensor FFModel::inc_multihead_self_attention(const Tensor input,
-                                             int embed_dim,
-                                             int num_heads,
-                                             int kdim,
-                                             int vdim,
-                                             float dropout,
-                                             bool qkv_bias,
-                                             bool final_bias,
-                                             bool add_zero_attn,
-                                             DataType data_type,
-                                             Initializer *kernel_initializer,
-                                             RotaryEmbeddingMeta rotary_embedding_meta,
-                                             bool scaling_query,
-                                             float scaling_factor,
-                                             bool qk_prod_scaling,
-                                             bool position_bias,
-                                             bool streaming_cache,
-                                             char const *name) {
+Tensor FFModel::inc_multihead_self_attention(
+    const Tensor input,
+    int embed_dim,
+    int num_heads,
+    int kdim,
+    int vdim,
+    float dropout,
+    bool qkv_bias,
+    bool final_bias,
+    bool add_zero_attn,
+    DataType data_type,
+    Initializer *kernel_initializer,
+    RotaryEmbeddingMeta rotary_embedding_meta,
+    bool scaling_query,
+    float scaling_factor,
+    bool qk_prod_scaling,
+    bool position_bias,
+    bool streaming_cache,
+    char const *name) {
   return groupquery_self_attention(input,
                                    embed_dim,
                                    num_heads,
@@ -93,25 +94,26 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input,
                                    name);
 }
 
-Tensor FFModel::groupquery_self_attention(const Tensor input,
-                                          int embed_dim,
-                                          int num_q_heads,
-                                          int num_kv_heads,
-                                          int kdim,
-                                          int vdim,
-                                          float dropout,
-                                          bool qkv_bias,
-                                          bool final_bias,
-                                          bool add_zero_attn,
-                                          DataType data_type,
-                                          Initializer *kernel_initializer,
-                                          RotaryEmbeddingMeta rotary_embedding_meta,
-                                          bool scaling_query,
-                                          float scaling_factor,
-                                          bool qk_prod_scaling,
-                                          bool position_bias,
-                                          bool streaming_cache,
-                                          char const *name) {
+Tensor FFModel::groupquery_self_attention(
+    const Tensor input,
+    int embed_dim,
+    int num_q_heads,
+    int num_kv_heads,
+    int kdim,
+    int vdim,
+    float dropout,
+    bool qkv_bias,
+    bool final_bias,
+    bool add_zero_attn,
+    DataType data_type,
+    Initializer *kernel_initializer,
+    RotaryEmbeddingMeta rotary_embedding_meta,
+    bool scaling_query,
+    float scaling_factor,
+    bool qk_prod_scaling,
+    bool position_bias,
+    bool streaming_cache,
+    char const *name) {
   if (data_type == DT_NONE) {
     data_type = input->data_type;
   }
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 599fb9b5e..303fb9aa7 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -52,25 +52,25 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid(
   return is_valid;
 }
 
-Tensor
-    FFModel::spec_inc_multihead_self_attention(Tensor const input,
-                                               int embed_dim,
-                                               int num_heads,
-                                               int kdim,
-                                               int vdim,
-                                               float dropout,
-                                               bool qkv_bias,
-                                               bool final_bias,
-                                               bool add_zero_attn,
-                                               DataType data_type,
-                                               Initializer *kernel_initializer,
-                                               RotaryEmbeddingMeta rotary_embedding_meta,
-                                               bool scaling_query,
-                                               float scaling_factor,
-                                               bool qk_prod_scaling,
-                                               bool position_bias,
-                                               bool streaming_cache,
-                                               char const *name) {
+Tensor FFModel::spec_inc_multihead_self_attention(
+    Tensor const input,
+    int embed_dim,
+    int num_heads,
+    int kdim,
+    int vdim,
+    float dropout,
+    bool qkv_bias,
+    bool final_bias,
+    bool add_zero_attn,
+    DataType data_type,
+    Initializer *kernel_initializer,
+    RotaryEmbeddingMeta rotary_embedding_meta,
+    bool scaling_query,
+    float scaling_factor,
+    bool qk_prod_scaling,
+    bool position_bias,
+    bool streaming_cache,
+    char const *name) {
   return spec_inc_multiquery_self_attention(input,
                                             embed_dim,
                                             num_heads,
@@ -92,26 +92,26 @@ Tensor
                                             name);
 }
 
-Tensor
-    FFModel::spec_inc_multiquery_self_attention(Tensor const input,
-                                                int embed_dim,
-                                                int num_q_heads,
-                                                int num_kv_heads,
-                                                int kdim,
-                                                int vdim,
-                                                float dropout,
-                                                bool qkv_bias,
-                                                bool final_bias,
-                                                bool add_zero_attn,
-                                                DataType data_type,
-                                                Initializer *kernel_initializer,
-                                                RotaryEmbeddingMeta rotary_embedding_meta,
-                                                bool scaling_query,
-                                                float scaling_factor,
-                                                bool qk_prod_scaling,
-                                                bool position_bias,
-                                                bool streaming_cache,
-                                                char const *name) {
+Tensor FFModel::spec_inc_multiquery_self_attention(
+    Tensor const input,
+    int embed_dim,
+    int num_q_heads,
+    int num_kv_heads,
+    int kdim,
+    int vdim,
+    float dropout,
+    bool qkv_bias,
+    bool final_bias,
+    bool add_zero_attn,
+    DataType data_type,
+    Initializer *kernel_initializer,
+    RotaryEmbeddingMeta rotary_embedding_meta,
+    bool scaling_query,
+    float scaling_factor,
+    bool qk_prod_scaling,
+    bool position_bias,
+    bool streaming_cache,
+    char const *name) {
   if (data_type == DT_NONE) {
     data_type = input->data_type;
   }
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 4ef9d620b..326f446aa 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2835,9 +2835,8 @@ void FFModel::deserialize_graph_optimal_view(
         int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads,
             tensor_parallelism_degree;
         float dropout, scaling_factor;
-        bool qkv_bias, final_bias, add_zero_attn,
-            scaling_query, qk_prod_scaling, offload, streaming_cache,
-            position_bias;
+        bool qkv_bias, final_bias, add_zero_attn, scaling_query,
+            qk_prod_scaling, offload, streaming_cache, position_bias;
         RotaryEmbeddingMeta rotary_embedding_meta;
         DataType quantization_type;
         size_t id, transformer_layer_id, deserialized_model_id;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index be81cd7a2..01188f1fd 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -475,7 +475,8 @@ RequestManager::RequestGuid
   request.status = Request::PENDING;
   request.guid = next_available_guid++;
   request.add_special_tokens = req.add_special_tokens;
-  if (bos_token_id >= 0 && request.add_special_tokens && model_type != ModelType::FALCON) {
+  if (bos_token_id >= 0 && request.add_special_tokens &&
+      model_type != ModelType::FALCON) {
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);

From 3f611025280c75485363ca3bff495993c6d48099 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 19:42:38 +0000
Subject: [PATCH 580/667] fix

---
 python/flexflow/core/flexflow_cffi.py | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index cd39f8da0..737874565 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -15,6 +15,7 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+from dataclasses import dataclass
 import warnings
 import numpy as np
 from .flexflow_logger import fflogger
@@ -1241,31 +1242,6 @@ def get_weights(self, ffmodel):
         assert ret_val == True
         return np_array
 
-# -----------------------------------------------------------------------
-# Request
-# -----------------------------------------------------------------------
-
-
-class Request:
-    """A class to record the metadata of an inference or finetuning request."""
-
-    def __init__(
-        self,
-        req_type: RequestType,
-        prompt: str = None,
-        max_sequence_length: int = 128,
-        peft_model_id: PEFTModelID = None,
-        dataset_filepath: str = None,
-        max_training_steps: int = 1,
-    ):
-        self.req_type = req_type
-        self.prompt = prompt
-        self.max_sequence_length = max_sequence_length
-        self.peft_model_id = peft_model_id
-        self.dataset_filepath = dataset_filepath
-        self.max_training_steps = max_training_steps
-
-
 # -----------------------------------------------------------------------
 # RotaryEmbeddingMeta
 # -----------------------------------------------------------------------

From 7f1c4e314f3f8e1702874c67d969df9076dcc6bb Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <gabriele.oliaro@snowflake.com>
Date: Tue, 22 Oct 2024 19:47:45 +0000
Subject: [PATCH 581/667] fix

---
 src/ops/tree_inc_multihead_self_attention.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 3bc0c2d82..0e1c83b6e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -209,6 +209,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify(
   li->add_int_property("original_max_position_embeddings",
                        rotary_embedding_meta.original_max_position_embeddings);
   li->add_int_property("scaling_query", scaling_query);
+  li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_float_property("scaling_factor", scaling_factor);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("quantization_type", quantization_type);

From 69f41f5652efeab30fcd7e931369bc368534fb5a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 22 Oct 2024 17:58:13 -0700
Subject: [PATCH 582/667] feat: set concurrency_barrier for nccl op

---
 include/flexflow/optimizer.h                         | 11 ++++++++---
 .../parallel_ops/kernels/allreduce_kernels.h         |  8 ++++++--
 src/ops/fused.cu                                     |  2 +-
 src/parallel_ops/allreduce.cc                        |  4 ++--
 src/parallel_ops/kernels/allreduce_kernels.cu        | 12 ++++++++++--
 src/runtime/model.cc                                 |  9 +++++++++
 src/runtime/optimizer.cc                             |  4 ++--
 src/runtime/optimizer_kernel.cu                      | 12 ++++++++++--
 8 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h
index bab7e6e4e..4917df73c 100644
--- a/include/flexflow/optimizer.h
+++ b/include/flexflow/optimizer.h
@@ -20,7 +20,8 @@
 #include "legion.h"
 
 namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
 class FFModel;
 class OpMeta;
 
@@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer {
                        std::vector<Legion::PhysicalRegion> const &regions,
                        Legion::Context ctx,
                        Legion::Runtime *runtime);
-  static void nccl_update_task_gpu(SGDOptimizer const *op,
+  static void nccl_update_task_gpu(Context ctx,
+                                   Runtime *runtime,
+                                   SGDOptimizer const *op,
                                    OpMeta const *meta,
                                    float const *w_grad_ptr,
                                    size_t size,
@@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer {
                        std::vector<Legion::PhysicalRegion> const &regions,
                        Legion::Context ctx,
                        Legion::Runtime *runtime);
-  static void nccl_update_task_gpu(AdamOptimizer const *op,
+  static void nccl_update_task_gpu(Context ctx,
+                                   Runtime *runtime,
+                                   AdamOptimizer const *op,
                                    OpMeta const *meta,
                                    float const *w_grad_ptr,
                                    size_t size,
diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
index 676429f8b..b8af8e833 100644
--- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
+++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
@@ -31,12 +31,16 @@ class AllReduceMeta : public OpMeta {
 namespace Kernels {
 namespace AllReduce {
 
-void inference_kernel_wrapper(AllReduceMeta *m,
+void inference_kernel_wrapper(Context ctx,
+                              Runtime *runtime,
+                              AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output);
 
-void forward_kernel_wrapper(AllReduceMeta const *m,
+void forward_kernel_wrapper(Context ctx,
+                            Runtime *runtime,
+                            AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
 
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 9998831a3..186df30c8 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -1114,7 +1114,7 @@ __host__ void
             assert(fused->op_num_outputs[op] == 1);
             AllReduceMeta *m = (AllReduceMeta *)metas->meta[op];
             Kernels::AllReduce::inference_kernel_wrapper(
-                m, bc, my_input_accessor[0], my_output_accessor[0]);
+                ctx, runtime, m, bc, my_input_accessor[0], my_output_accessor[0]);
             break;
           }
           default: {
diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc
index 7ff0bb2b0..c7b7a7433 100644
--- a/src/parallel_ops/allreduce.cc
+++ b/src/parallel_ops/allreduce.cc
@@ -345,7 +345,7 @@ void AllReduce::inference_task(Task const *task,
       m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
 
   assert(input.data_type == output.data_type);
-  inference_kernel_wrapper(m, bc, input, output);
+  inference_kernel_wrapper(ctx, runtime, m, bc, input, output);
 }
 
 /*static*/
@@ -364,7 +364,7 @@ void AllReduce::forward_task(Task const *task,
       m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
 
   assert(input.data_type == output.data_type);
-  forward_kernel_wrapper(m, input, output);
+  forward_kernel_wrapper(ctx, runtime, m, input, output);
 }
 
 void AllReduce::backward_task(Task const *task,
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 60a1afaef..52bd05b06 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -118,7 +118,9 @@ inline bool CanApplyTwoShotAllReduce(int64_t num_elements,
 }
 
 // Customized all-reduce kernel backed by CUDA Peer memory.
-void inference_kernel_wrapper(AllReduceMeta *m,
+void inference_kernel_wrapper(Context ctx,
+                              Runtime *runtime,
+                              AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output) {
@@ -144,6 +146,7 @@ void inference_kernel_wrapper(AllReduceMeta *m,
       !CanApplyCustomAllReduce(num_elements, dtype)) {
     // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
     ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
+    runtime->concurrent_task_barrier(ctx);
     checkNCCL(ncclAllReduce(input.ptr,
                             output.ptr,
                             num_elements,
@@ -151,6 +154,7 @@ void inference_kernel_wrapper(AllReduceMeta *m,
                             ncclSum,
                             ncclComm,
                             stream));
+    runtime->concurrent_task_barrier(ctx);
     return;
   }
 
@@ -189,7 +193,9 @@ void inference_kernel_wrapper(AllReduceMeta *m,
       params, output.ptr, num_elements, dtype, strategy, stream);
 }
 
-void forward_kernel_wrapper(AllReduceMeta const *m,
+void forward_kernel_wrapper(Context ctx,
+                            Runtime *runtime,
+                            AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output) {
   cudaStream_t stream;
@@ -198,6 +204,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m,
   assert(input.domain == output.domain);
 #ifdef FF_USE_NCCL
   ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type);
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(input.ptr,
                           output.ptr,
                           input.domain.get_volume(),
@@ -205,6 +212,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m,
                           ncclSum,
                           m->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
 #else
   assert(false && "Must enable FF_USE_NCCL to use AllReduce operators");
 #endif
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 25296ddcd..ecd92700d 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -6462,6 +6462,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<FusedOp::inference_task>(
           registrar, "FusedOp Inference Task");
@@ -6686,6 +6687,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::inference_task>(
           registrar, "AllReduce Inference Task");
@@ -6703,6 +6705,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     // AllReduce forward and backward must run concurrentluy since they
     // use ncclAllReduce internally
     registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::forward_task>(
           registrar, "AllReduce Forward Task");
@@ -6798,6 +6801,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<SGDOptimizer::nccl_update_task>(
           registrar, "SGD NCCL Update Task");
@@ -6812,6 +6816,8 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(ADAM_UPD_NCCL_TASK_ID, "Adam NCCL Update");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<AdamOptimizer::nccl_update_task>(
           registrar, "Adam NCCL Update Task");
@@ -6949,6 +6955,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
     registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<ncclComm_t, Op::init_nccl_comms_task>(
           registrar, "NCCL Init Communicators Task");
@@ -6965,6 +6972,8 @@ void register_flexflow_internal_tasks(Runtime *runtime,
                                    "NCCL Finish Communicators");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
+    registrar.set_concurrent();
+    registrar.set_concurrent_barrier();
     if (pre_register) {
       Runtime::preregister_task_variant<Op::finish_nccl_comms_task>(
           registrar, "NCCL Finish Communicators Task");
diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc
index c42a0c9aa..06e7089f4 100644
--- a/src/runtime/optimizer.cc
+++ b/src/runtime/optimizer.cc
@@ -311,7 +311,7 @@ void SGDOptimizer::nccl_update_task(Task const *task,
     }
   }
 
-  nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr);
+  nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr);
 }
 #endif
 
@@ -603,7 +603,7 @@ void AdamOptimizer::nccl_update_task(Task const *task,
     }
   }
 
-  nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr);
+  nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr);
 }
 #endif
 
diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu
index df37e3b13..72ee74940 100644
--- a/src/runtime/optimizer_kernel.cu
+++ b/src/runtime/optimizer_kernel.cu
@@ -75,7 +75,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
+__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx,
+                                                 Runtime *runtime,
+                                                 SGDOptimizer const *op,
                                                  OpMeta const *meta,
                                                  float const *w_grad_ptr,
                                                  size_t size,
@@ -85,6 +87,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
   // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(w_grad_ptr,
                           (float *)w_grad_ptr,
                           size,
@@ -92,6 +95,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
                           ncclSum,
                           meta->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
   // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
   // print_tensor<float>((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
 
@@ -183,7 +187,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
+__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx,
+                                                  Runtime *runtime,
+                                                  AdamOptimizer const *op,
                                                   OpMeta const *meta,
                                                   float const *w_grad_ptr,
                                                   size_t size,
@@ -193,6 +199,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
   // Use NCCL to sync gradients
   cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(w_grad_ptr,
                           (float *)w_grad_ptr,
                           size,
@@ -200,6 +207,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
                           ncclSum,
                           meta->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   //         op->alpha, op->alpha_t, op->weight_decay);
   //  Step 2: Adam update

From fe39c5450c90405e949c786ace59293429e291e3 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 22 Oct 2024 18:26:39 -0700
Subject: [PATCH 583/667] chore: unify update_custom_mask calling

---
 include/flexflow/batch_config.h    |  1 +
 include/flexflow/request_manager.h |  1 +
 src/runtime/batch_config.cc        |  4 ++
 src/runtime/request_manager.cc     |  4 ++
 src/runtime/request_manager.cu     | 99 +++++++++++++++---------------
 5 files changed, 61 insertions(+), 48 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 4589f91f3..21fd8b77a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -76,6 +76,7 @@ class BatchConfig {
   static int max_spec_tree_token_num();
   static int max_sequence_length();
   static int max_output_length();
+  static bool streaming_cache();
   static int get_max_tree_depth();
   friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
   void print() const;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index c151cdfbc..961c457c5 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -292,6 +292,7 @@ class RequestManager {
   void set_correction_factor(double correction_factor);
   double get_correction_factor();
   void set_streaming_cache(bool streaming_cache);
+  bool get_streaming_cache();
   bool get_memory_occupancy();
   void set_memory_occupancy(bool memory_occupancy);
   void
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 60665763e..74dca51a9 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -108,6 +108,10 @@ int BatchConfig::max_output_length() {
   return RequestManager::get_request_manager()->get_max_output_length();
 }
 
+bool BatchConfig::streaming_cache() {
+  return RequestManager::get_request_manager()->get_streaming_cache();
+}
+
 int BatchConfig::max_spec_tree_token_num() {
   return RequestManager::get_request_manager()->get_max_spec_tree_token_num();
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 130d10abb..ffb318587 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -303,6 +303,10 @@ void RequestManager::set_streaming_cache(bool streaming_cache_) {
   streaming_cache = streaming_cache_;
 }
 
+bool RequestManager::get_streaming_cache() {
+  return streaming_cache;
+}
+
 bool RequestManager::get_memory_occupancy() {
   return memory_occupancy;
 }
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 733cca745..3efcc95aa 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -18,6 +18,7 @@
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
 #include "flexflow/request_manager.h"
 #include "flexflow/utils/cuda_helper.h"
+#include <cassert>
 
 namespace FlexFlow {
 
@@ -143,6 +144,7 @@ __global__ void
 // Layout of causalMask: [num_requests][tree_size][tree_size]
 // Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed)
 // Note that for spec-decoding, q_length == last_layer_length != tree_size
+// Also we should consider the influence of StreamingCache
 __global__ void
     update_custom_mask_kernel(uint8_t *custom_mask,
                               int32_t const *qk_indptr,
@@ -199,6 +201,41 @@ __global__ void
   custom_mask[qk_indptr[request_idx] + byte_idx] = packed_bits;
 }
 
+void update_custom_mask(BatchConfig const *batch_config,
+                        AttentionMetaData *metadata,
+                        BatchConfig::BitMask *causalMask,
+                        BatchConfig::PerRequestInfo *request_infos,
+                        bool *request_available,
+                        int batch_size,
+                        cudaStream_t stream) {
+  InferenceMode mode = batch_config->get_mode();
+  assert(mode == TREE_SEARCH_MODE || mode == TREE_VERIFY_MODE);
+  int parallelism = 0;
+  for (int req_idx = 0;
+        req_idx < batch_config->max_requests_per_batch();
+        req_idx++) {
+    if (batch_config->request_available[req_idx]) {
+      int q_len =
+          batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+      int kv_len =
+          batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+          batch_config->requestsInfo[req_idx]
+              .first_token_index_in_request;
+      parallelism += (q_len * kv_len + 7) / 8;
+    }
+  }
+  update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
+                              min(CUDA_NUM_THREADS, parallelism),
+                              0,
+                              stream>>>(
+      metadata->custom_mask,
+      metadata->qk_indptr,
+      causalMask,
+      request_infos,
+      request_available,
+      batch_size);
+}
+
 void RequestManager::load_batch_config_task(
     Task const *task,
     std::vector<PhysicalRegion> const &regions,
@@ -454,30 +491,13 @@ void RequestManager::load_batch_config_task(
 
         // Update gpu-side custom mask referring from CaualMask
         if (!batch_config->prompt_phase) {
-          int parallelism = 0;
-          for (int req_idx = 0;
-               req_idx < batch_config->max_requests_per_batch();
-               req_idx++) {
-            if (batch_config->request_available[req_idx]) {
-              int q_len =
-                  batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-              int kv_len =
-                  batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                  batch_config->requestsInfo[req_idx]
-                      .first_token_index_in_request;
-              parallelism += (q_len * kv_len + 7) / 8;
-            }
-          }
-          update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-              handle.tree_search_attention_metadata->custom_mask,
-              handle.tree_search_attention_metadata->qk_indptr,
-              causalMask,
-              request_infos,
-              request_available,
-              batch_size);
+          update_custom_mask(batch_config,
+                             handle.tree_search_attention_metadata,
+                             causalMask,
+                             request_infos,
+                             request_available,
+                             batch_size,
+                             stream);
         }
       }
 
@@ -585,30 +605,13 @@ void RequestManager::load_batch_config_task(
 
         // Update gpu-side custom mask referring from CaualMask
         if (!batch_config->prompt_phase) {
-          int parallelism = 0;
-          for (int req_idx = 0;
-               req_idx < batch_config->max_requests_per_batch();
-               req_idx++) {
-            if (batch_config->request_available[req_idx]) {
-              int q_len =
-                  batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-              int kv_len =
-                  batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                  batch_config->requestsInfo[req_idx]
-                      .first_token_index_in_request;
-              parallelism += (q_len * kv_len + 7) / 8;
-            }
-          }
-          update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-              handle.tree_verify_attention_metadata->custom_mask,
-              handle.tree_verify_attention_metadata->qk_indptr,
-              causalMask,
-              request_infos,
-              request_available,
-              batch_size);
+          update_custom_mask(batch_config,
+                             handle.tree_verify_attention_metadata,
+                             causalMask,
+                             request_infos,
+                             request_available,
+                             batch_size,
+                             stream);
         }
       }
 

From f8823473021bfc4ecdf179914b19b0e3f2d88e8c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 22 Oct 2024 20:34:23 -0700
Subject: [PATCH 584/667] style: format

---
 include/flexflow/batch_config.h |  2 +-
 src/ops/fused.cu                |  8 ++++++--
 src/runtime/optimizer.cc        |  3 ++-
 src/runtime/request_manager.cu  | 24 ++++++++++--------------
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index be9c90e28..21fd8b77a 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -88,7 +88,7 @@ class BatchConfig {
   // These maximum values are used for copying BatchConfig
   // across workers
   inline static int const MAX_NUM_REQUESTS = 64;
-  inline static int const MAX_NUM_TOKENS = 2048;
+  inline static int const MAX_NUM_TOKENS = 1024;
   inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 8;
   inline static int const MAX_TREE_DEPTH = 8;
   inline static int const MAX_TREE_WIDTH = 16;
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index 186df30c8..f9c85c123 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -1113,8 +1113,12 @@ __host__ void
             assert(fused->op_num_inputs[op] == 1);
             assert(fused->op_num_outputs[op] == 1);
             AllReduceMeta *m = (AllReduceMeta *)metas->meta[op];
-            Kernels::AllReduce::inference_kernel_wrapper(
-                ctx, runtime, m, bc, my_input_accessor[0], my_output_accessor[0]);
+            Kernels::AllReduce::inference_kernel_wrapper(ctx,
+                                                         runtime,
+                                                         m,
+                                                         bc,
+                                                         my_input_accessor[0],
+                                                         my_output_accessor[0]);
             break;
           }
           default: {
diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc
index 06e7089f4..96b735803 100644
--- a/src/runtime/optimizer.cc
+++ b/src/runtime/optimizer.cc
@@ -603,7 +603,8 @@ void AdamOptimizer::nccl_update_task(Task const *task,
     }
   }
 
-  nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr);
+  nccl_update_task_gpu(
+      ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr);
 }
 #endif
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 3efcc95aa..efc684b05 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -211,29 +211,25 @@ void update_custom_mask(BatchConfig const *batch_config,
   InferenceMode mode = batch_config->get_mode();
   assert(mode == TREE_SEARCH_MODE || mode == TREE_VERIFY_MODE);
   int parallelism = 0;
-  for (int req_idx = 0;
-        req_idx < batch_config->max_requests_per_batch();
-        req_idx++) {
+  for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch();
+       req_idx++) {
     if (batch_config->request_available[req_idx]) {
-      int q_len =
-          batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+      int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
       int kv_len =
           batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-          batch_config->requestsInfo[req_idx]
-              .first_token_index_in_request;
+          batch_config->requestsInfo[req_idx].first_token_index_in_request;
       parallelism += (q_len * kv_len + 7) / 8;
     }
   }
   update_custom_mask_kernel<<<GET_BLOCKS(parallelism),
                               min(CUDA_NUM_THREADS, parallelism),
                               0,
-                              stream>>>(
-      metadata->custom_mask,
-      metadata->qk_indptr,
-      causalMask,
-      request_infos,
-      request_available,
-      batch_size);
+                              stream>>>(metadata->custom_mask,
+                                        metadata->qk_indptr,
+                                        causalMask,
+                                        request_infos,
+                                        request_available,
+                                        batch_size);
 }
 
 void RequestManager::load_batch_config_task(

From 4593c7999dce4e038c21acc9278323602af69b58 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 23 Oct 2024 12:40:13 -0700
Subject: [PATCH 585/667] fix: StreamingLLM custom_mask

---
 include/flexflow/batch_config.h |  3 +-
 src/runtime/batch_config.cc     | 13 +++++
 src/runtime/request_manager.cu  | 85 +++++++++++++++++++++++++++------
 3 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 21fd8b77a..92d386c84 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -52,12 +52,13 @@ class StreamingCacheInfo {
   void commit_cache(int len);
   void reset_cache();
   int global_2_cache_index(int global_index);
+  int cache_2_global_index(int cache_index);
 
 public:
   int sink_cache_size, window_cache_size;
   // the meta info of the window cache, commit_len helps to determine if we fill
   // up the window.
-  int window_back, commit_len;
+  int window_back, commit_len, total_len;
 };
 
 class BatchConfig {
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 74dca51a9..b5d4c10ce 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -292,6 +292,7 @@ StreamingCacheInfo &
 // commit the verified result from target model;
 // For incremental decoding, we update the cache both in prefill and decoding
 void StreamingCacheInfo::commit_cache(int len) {
+  total_len += len;
   commit_len += len;
   if (commit_len <= sink_cache_size + window_cache_size) {
     window_back = std::max(0, commit_len - sink_cache_size);
@@ -304,6 +305,7 @@ void StreamingCacheInfo::commit_cache(int len) {
 void StreamingCacheInfo::reset_cache() {
   window_back = 0;
   commit_len = 0;
+  total_len = 0;
 }
 
 int StreamingCacheInfo::global_2_cache_index(int global_index) {
@@ -313,4 +315,15 @@ int StreamingCacheInfo::global_2_cache_index(int global_index) {
   return (global_index - sink_cache_size) % window_cache_size + sink_cache_size;
 }
 
+int StreamingCacheInfo::cache_2_global_index(int cache_index) {
+  if (cache_index < sink_cache_size) {
+    return cache_index;
+  }
+  // cache = (global-sink) % window + sink
+  cache_index -= sink_cache_size;
+  int num_window = (total_len - sink_cache_size) / window_cache_size -
+                   (window_back <= cache_index);
+  return sink_cache_size + cache_index + num_window * window_cache_size;
+}
+
 }; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index efc684b05..590ae44a5 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -139,19 +139,29 @@ __global__ void
 #define test_bit_orig(bit_mask, idx, pos)                                      \
   (((bit_mask)[idx].bits[(pos) / 64] & (1ULL << ((pos) % 64))) != 0)
 
-// Passing the CPU-side causalMask, then output the bit-packed custom_mask for
-// attention forward.
-// Layout of causalMask: [num_requests][tree_size][tree_size]
-// Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed)
-// Note that for spec-decoding, q_length == last_layer_length != tree_size
-// Also we should consider the influence of StreamingCache
+// cache = (global-sink) % window + sink
+#define cache_2_global_index(cache_info, cache_index)                          \
+  do {                                                                         \
+    if (cache_index >= (cache_info).sink_cache_size) {                         \
+      cache_index -= (cache_info).sink_cache_size;                             \
+      int num_window =                                                         \
+          ((cache_info).total_len - (cache_info).sink_cache_size) /            \
+              (cache_info).window_cache_size -                                 \
+          ((cache_info).window_back <= cache_index);                           \
+      cache_index += (cache_info).sink_cache_size +                            \
+                     num_window * (cache_info).window_cache_size;              \
+    }                                                                          \
+  } while (0)
+
 __global__ void
     update_custom_mask_kernel(uint8_t *custom_mask,
                               int32_t const *qk_indptr,
                               BatchConfig::BitMask *causalMask,
                               BatchConfig::PerRequestInfo *request_infos,
                               bool *request_available,
-                              uint32_t const num_requests) {
+                              uint32_t const num_requests,
+                              StreamingCacheInfo *streaming_cache_infos,
+                              bool streaming_cache) {
   int byte_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int request_idx = 0;
   while (request_idx < num_requests) {
@@ -181,13 +191,29 @@ __global__ void
             q_start = request_infos[requext_idx_in_batch]
                           .first_token_index_in_request -
                       causal_mask.non_tree_cache_size,
-            non_tree_cache_size = causal_mask.non_tree_cache_size;
+            non_tree_cache_size = causal_mask.non_tree_cache_size,
+            kv_len =
+                streaming_cache
+                    ? request_infos[requext_idx_in_batch].num_tokens_in_batch +
+                          streaming_cache_infos[requext_idx_in_batch].commit_len
+                    : request_infos[requext_idx_in_batch].num_tokens_in_batch +
+                          request_infos[requext_idx_in_batch]
+                              .first_token_index_in_request;
 
   uint8_t packed_bits = 0;
   for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
-    int const bit_offset = byte_idx * 8 + bit_idx,
-              q_idx = bit_offset / (non_tree_cache_size + q_start + q_length),
-              kv_idx = bit_offset % (non_tree_cache_size + q_start + q_length);
+    int const bit_offset = byte_idx * 8 + bit_idx, q_idx = bit_offset / kv_len;
+    int kv_idx = bit_offset % kv_len;
+    if (streaming_cache) { // recover to the original index
+      if (kv_idx < streaming_cache_infos[requext_idx_in_batch].commit_len) {
+        cache_2_global_index(streaming_cache_infos[requext_idx_in_batch],
+                             kv_idx);
+      } else {
+        kv_idx -= streaming_cache_infos[requext_idx_in_batch].commit_len;
+        kv_idx +=
+            request_infos[requext_idx_in_batch].first_token_index_in_request;
+      }
+    }
     if (kv_idx < non_tree_cache_size || q_idx >= q_length) {
       packed_bits |= 1 << bit_idx;
     } else {
@@ -201,23 +227,36 @@ __global__ void
   custom_mask[qk_indptr[request_idx] + byte_idx] = packed_bits;
 }
 
+// Passing the CPU-side causalMask, then output the bit-packed custom_mask for
+// attention forward.
+// Layout of causalMask: [num_requests][tree_size][tree_size]
+// Layout of custom_mask: [num_requests][q_length][kv_length] (bit-packed)
+// Note that for spec-decoding, q_length == last_layer_length != tree_size
+// Also we should consider the influence of StreamingCache
 void update_custom_mask(BatchConfig const *batch_config,
                         AttentionMetaData *metadata,
                         BatchConfig::BitMask *causalMask,
                         BatchConfig::PerRequestInfo *request_infos,
                         bool *request_available,
                         int batch_size,
+                        StreamingCacheInfo *streaming_cache_infos,
                         cudaStream_t stream) {
   InferenceMode mode = batch_config->get_mode();
   assert(mode == TREE_SEARCH_MODE || mode == TREE_VERIFY_MODE);
+  bool streaming_cache =
+      mode == TREE_SEARCH_MODE && batch_config->streaming_cache();
   int parallelism = 0;
   for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch();
        req_idx++) {
     if (batch_config->request_available[req_idx]) {
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
       int kv_len =
-          batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+          streaming_cache
+              ? batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                    batch_config->streamingCacheInfo[req_idx].commit_len
+              : batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+                    batch_config->requestsInfo[req_idx]
+                        .first_token_index_in_request;
       parallelism += (q_len * kv_len + 7) / 8;
     }
   }
@@ -229,7 +268,9 @@ void update_custom_mask(BatchConfig const *batch_config,
                                         causalMask,
                                         request_infos,
                                         request_available,
-                                        batch_size);
+                                        batch_size,
+                                        streaming_cache_infos,
+                                        streaming_cache);
 }
 
 void RequestManager::load_batch_config_task(
@@ -465,6 +506,13 @@ void RequestManager::load_batch_config_task(
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
                 sizeof(BatchConfig::request_available));
+        StreamingCacheInfo *streaming_cache_infos =
+            reinterpret_cast<StreamingCacheInfo *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo) +
+                sizeof(BatchConfig::requestsInfo) +
+                sizeof(BatchConfig::request_available) +
+                sizeof(BatchConfig::causalMask));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +
@@ -493,6 +541,7 @@ void RequestManager::load_batch_config_task(
                              request_infos,
                              request_available,
                              batch_size,
+                             streaming_cache_infos,
                              stream);
         }
       }
@@ -579,6 +628,13 @@ void RequestManager::load_batch_config_task(
                 sizeof(BatchConfig::tokensInfo) +
                 sizeof(BatchConfig::requestsInfo) +
                 sizeof(BatchConfig::request_available));
+        StreamingCacheInfo *streaming_cache_infos =
+            reinterpret_cast<StreamingCacheInfo *>(
+                static_cast<char *>(handle.batch_config_metadata) +
+                sizeof(BatchConfig::tokensInfo) +
+                sizeof(BatchConfig::requestsInfo) +
+                sizeof(BatchConfig::request_available) +
+                sizeof(BatchConfig::causalMask));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +
@@ -607,6 +663,7 @@ void RequestManager::load_batch_config_task(
                              request_infos,
                              request_available,
                              batch_size,
+                             streaming_cache_infos,
                              stream);
         }
       }

From 38c9610fb55860cb82cf89ede2b4be4d8755dadd Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 23 Oct 2024 18:40:35 -0700
Subject: [PATCH 586/667] fix: streamingllm execute correctly!

---
 .../inc_multihead_self_attention_kernels.cu   |  1 +
 src/runtime/request_manager.cc                | 18 ++++++-------
 src/runtime/request_manager.cu                | 25 ++++++-------------
 3 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index b2115ab44..ea65b1fce 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -358,6 +358,7 @@ void apply_pos_encoding_to_tokens_in_batch(
       m->local_hidden_size);
 }
 
+// TODO: upgrade to llama3 rope, same as apply_pos_encoding_to_tokens_in_batch
 __global__ void apply_pos_encoding_to_streaming_proj_kernel(
     half *kv_cache,
     BatchConfig::PerRequestInfo const *requestInfos,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 9c57719b4..7490e4006 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1333,11 +1333,9 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
       new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
       if (streaming_cache) {
         new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-            request.streaming_cache_info.global_2_cache_index(
-                committed_tokens[0].to_index);
+            request.ssm_cache_size;
         new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-            request.streaming_cache_info.global_2_cache_index(
-                committed_tokens[0].to_index);
+            request.ssm_cache_size;
       } else {
         new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
             committed_tokens[0].to_index;
@@ -1354,11 +1352,9 @@ BatchConfig RequestManager::prepare_first_spec_batch_config() {
         new_bc.tokensInfo[new_bc.num_tokens].request_index = request_index;
         if (streaming_cache) {
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
-              request.streaming_cache_info.global_2_cache_index(
-                  committed_tokens[committed_token_index].to_index);
+              request.ssm_cache_size + committed_token_index - 1;
           new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request =
-              request.streaming_cache_info.global_2_cache_index(
-                  committed_tokens[committed_token_index].to_index);
+              request.ssm_cache_size + committed_token_index - 1;
         } else {
           new_bc.tokensInfo[new_bc.num_tokens].abs_index_in_request =
               committed_tokens[committed_token_index].to_index;
@@ -1905,9 +1901,9 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 
   // Maintain other fields of llm_bitmask
   llm_bitmask.non_tree_cache_size = request.causal_mask.non_tree_cache_size;
-  // We don't need to set llm_bitmask.current_layer_size and
-  // llm_bitmask.tree_or_prompt_size here because they are not used in LLM
-  // verification.
+  llm_bitmask.tree_or_prompt_size = request.causal_mask.tree_or_prompt_size;
+  // We don't need to set llm_bitmask.current_layer_size here because they are
+  // not used in LLM verification.
   return llm_bitmask;
 }
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 590ae44a5..68bfc0430 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -192,13 +192,8 @@ __global__ void
                           .first_token_index_in_request -
                       causal_mask.non_tree_cache_size,
             non_tree_cache_size = causal_mask.non_tree_cache_size,
-            kv_len =
-                streaming_cache
-                    ? request_infos[requext_idx_in_batch].num_tokens_in_batch +
-                          streaming_cache_infos[requext_idx_in_batch].commit_len
-                    : request_infos[requext_idx_in_batch].num_tokens_in_batch +
-                          request_infos[requext_idx_in_batch]
-                              .first_token_index_in_request;
+            kv_len = causal_mask.non_tree_cache_size +
+                     causal_mask.tree_or_prompt_size;
 
   uint8_t packed_bits = 0;
   for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
@@ -209,9 +204,8 @@ __global__ void
         cache_2_global_index(streaming_cache_infos[requext_idx_in_batch],
                              kv_idx);
       } else {
-        kv_idx -= streaming_cache_infos[requext_idx_in_batch].commit_len;
-        kv_idx +=
-            request_infos[requext_idx_in_batch].first_token_index_in_request;
+        kv_idx += streaming_cache_infos[requext_idx_in_batch].total_len -
+                  streaming_cache_infos[requext_idx_in_batch].commit_len;
       }
     }
     if (kv_idx < non_tree_cache_size || q_idx >= q_length) {
@@ -250,13 +244,8 @@ void update_custom_mask(BatchConfig const *batch_config,
        req_idx++) {
     if (batch_config->request_available[req_idx]) {
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-      int kv_len =
-          streaming_cache
-              ? batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                    batch_config->streamingCacheInfo[req_idx].commit_len
-              : batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                    batch_config->requestsInfo[req_idx]
-                        .first_token_index_in_request;
+      int kv_len = batch_config->causalMask[req_idx].non_tree_cache_size +
+                   batch_config->causalMask[req_idx].tree_or_prompt_size;
       parallelism += (q_len * kv_len + 7) / 8;
     }
   }
@@ -757,7 +746,7 @@ void RequestManager::load_positions_task(
   int dram_copy[BatchConfig::MAX_NUM_TOKENS];
 
   for (int i = 0; i < batch_config->num_tokens; i++) {
-    dram_copy[i] = batch_config->tokensInfo[i].abs_index_in_request + offset;
+    dram_copy[i] = batch_config->tokensInfo[i].abs_depth_in_request + offset;
   }
 
   cudaStream_t stream;

From e40f47f4f1dcaaa03baf343a26df791fc0ce03f4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 24 Oct 2024 00:21:45 -0700
Subject: [PATCH 587/667] fix: interleaving acc rate

---
 src/runtime/request_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7490e4006..cfe250a81 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2790,8 +2790,11 @@ void RequestManager::add_tokens_to_spec_token_tree(
       continue;
     }
 
-    int result_offset = request.first_token_offset_in_batch *
-                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    // ssm_first_step only decode the last token (the root of the tree)
+    int result_offset =
+        (request.first_token_offset_in_batch +
+         (current_ssm_step == 1 ? (request.num_tokens_in_batch - 1) : 0)) *
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();

From eac11f0cffafde4c923e54bbc1b8cbdadca48545 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 24 Oct 2024 11:29:30 -0700
Subject: [PATCH 588/667] fix: minor

---
 src/runtime/request_manager.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index cfe250a81..0397b96f4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2873,8 +2873,11 @@ void RequestManager::add_tokens_to_spec_token_tree_old_version(
       continue;
     }
 
-    int result_offset = request.first_token_offset_in_batch *
-                        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
+    // ssm_first_step only decode the last token (the root of the tree)
+    int result_offset =
+        (request.first_token_offset_in_batch +
+         (current_ssm_step == 1 ? (request.num_tokens_in_batch - 1) : 0)) *
+        BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES;
     TokenTree &spec_token_tree = request.speculative_token_trees[0];
     std::vector<std::shared_ptr<TokenTreeNode>> &last_layer =
         spec_token_tree.tree_layers.back();

From d09259e10eaac88b329db60fbde23efff9bf6ba9 Mon Sep 17 00:00:00 2001
From: Hongyi Jin <jinhongyi02@gmail.com>
Date: Sat, 26 Oct 2024 01:57:47 +0000
Subject: [PATCH 589/667] fix

---
 docker/flexflow-environment/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile
index fc894bcb8..db7164c84 100644
--- a/docker/flexflow-environment/Dockerfile
+++ b/docker/flexflow-environment/Dockerfile
@@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
         chmod +x ~/${MINICONDA_SCRIPT_NAME} && \
         bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \
         rm ~/${MINICONDA_SCRIPT_NAME} && \
+	    /opt/conda/bin/conda config --set solver classic && \
         /opt/conda/bin/conda upgrade --all && \
         /opt/conda/bin/conda install conda-build conda-verify && \
         /opt/conda/bin/conda clean -ya

From f720144d39a2413442dbf56647f51dc92ee6dff1 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 20 Feb 2024 16:56:28 +0000
Subject: [PATCH 590/667] feat: update weight file naming style

---
 inference/models/falcon.cc                | 14 ++---
 inference/models/llama.cc                 | 70 +++++++++++------------
 inference/models/mpt.cc                   | 20 +++----
 inference/models/opt.cc                   | 18 +++---
 inference/models/starcoder.cc             | 16 +++---
 python/flexflow/serve/__init__.py         |  2 +-
 python/flexflow/serve/models/falcon.py    | 31 ++++++----
 python/flexflow/serve/models/llama.py     | 25 ++++----
 python/flexflow/serve/models/mpt.py       | 38 +++++++-----
 python/flexflow/serve/models/opt.py       | 32 +++++++----
 python/flexflow/serve/models/starcoder.py | 44 +++++++-------
 python/flexflow/serve/serve.py            |  8 ++-
 12 files changed, 175 insertions(+), 143 deletions(-)

diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 35e7c7725..28bd7d574 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -77,7 +77,7 @@ void FALCON::create_falcon_model(FFModel &ff,
           falcon_config.layer_norm_epsilon,
           true,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_input_layernorm")
+          std::string("layers." + std::to_string(i) + ".input_layernorm")
               .c_str());
     } else {
       ff.residual_layer_norm(
@@ -91,7 +91,7 @@ void FALCON::create_falcon_model(FFModel &ff,
           falcon_config.layer_norm_epsilon,
           true,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_input_layernorm")
+          std::string("layers." + std::to_string(i) + ".input_layernorm")
               .c_str());
       token = res_ln_outputs[0];
       att_norm = res_ln_outputs[1];
@@ -118,7 +118,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             true,  /*qk_prod_scaling*/
             false, /*position_bias*/
             false, /*streaming_cache*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attention")
                 .c_str() /*name*/
         );
         break;
@@ -143,7 +143,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             1.0f,  /*scaling factor*/
             true,  /*qk_prod_scaling*/
             false, /*position_bias*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attention")
                 .c_str() /*name*/
         );
         break;
@@ -169,7 +169,7 @@ void FALCON::create_falcon_model(FFModel &ff,
             true,  /*qk_prod_scaling*/
             false, /*position_bias*/
             false, /*streaming_cache*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attention")
                 .c_str() /*name*/
         );
         break;
@@ -190,7 +190,7 @@ void FALCON::create_falcon_model(FFModel &ff,
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h")
+        std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h")
             .c_str());
 
     dense_h_to_4h = ff.gelu(dense_h_to_4h);
@@ -206,7 +206,7 @@ void FALCON::create_falcon_model(FFModel &ff,
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h")
+        std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h")
             .c_str());
   }
   // final normalization and linear
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 0ec3542fa..79ed850f2 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -60,7 +60,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                               use_full_precision ? DT_FLOAT : DT_HALF,
                               NULL,
                               embed_init,
-                              "tok_embeddings");
+                              "embed_tokens");
 
   Tensor w2 = nullptr;
 
@@ -77,7 +77,7 @@ void LLAMA::create_llama_model(FFModel &ff,
           llama_config.rms_norm_eps,
           llama_config.hidden_size,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_attention_norm")
+          std::string("layers." + std::to_string(i) + ".input_layernorm")
               .c_str());
     } else {
       ff.residual_rms_norm(
@@ -87,7 +87,7 @@ void LLAMA::create_llama_model(FFModel &ff,
           llama_config.rms_norm_eps,
           llama_config.hidden_size,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_attention_norm")
+          std::string("layers." + std::to_string(i) + ".input_layernorm")
               .c_str());
       token = token_att_norm[0];
       att_norm = token_att_norm[1];
@@ -115,7 +115,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             true,  /*qk_prod_scaling*/
             false, /*position_bias*/
             streaming_cache,
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -139,7 +139,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             1.0f,  /*scaling factor*/
             true,  /*qk_prod_scaling*/
             false, /*position_bias*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -164,7 +164,7 @@ void LLAMA::create_llama_model(FFModel &ff,
             true,            /*qk_prod_scaling*/
             false,           /*position_bias*/
             streaming_cache, /*streaming_cache*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -183,37 +183,36 @@ void LLAMA::create_llama_model(FFModel &ff,
         llama_config.rms_norm_eps,
         llama_config.hidden_size,
         DT_NONE,
-        std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str());
+        std::string("layers." + std::to_string(i) + ".post_attention_layernorm")
+            .c_str());
     token = token_ff_norm[0];
     Tensor ff_norm = token_ff_norm[1];
 
-    Tensor w1 =
-        ff.dense(ff_norm,
-                 llama_config.intermediate_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
-                 std::string("layers_" + std::to_string(i) + "_feed_forward_w1")
-                     .c_str());
+    Tensor w1 = ff.dense(
+        ff_norm,
+        llama_config.intermediate_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str());
 
-    Tensor w3 =
-        ff.dense(ff_norm,
-                 llama_config.intermediate_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
-                 std::string("layers_" + std::to_string(i) + "_feed_forward_w3")
-                     .c_str());
+    Tensor w3 = ff.dense(
+        ff_norm,
+        llama_config.intermediate_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str());
 
     Tensor multi =
         ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size);
@@ -229,8 +228,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                  nullptr,
                  REG_MODE_NONE,
                  0.0f,
-                 std::string("layers_" + std::to_string(i) + "_feed_forward_w2")
-                     .c_str());
+                 std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str());
   }
   // final normalization and linear
   Tensor final_rms_norm_output[2] = {nullptr, nullptr};
@@ -252,7 +250,7 @@ void LLAMA::create_llama_model(FFModel &ff,
                           nullptr,
                           REG_MODE_NONE,
                           0.0f,
-                          "output");
+                          "lm_head");
 
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index fd49f2b84..a7bf79f6b 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -59,7 +59,7 @@ void MPT::create_mpt_model(FFModel &ff,
                                       use_full_precision ? DT_FLOAT : DT_HALF,
                                       NULL,
                                       embed_init,
-                                      "transformer_wte");
+                                      "wte");
 
   Tensor intermediate_output = nullptr, layernorm_output = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
@@ -75,7 +75,7 @@ void MPT::create_mpt_model(FFModel &ff,
           1e-05,
           false,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_norm_1").c_str());
+          std::string("layers." + std::to_string(i) + ".norm_1").c_str());
     } else {
       ff.residual_layer_norm(
           intermediate_output,
@@ -88,7 +88,7 @@ void MPT::create_mpt_model(FFModel &ff,
           1e-05,
           false,
           DT_NONE,
-          std::string("layers_" + std::to_string(i) + "_norm_1").c_str());
+          std::string("layers." + std::to_string(i) + ".norm_1").c_str());
       hidden_states = res_ln_outputs[0];
       layernorm_output = res_ln_outputs[1];
     }
@@ -114,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff,
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
             /*qk_prod_scaling*/ false,
             /*position_bias*/ true,
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".attn")
                 .c_str() /*name*/
         );
         break;
@@ -138,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff,
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
             /*qk_prod_scaling*/ false,
             /*position_bias*/ true,
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".attn")
                 .c_str() /*name*/
         );
         break;
@@ -162,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff,
             pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5),
             /*qk_prod_scaling*/ false,
             /*position_bias*/ true,
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".attn")
                 .c_str() /*name*/
         );
         break;
@@ -183,7 +183,7 @@ void MPT::create_mpt_model(FFModel &ff,
         1e-05,
         false,
         DT_NONE,
-        std::string("layers_" + std::to_string(i) + "_norm_2").c_str());
+        std::string("layers." + std::to_string(i) + ".norm_2").c_str());
     hidden_states = res_ln_outputs[0];
     layernorm_output = res_ln_outputs[1];
 
@@ -199,7 +199,7 @@ void MPT::create_mpt_model(FFModel &ff,
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str());
+        std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str());
     layernorm_output = ff.gelu(layernorm_output);
     intermediate_output = ff.dense(
         layernorm_output,
@@ -212,7 +212,7 @@ void MPT::create_mpt_model(FFModel &ff,
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str());
+        std::string("layers." + std::to_string(i) + ".ffn.down_proj").c_str());
   }
 
   // final
@@ -226,7 +226,7 @@ void MPT::create_mpt_model(FFModel &ff,
                          1e-05,
                          false,
                          DT_NONE,
-                         "transformer_norm_f");
+                         "norm_f");
   Tensor all_final_norm = res_ln_outputs[1];
 
   Tensor lm_head = ff.dense(all_final_norm,
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 4b7476ce3..25f9833a1 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -96,7 +96,7 @@ void OPT::create_opt_model(FFModel &ff,
         1e-05,
         true,
         DT_NONE,
-        std::string("layers_" + std::to_string(i) + "_attention_layer_norm")
+        std::string("layers." + std::to_string(i) + ".self_attn_layer_norm")
             .c_str());
     Tensor residual = res_ln_outputs[0];
     Tensor hidden_states = res_ln_outputs[1];
@@ -122,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff,
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
             false,     /*position_bias*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -146,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff,
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
             false,     /*position_bias*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -170,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff,
                 -0.5), /*scaling factor*/
             false,     /*qk_prod_scaling*/
             false,     /*position_bias*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".self_attn")
                 .c_str() /*name*/
         );
         break;
@@ -188,8 +188,8 @@ void OPT::create_opt_model(FFModel &ff,
                                     1e-05,
                                     true,
                                     DT_NONE,
-                                    std::string("layers_" + std::to_string(i) +
-                                                "_add_bias_residual_layer_norm")
+                                    std::string("layers." + std::to_string(i) +
+                                                ".add_bias_residual_layer_norm")
                                         .c_str());
     added = res_ln_outputs[0];
     Tensor final_norm = res_ln_outputs[1];
@@ -206,7 +206,7 @@ void OPT::create_opt_model(FFModel &ff,
                  nullptr,
                  REG_MODE_NONE,
                  0.0f,
-                 std::string("layers_" + std::to_string(i) + "_fc1").c_str());
+                 std::string("layers." + std::to_string(i) + ".fc1").c_str());
     fc2 = ff.dense(fc1,
                    opt_config.hidden_size,
                    AC_MODE_NONE,
@@ -217,7 +217,7 @@ void OPT::create_opt_model(FFModel &ff,
                    nullptr,
                    REG_MODE_NONE,
                    0.0f,
-                   std::string("layers_" + std::to_string(i) + "_fc2").c_str());
+                   std::string("layers." + std::to_string(i) + ".fc2").c_str());
   }
 
   // final
@@ -244,7 +244,7 @@ void OPT::create_opt_model(FFModel &ff,
                             nullptr,
                             REG_MODE_NONE,
                             0.0f,
-                            "embed_tokens_weight_lm_head");
+                            "lm_head");
 
   Tensor output;
   if (mode == TREE_SEARCH_MODE) {
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 555ef7920..31505b0ba 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -67,7 +67,7 @@ void STARCODER::create_starcoder_model(
                               use_full_precision ? DT_FLOAT : DT_HALF,
                               NULL,
                               embed_init,
-                              "transformer_wte");
+                              "wte");
 
   Tensor positional_embedding =
       ff.embedding(position_input,
@@ -77,7 +77,7 @@ void STARCODER::create_starcoder_model(
                    use_full_precision ? DT_FLOAT : DT_HALF,
                    NULL,
                    embed_init,
-                   "transformer_wpe");
+                   "wpe");
 
   Tensor residual = nullptr, c_proj = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
@@ -98,7 +98,7 @@ void STARCODER::create_starcoder_model(
         startcoder_config.layer_norm_epsilon,
         true,
         DT_NONE,
-        std::string("layers_" + std::to_string(i) + "_ln_1").c_str());
+        std::string("layers." + std::to_string(i) + ".ln_1").c_str());
     Tensor hidden_states = res_ln_outputs[0];
     Tensor ln_1 = res_ln_outputs[1];
 
@@ -126,7 +126,7 @@ void STARCODER::create_starcoder_model(
             true,                                    /*qk_prod_scaling*/
             false,                                   /*position_bias*/
             false,                                   /*streaming_cache*/
-            std::string("layers_" + std::to_string(i) + "_attention")
+            std::string("layers." + std::to_string(i) + ".attn.c_attn")
                 .c_str() /*name*/
         );
         break;
@@ -147,7 +147,7 @@ void STARCODER::create_starcoder_model(
         startcoder_config.layer_norm_epsilon,
         true,
         DT_NONE,
-        std::string("layers_" + std::to_string(i) + "_ln_2").c_str());
+        std::string("layers." + std::to_string(i) + ".ln_2").c_str());
     residual = res_ln_outputs[0];
     Tensor l2_norm = res_ln_outputs[1];
 
@@ -163,7 +163,7 @@ void STARCODER::create_starcoder_model(
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str());
+        std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str());
 
     c_fc = ff.gelu(c_fc);
 
@@ -178,7 +178,7 @@ void STARCODER::create_starcoder_model(
         nullptr,
         REG_MODE_NONE,
         0.0f,
-        std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str());
+        std::string("layers." + std::to_string(i) + ".mlp.c_proj").c_str());
   }
   // final normalization and linear
   ff.residual_layer_norm(residual,
@@ -191,7 +191,7 @@ void STARCODER::create_starcoder_model(
                          startcoder_config.layer_norm_epsilon,
                          true,
                          DT_NONE,
-                         "transformer_ln_f");
+                         "ln_f");
   Tensor ln_f = res_ln_outputs[1];
 
   Tensor lm_head = ff.dense(ln_f,
diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py
index 5af077273..df630462a 100644
--- a/python/flexflow/serve/__init__.py
+++ b/python/flexflow/serve/__init__.py
@@ -200,7 +200,7 @@ def init(
     if configs_dict.get("offload", None) is None:
         configs_dict["offload"] = False
     if configs_dict.get("offload_reserve_space_size", None) is None:
-        configs_dict["offload_reserve_space_size"] = 1024**2
+        configs_dict["offload_reserve_space_size"] = 8 * 1024**3
     if configs_dict.get("use_4bit_quantization", None) is None:
         configs_dict["use_4bit_quantization"] = False
     if configs_dict.get("use_8bit_quantization", None) is None:
diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py
index 1b5491f3c..ab3bc4623 100644
--- a/python/flexflow/serve/models/falcon.py
+++ b/python/flexflow/serve/models/falcon.py
@@ -130,7 +130,7 @@ def build_model(self, max_tokens_per_batch):
                     axes,
                     True,
                     self.falcon_config.layer_norm_epsilon,
-                    name=f"layers_{i}_input_layernorm",
+                    name=f"layers.{i}.input_layernorm",
                 )
             else:
                 token, att_norm = ffmodel.residual_layer_norm(
@@ -141,7 +141,7 @@ def build_model(self, max_tokens_per_batch):
                     axes,
                     True,
                     self.falcon_config.layer_norm_epsilon,
-                    name=f"layers_{i}_input_layernorm",
+                    name=f"layers.{i}.input_layernorm",
                 )
 
             if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -159,7 +159,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.falcon_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attention",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
                 mha = ffmodel.inc_multiquery_self_attention_verify(
@@ -176,7 +176,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.falcon_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attention",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
                 mha = ffmodel.groupquery_self_attention(
@@ -193,7 +193,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.falcon_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attention",
                 )
             else:
                 assert False
@@ -203,7 +203,7 @@ def build_model(self, max_tokens_per_batch):
                 self.falcon_config.hidden_size * 4,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_mlp_dense_h_to_4h",
+                name=f"layers.{i}.mlp.dense_h_to_4h",
             )
             dense_h_to_4h = ffmodel.gelu(dense_h_to_4h)
             mlp_output = ffmodel.dense(
@@ -211,7 +211,7 @@ def build_model(self, max_tokens_per_batch):
                 self.falcon_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_mlp_dense_4h_to_h",
+                name=f"layers.{i}.mlp.dense_4h_to_h",
             )
 
         _, ln_f = ffmodel.residual_layer_norm(
@@ -249,6 +249,13 @@ def build_model(self, max_tokens_per_batch):
 
         self.ffmodel = ffmodel
 
+    # TODO: finish this
+    def convert_hf_weight_name(name):
+        return (name.replace("transformer.h.", "layers.")
+            .replace("transformer.", "")
+            .replace("self_attention.dense", "self_attention.o_proj")
+        )
+
     def convert_hf_model(model, dst_folder):
         os.makedirs(dst_folder, exist_ok=True)
         n_head = (
@@ -264,10 +271,10 @@ def convert_hf_model(model, dst_folder):
                 .replace("self_attention_dense", "attention_wo")
             )
             # Split Q,K,V attention weights
-            if "self_attention_query_key_value" in name:
-                name_q = name.replace("self_attention_query_key_value", "attention_wq")
-                name_k = name.replace("self_attention_query_key_value", "attention_wk")
-                name_v = name.replace("self_attention_query_key_value", "attention_wv")
+            if "self_attention.query_key_value" in name:
+                name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj")
+                name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj")
+                name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj")
                 q, k, v = torch.split(
                     params,
                     [
@@ -284,5 +291,5 @@ def convert_hf_model(model, dst_folder):
                 params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name))
         # LM head weight
         model.lm_head.weight.detach().cpu().numpy().tofile(
-            os.path.join(dst_folder, "lm_head_weight")
+            os.path.join(dst_folder, "lm_head.weight")
         )
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index c8b5bfb11..0e59bb547 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -112,7 +112,7 @@ def build_model(self, max_tokens_per_batch):
             self.data_type,
             None,
             embed_init,
-            name="tok_embeddings",
+            name="embed_tokens",
         )
 
         for i in range(self.llama_config.num_hidden_layers):
@@ -123,7 +123,7 @@ def build_model(self, max_tokens_per_batch):
                     token,
                     self.llama_config.rms_norm_eps,
                     self.llama_config.hidden_size,
-                    name=f"layers_{i}_attention_norm",
+                    name=f"layers.{i}.input_layernorm",
                 )
             else:
                 token, attn_norm = ffmodel.residual_rms_norm(
@@ -131,7 +131,7 @@ def build_model(self, max_tokens_per_batch):
                     w2,
                     self.llama_config.rms_norm_eps,
                     self.llama_config.hidden_size,
-                    name=f"layers_{i}_attention_norm",
+                    name=f"layers.{i}.input_layernorm",
                 )
 
             if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -151,7 +151,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.llama_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
                 mha = ffmodel.inc_multiquery_self_attention_verify(
@@ -170,7 +170,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.llama_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
                 mha = ffmodel.groupquery_self_attention(
@@ -189,7 +189,7 @@ def build_model(self, max_tokens_per_batch):
                     DataType.DT_NONE,  # data_type
                     None,  # kernel initializer
                     self.llama_config.rotary_embedding_meta,
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             else:
                 assert False
@@ -199,21 +199,21 @@ def build_model(self, max_tokens_per_batch):
                 mha,
                 self.llama_config.rms_norm_eps,
                 self.llama_config.hidden_size,
-                name=f"layers_{i}_ffn_norm",
+                name=f"layers.{i}.post_attention_layernorm",
             )
             w1 = ffmodel.dense(
                 ff_norm,
                 self.llama_config.intermediate_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_feed_forward_w1",
+                name=f"layers.{i}.mlp.gate_proj",
             )
             w3 = ffmodel.dense(
                 ff_norm,
                 self.llama_config.intermediate_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_feed_forward_w3",
+                name=f"layers.{i}.mlp.up_proj",
             )
             multi = ffmodel.sigmoid_silu_multi(w1, w3, self.llama_config.intermediate_size)
             w2 = ffmodel.dense(
@@ -221,7 +221,7 @@ def build_model(self, max_tokens_per_batch):
                 self.llama_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_feed_forward_w2",
+                name=f"layers.{i}.mlp.down_proj",
             )
 
         _, token = ffmodel.residual_rms_norm(
@@ -236,7 +236,7 @@ def build_model(self, max_tokens_per_batch):
             self.llama_config.vocab_size,
             ActiMode.AC_MODE_NONE,
             False,
-            name="output",
+            name="lm_head",
         )
 
         if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -256,6 +256,9 @@ def build_model(self, max_tokens_per_batch):
 
         self.ffmodel = ffmodel
 
+    def convert_hf_weight_name(name):
+        return name.replace("model.", "")
+
     def convert_hf_model(model, dst_folder):
         os.makedirs(dst_folder, exist_ok=True)
         for name, params in model.named_parameters():
diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py
index e7d2c1990..a68bbd2a0 100644
--- a/python/flexflow/serve/models/mpt.py
+++ b/python/flexflow/serve/models/mpt.py
@@ -93,7 +93,7 @@ def build_model(self, max_tokens_per_batch):
             self.data_type,
             None,
             embed_init,
-            name="transformer_wte",
+            name="wte",
         )
 
         axes = [
@@ -110,7 +110,7 @@ def build_model(self, max_tokens_per_batch):
                     True,
                     1e-05,
                     False,
-                    name=f"layers_{i}_norm_1",
+                    name=f"layers.{i}.norm_1",
                 )
             else:
                 hidden_states, layernorm_output = ffmodel.residual_layer_norm(
@@ -122,7 +122,7 @@ def build_model(self, max_tokens_per_batch):
                     True,
                     1e-05,
                     False,
-                    name=f"layers_{i}_norm_1",
+                    name=f"layers.{i}.norm_1",
                 )
 
             if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -144,7 +144,7 @@ def build_model(self, max_tokens_per_batch):
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
                     True,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.attn",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
                 attn_outputs = ffmodel.inc_multihead_self_attention_verify(
@@ -165,7 +165,7 @@ def build_model(self, max_tokens_per_batch):
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
                     True,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.attn",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
                 attn_outputs = ffmodel.inc_multihead_self_attention(
@@ -186,7 +186,7 @@ def build_model(self, max_tokens_per_batch):
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
                     True,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.attn",
                 )
             else:
                 assert False
@@ -200,7 +200,7 @@ def build_model(self, max_tokens_per_batch):
                 True,
                 1e-05,
                 False,
-                name=f"layers_{i}_norm_2",
+                name=f"layers.{i}.norm_2",
             )
             # mlp
             layernorm_output = ffmodel.dense(
@@ -208,7 +208,7 @@ def build_model(self, max_tokens_per_batch):
                 4 * self.mpt_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_ffn_up_proj",
+                name=f"layers.{i}.ffn.up_proj",
             )
             layernorm_output = ffmodel.gelu(layernorm_output)
             intermediate_output = ffmodel.dense(
@@ -216,7 +216,7 @@ def build_model(self, max_tokens_per_batch):
                 self.mpt_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 False,
-                name=f"layers_{i}_ffn_down_proj",
+                name=f"layers.{i}.ffn.down_proj",
             )
 
         _, all_final_norm = ffmodel.residual_layer_norm(
@@ -228,7 +228,7 @@ def build_model(self, max_tokens_per_batch):
             True,
             1e-05,
             False,
-            name=f"transformer_norm_f",
+            name=f"norm_f",
         )
         lm_head = ffmodel.dense(
             all_final_norm,
@@ -249,14 +249,22 @@ def build_model(self, max_tokens_per_batch):
 
         self.ffmodel = ffmodel
 
+    # TODO: finish this
+    def convert_hf_weight_name(name):
+        return (
+            name.replace("transformer.blocks.", "layers.")
+            .replace("transformer.", "")
+            .replace("attn.out_proj", "attn.o_proj")
+        )
+
     def convert_hf_model(model, dst_folder):
         os.makedirs(dst_folder, exist_ok=True)
         for name, params in model.named_parameters():
             name = name.replace("transformer.blocks.", "layers.").replace(".", "_")
             if "Wqkv" in name:
-                name_q = name.replace("attn_Wqkv", "attention_wq")
-                name_k = name.replace("attn_Wqkv", "attention_wk")
-                name_v = name.replace("attn_Wqkv", "attention_wv")
+                name_q = name.replace("attn.Wqkv", "attn.q_proj")
+                name_k = name.replace("attn.Wqkv", "attn.k_proj")
+                name_v = name.replace("attn.Wqkv", "attn.v_proj")
                 q, k, v = torch.split(
                     params,
                     [
@@ -276,6 +284,6 @@ def convert_hf_model(model, dst_folder):
                 params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name))
 
         shutil.copy(
-            os.path.join(dst_folder, "transformer_wte_weight"),
-            os.path.join(dst_folder, "lm_head_weight"),
+            os.path.join(dst_folder, "wte.weight"),
+            os.path.join(dst_folder, "lm_head.weight"),
         )
diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py
index a121bf399..32bd4b799 100644
--- a/python/flexflow/serve/models/opt.py
+++ b/python/flexflow/serve/models/opt.py
@@ -135,7 +135,7 @@ def build_model(self, max_tokens_per_batch):
                     axes,
                     self.opt_config.layer_norm_elementwise_affine,
                     1e-05,
-                    name=f"layers_{i}_attention_layer_norm",
+                    name=f"layers.{i}.self_attn_layer_norm",
                 )
             else:
                 hidden_states = ffmodel.add(token, positional_embedding)
@@ -159,7 +159,7 @@ def build_model(self, max_tokens_per_batch):
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             elif self.mode == InferenceMode.TREE_VERIFY_MODE:
                 mha = ffmodel.inc_multihead_self_attention_verify(
@@ -179,7 +179,7 @@ def build_model(self, max_tokens_per_batch):
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             elif self.mode == InferenceMode.INC_DECODING_MODE:
                 mha = ffmodel.inc_multihead_self_attention(
@@ -199,7 +199,7 @@ def build_model(self, max_tokens_per_batch):
                     (self.opt_config.hidden_size / self.opt_config.num_attention_heads)
                     ** (-0.5),  # scaling_factor
                     False,  # qk_prod_scaling
-                    name=f"layers_{i}_attention",
+                    name=f"layers.{i}.self_attn",
                 )
             else:
                 assert False
@@ -211,7 +211,7 @@ def build_model(self, max_tokens_per_batch):
                 axes,
                 self.opt_config.layer_norm_elementwise_affine,
                 1e-05,
-                name=f"layers_{i}_add_bias_residual_layer_norm",
+                name=f"layers.{i}.add_bias_residual_layer_norm",
             )
 
             if not self.opt_config.do_layer_norm_before:
@@ -222,14 +222,14 @@ def build_model(self, max_tokens_per_batch):
                 self.opt_config.ffn_dim,
                 ActiMode.AC_MODE_RELU,
                 True,
-                name=f"layers_{i}_fc1",
+                name=f"layers.{i}.fc1",
             )
             fc2 = ffmodel.dense(
                 fc1,
                 self.opt_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 True,
-                name=f"layers_{i}_fc2",
+                name=f"layers.{i}.fc2",
             )
 
             if not self.opt_config.do_layer_norm_before:
@@ -241,7 +241,7 @@ def build_model(self, max_tokens_per_batch):
                     axes,
                     self.opt_config.layer_norm_elementwise_affine,
                     1e-05,
-                    name=f"layers_{i}_final_layer_norm",
+                    name=f"layers.{i}.final_layer_norm",
                 )
 
         _, all_final_norm = ffmodel.residual_layer_norm(
@@ -259,7 +259,7 @@ def build_model(self, max_tokens_per_batch):
             self.opt_config.vocab_size,
             ActiMode.AC_MODE_NONE,
             False,
-            name="embed_tokens_weight_lm_head",
+            name="lm_head",
         )
 
         if self.mode == InferenceMode.BEAM_SEARCH_MODE:
@@ -279,6 +279,16 @@ def build_model(self, max_tokens_per_batch):
 
         self.ffmodel = ffmodel
 
+    def convert_hf_weight_name(name):
+        return (
+            name.replace("decoder.", "")
+            .replace("model.", "")
+            .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias")
+            .replace(
+                ".final_layer_norm", ".add_bias_residual_layer_norm"
+            )  # important to use the leading "_" to avoid matching the last LayerNorm
+        )
+
     def convert_hf_model(model, dst_folder):
         os.makedirs(dst_folder, exist_ok=True)
         for name, params in model.named_parameters():
@@ -299,6 +309,6 @@ def convert_hf_model(model, dst_folder):
             params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}")
         # copy embedding weights
         shutil.copy(
-            os.path.join(dst_folder, "embed_tokens_weight"),
-            os.path.join(dst_folder, "embed_tokens_weight_lm_head"),
+            os.path.join(dst_folder, "embed_tokens.weight"),
+            os.path.join(dst_folder, "lm_head.weight"),
         )
diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py
index 9272addb3..762ad24c4 100644
--- a/python/flexflow/serve/models/starcoder.py
+++ b/python/flexflow/serve/models/starcoder.py
@@ -105,7 +105,7 @@ def build_model(self, max_tokens_per_batch):
             self.data_type,
             None,
             embed_init,
-            name="transformer_wte",
+            name="wte",
         )
         positional_embedding = ffmodel.embedding(
             position_tensor,
@@ -115,7 +115,7 @@ def build_model(self, max_tokens_per_batch):
             self.data_type,
             None,
             embed_init,
-            name="transformer_wpe",
+            name="wpe",
         )
 
         axes = [
@@ -133,7 +133,7 @@ def build_model(self, max_tokens_per_batch):
                 axes,
                 True,
                 self.starcoder_config.layer_norm_epsilon,
-                name=f"layers_{i}_ln_1",
+                name=f"layers.{i}.ln_1",
             )
 
             assert self.mode == InferenceMode.INC_DECODING_MODE
@@ -153,7 +153,7 @@ def build_model(self, max_tokens_per_batch):
                 DataType.DT_NONE,  # data_type
                 None,  # kernel initializer
                 self.starcoder_config.rotary_embedding_meta,
-                name=f"layers_{i}_attention",
+                name=f"layers.{i}.attn.c_attn",
             )
 
             residual, l2_norm = ffmodel.residual_layer_norm(
@@ -165,7 +165,7 @@ def build_model(self, max_tokens_per_batch):
                 axes,
                 True,
                 self.starcoder_config.layer_norm_epsilon,
-                name=f"layers_{i}_ln_2",
+                name=f"layers.{i}.ln_2",
             )
 
             # mlp
@@ -175,7 +175,7 @@ def build_model(self, max_tokens_per_batch):
                 self.starcoder_config.intermediate_size,
                 ActiMode.AC_MODE_NONE,
                 True,
-                name=f"layers_{i}_mlp_c_fc",
+                name=f"layers.{i}.mlp.c_fc",
             )
             activation = ffmodel.gelu(c_fc, False)
             c_proj = ffmodel.dense(
@@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch):
                 self.starcoder_config.hidden_size,
                 ActiMode.AC_MODE_NONE,
                 True,
-                name=f"layers_{i}_mlp_c_proj",
+                name=f"layers.{i}.mlp.c_proj",
             )
 
         _, ln_f = ffmodel.residual_layer_norm(
@@ -194,7 +194,7 @@ def build_model(self, max_tokens_per_batch):
             axes,
             True,
             self.starcoder_config.layer_norm_epsilon,
-            name=f"transformer_ln_f",
+            name=f"ln_f",
         )
         lm_head = ffmodel.dense(
             ln_f,
@@ -218,11 +218,11 @@ def build_model(self, max_tokens_per_batch):
     def convert_hf_model(model, dst_folder):
         os.makedirs(dst_folder, exist_ok=True)
         for name, params in model.named_parameters():
-            name = name.replace("transformer.h", "layers").replace(".", "_")
-            if "c_attn_weight" in name:
-                name_q = name.replace("attn_c_attn", "attention_wq")
-                name_k = name.replace("attn_c_attn", "attention_wk")
-                name_v = name.replace("attn_c_attn", "attention_wv")
+            name = name.replace("transformer.h", "layers").replace("transformer", "")
+            if "attn.c_attn.weight" in name:
+                name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj")
+                name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj")
+                name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj")
                 q, k, v = torch.split(
                     params,
                     [
@@ -235,10 +235,10 @@ def convert_hf_model(model, dst_folder):
                 q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q))
                 k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k))
                 v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v))
-            elif "c_attn_bias" in name:
-                name_q = name.replace("attn_c_attn", "attention_wq")
-                name_k = name.replace("attn_c_attn", "attention_wk")
-                name_v = name.replace("attn_c_attn", "attention_wv")
+            elif "attn.c_attn.bias" in name:
+                name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj")
+                name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj")
+                name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj")
                 q, k, v = torch.split(
                     params,
                     [
@@ -251,14 +251,14 @@ def convert_hf_model(model, dst_folder):
                 q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q))
                 k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k))
                 v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v))
-            elif "c_proj_bias" in name:
-                name = name.replace("attn_c_proj", "attention_wo")
+            elif "attn.c_proj.bias" in name:
+                name = name.replace("attn.c_proj", "attn.c_attn.o_proj")
                 params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name))
-            elif "c_proj_weight" in name:
-                name = name.replace("attn_c_proj", "attention_wo")
+            elif "attn.c_proj.weight" in name:
+                name = name.replace("attn.c_proj", "attn.c_attn.o_proj")
                 params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name))
             else:
                 params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name))
         model.lm_head.weight.detach().cpu().numpy().tofile(
-            os.path.join(dst_folder, "lm_head_weight")
+            os.path.join(dst_folder, "lm_head.weight")
         )
diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index 5f8b07915..58c9dc9aa 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -220,7 +220,13 @@ def download_hf_weights_if_needed(self):
                 )
             # Download model from HuggingFace, or load it from the local folder
             hf_model = AutoModelForCausalLM.from_pretrained(
-                self.model_name, trust_remote_code=True
+                self.model_name,
+                trust_remote_code=True,
+                torch_dtype=(
+                    torch.float32
+                    if self.data_type == DataType.DT_FLOAT
+                    else torch.float16
+                ),
             )
             # Print log message to notify user download of model has finished
             if not os.path.exists(self.model_name) or os.path.isdir(self.model_name):

From 1c654af2e9fc19fd8c0360bad3345c2908284b59 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 20 Feb 2024 18:20:57 +0000
Subject: [PATCH 591/667] fix: file_loader

---
 src/runtime/file_loader.cc | 123 ++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 69 deletions(-)

diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index fb6d3b4dc..66a752d23 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -220,11 +220,11 @@ void load_attention_weights_v2(DT *ptr,
                                int tensor_parallelism_degree) {
   // layers_0_attention_wq_weight
   // layers_0_self_attn_q_proj_weight
-  std::string q_file = layer_name + "_wq_weight";
-  std::string k_file = layer_name + "_wk_weight";
-  std::string v_file = layer_name + "_wv_weight";
-  std::string o_file = layer_name + "_wo_weight";
-  std::vector<std::string> weight_filenames = {q_file, k_file, v_file, o_file};
+  std::string q_file = layer_name + ".q_proj.weight";
+  std::string k_file = layer_name + ".k_proj.weight";
+  std::string v_file = layer_name + ".v_proj.weight";
+  std::string o_file = layer_name + ".o_proj.weight";
+  std::vector<std::string> weight_filenames = {q_file, k_file, v_file};
   int file_index = 0;
 
   int base_index = 0;
@@ -377,10 +377,10 @@ void load_attention_weights_quantized(char *ptr,
                                       bool use_full_precision) {
   // layers_0_attention_wq_weight
   // layers_0_self_attn_q_proj_weight
-  std::string q_file = layer_name + "_wq_weight";
-  std::string k_file = layer_name + "_wk_weight";
-  std::string v_file = layer_name + "_wv_weight";
-  std::string o_file = layer_name + "_wo_weight";
+  std::string q_file = layer_name + ".q_proj.weight";
+  std::string k_file = layer_name + ".k_proj.weight";
+  std::string v_file = layer_name + ".v_proj.weight";
+  std::string o_file = layer_name + ".o_proj.weight";
   std::vector<std::string> weight_filenames = {q_file, k_file, v_file, o_file};
 
   int file_index = 0;
@@ -658,7 +658,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
     if (weight_idx > 0) {
       assert(weight_idx == 0 || weight_idx == 1);
       if (weight_filename != "embed_tokens_weight_lm_head") {
-        weight_filename += weight_idx == 0 ? "_weight" : "_bias";
+        weight_filename += weight_idx == 0 ? ".weight" : ".bias";
       }
     }
     load_from_quantized_file(data,
@@ -693,68 +693,53 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
 
   std::string weight_filename = removeGuidOperatorName(std::string(l->name));
 
-  if (ff->config.benchmarking) {
-    std::cout << "Initializing weight " << weight_filename
-              << " with random data (benchmarking mode)" << std::endl;
-    // If benchmarking, we don't need to load the weights
-    // We can just fill the weight tensor with random data
-  } else {
-    if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
-        l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
-        l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
-      if (weight_filename.find("self_attention") != std::string::npos) {
-        load_attention_weights_multi_query(
-            data, weight_filename, weights_folder, hidden_dim, num_heads);
-      } else if (weight_filename.find("attention") != std::string::npos &&
-                 weight_filename.rfind("attention") ==
-                     weight_filename.length() - strlen("attention")) {
-        if (weight_idx == 0) {
-          load_attention_weights_v2(data,
-                                    num_heads,
-                                    num_kv_heads,
-                                    hidden_dim,
-                                    head_dim,
-                                    weight_filename,
-                                    weights_folder,
-                                    volume,
-                                    tensor_parallelism_degree);
-        } else {
-          long long value;
-          l->get_int_property("final_bias", value);
-          bool final_bias = (bool)value;
-          load_attention_bias_v2(data,
-                                 num_heads,
-                                 num_kv_heads,
-                                 hidden_dim,
-                                 head_dim,
-                                 final_bias,
-                                 weight_filename,
-                                 weights_folder);
-        }
-
+  if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
+      l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
+      l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
+    if (weight_filename.find("attention") != std::string::npos &&
+        weight_filename.rfind("attention") ==
+            weight_filename.length() - strlen("attention")) {
+      if (weight_idx == 0) {
+        load_attention_weights_v2(data,
+                                  num_heads,
+                                  num_kv_heads,
+                                  hidden_dim,
+                                  qkv_inner_dim,
+                                  weight_filename,
+                                  weights_folder,
+                                  volume,
+                                  tensor_parallelism_degree);
       } else {
-        assert(false);
+        long long value;
+        l->get_int_property("final_bias", value);
+        bool final_bias = (bool)value;
+        load_attention_bias_v2(data,
+                               num_heads,
+                               num_kv_heads,
+                               hidden_dim,
+                               qkv_inner_dim,
+                               final_bias,
+                               weight_filename,
+                               weights_folder);
       }
-    } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
-      assert(weight_idx >= 0 || weight_idx <= 2);
-      weight_filename += (weight_idx == 0)
-                             ? "_attn_bias"
-                             : ((weight_idx == 1) ? "_weight" : "_bias");
-      std::cout << "Loading weight file " << weight_filename << std::endl;
-      std::string weight_filepath =
-          join_path({weights_folder, weight_filename});
-      load_from_file(data, volume, weight_filepath);
+
     } else {
-      // default op
-      assert(weight_idx == 0 || weight_idx == 1);
-      // handle exception
-      if (weight_filename != "embed_tokens_weight_lm_head") {
-        weight_filename += weight_idx == 0 ? "_weight" : "_bias";
-      }
-      std::cout << "Loading weight file " << weight_filename << std::endl;
-      std::string weight_filepath =
-          join_path({weights_folder, weight_filename});
-      load_from_file(data, volume, weight_filepath);
+      assert(false);
+    }
+  } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
+    assert(weight_idx >= 0 || weight_idx <= 2);
+    weight_filename += (weight_idx == 0)
+                           ? "_attn_bias"
+                           : ((weight_idx == 1) ? ".weight" : ".bias");
+    std::cout << "Loading weight file " << weight_filename << std::endl;
+    std::string weight_filepath = join_path({weights_folder, weight_filename});
+    load_from_file(data, volume, weight_filepath);
+  } else {
+    // default op
+    assert(weight_idx == 0 || weight_idx == 1);
+    // handle exception
+    if (weight_filename != "embed_tokens_weight_lm_head") {
+      weight_filename += weight_idx == 0 ? ".weight" : ".bias";
     }
   }
 

From 0a9cb8fe1002492d70d0eda2055ba7cd85ba211e Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Wed, 21 Feb 2024 21:23:05 +0000
Subject: [PATCH 592/667] fix

---
 src/runtime/file_loader.cc | 49 ++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 66a752d23..e3fdc26e9 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -696,35 +696,28 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
   if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
       l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
       l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
-    if (weight_filename.find("attention") != std::string::npos &&
-        weight_filename.rfind("attention") ==
-            weight_filename.length() - strlen("attention")) {
-      if (weight_idx == 0) {
-        load_attention_weights_v2(data,
-                                  num_heads,
-                                  num_kv_heads,
-                                  hidden_dim,
-                                  qkv_inner_dim,
-                                  weight_filename,
-                                  weights_folder,
-                                  volume,
-                                  tensor_parallelism_degree);
-      } else {
-        long long value;
-        l->get_int_property("final_bias", value);
-        bool final_bias = (bool)value;
-        load_attention_bias_v2(data,
-                               num_heads,
-                               num_kv_heads,
-                               hidden_dim,
-                               qkv_inner_dim,
-                               final_bias,
-                               weight_filename,
-                               weights_folder);
-      }
-
+    if (weight_idx == 0) {
+      load_attention_weights_v2(data,
+                                num_heads,
+                                num_kv_heads,
+                                hidden_dim,
+                                qkv_inner_dim,
+                                weight_filename,
+                                weights_folder,
+                                volume,
+                                tensor_parallelism_degree);
     } else {
-      assert(false);
+      long long value;
+      l->get_int_property("final_bias", value);
+      bool final_bias = (bool)value;
+      load_attention_bias_v2(data,
+                             num_heads,
+                             num_kv_heads,
+                             hidden_dim,
+                             qkv_inner_dim,
+                             final_bias,
+                             weight_filename,
+                             weights_folder);
     }
   } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
     assert(weight_idx >= 0 || weight_idx <= 2);

From 6f13b5b6c735cf616e6fdcc4a0547becb120018d Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Wed, 21 Feb 2024 23:03:11 +0000
Subject: [PATCH 593/667] fix

---
 python/flexflow/serve/models/opt.py |  3 ++-
 src/runtime/file_loader.cc          | 14 +++++---------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py
index 32bd4b799..abf88b784 100644
--- a/python/flexflow/serve/models/opt.py
+++ b/python/flexflow/serve/models/opt.py
@@ -283,7 +283,8 @@ def convert_hf_weight_name(name):
         return (
             name.replace("decoder.", "")
             .replace("model.", "")
-            .replace("self_attn.wo.bias", "add_bias_residual_layer_norm.attn_bias")
+            .replace("self_attn.out_proj", "self_attn.o_proj")
+            .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias")
             .replace(
                 ".final_layer_norm", ".add_bias_residual_layer_norm"
             )  # important to use the leading "_" to avoid matching the last LayerNorm
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index e3fdc26e9..be215e73d 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -137,12 +137,12 @@ void load_attention_bias_v2(DT *ptr,
                             bool final_bias,
                             std::string layer_name,
                             std::string weights_folder) {
-  std::string q_file = layer_name + "_wq_bias";
-  std::string k_file = layer_name + "_wk_bias";
-  std::string v_file = layer_name + "_wv_bias";
+  std::string q_file = layer_name + ".q_proj.bias";
+  std::string k_file = layer_name + ".k_proj.bias";
+  std::string v_file = layer_name + ".v_proj.bias";
   std::vector<std::string> bias_files = {q_file, k_file, v_file};
   if (final_bias) {
-    std::string o_file = layer_name + "_wo_bias";
+    std::string o_file = layer_name + ".o_proj.bias";
     bias_files.push_back(o_file);
   }
 
@@ -218,8 +218,6 @@ void load_attention_weights_v2(DT *ptr,
                                std::string weights_folder,
                                size_t volume,
                                int tensor_parallelism_degree) {
-  // layers_0_attention_wq_weight
-  // layers_0_self_attn_q_proj_weight
   std::string q_file = layer_name + ".q_proj.weight";
   std::string k_file = layer_name + ".k_proj.weight";
   std::string v_file = layer_name + ".v_proj.weight";
@@ -375,8 +373,6 @@ void load_attention_weights_quantized(char *ptr,
                                       std::string weights_folder,
                                       DataType data_type,
                                       bool use_full_precision) {
-  // layers_0_attention_wq_weight
-  // layers_0_self_attn_q_proj_weight
   std::string q_file = layer_name + ".q_proj.weight";
   std::string k_file = layer_name + ".k_proj.weight";
   std::string v_file = layer_name + ".v_proj.weight";
@@ -722,7 +718,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
   } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
     assert(weight_idx >= 0 || weight_idx <= 2);
     weight_filename += (weight_idx == 0)
-                           ? "_attn_bias"
+                           ? ".attn_bias"
                            : ((weight_idx == 1) ? ".weight" : ".bias");
     std::cout << "Loading weight file " << weight_filename << std::endl;
     std::string weight_filepath = join_path({weights_folder, weight_filename});

From 82cb8de9696efa67521d939b7c280b8cc89204e1 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Tue, 29 Oct 2024 19:21:42 -0700
Subject: [PATCH 594/667] feat: update file_loader to latest ver. on `peft`

---
 src/runtime/file_loader.cc | 186 +++++++++++++++++++++++--------------
 1 file changed, 116 insertions(+), 70 deletions(-)

diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index be215e73d..097795521 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -229,65 +229,58 @@ void load_attention_weights_v2(DT *ptr,
   size_t single_proj_size =
       hidden_dim *
       head_dim; // size of each of Q,K,V,O weights for a single head
+  size_t one_weight_file_size =
+      num_heads * single_proj_size; // size of each of Q/K/V/O for all heads
+
+  size_t q_size = one_weight_file_size, o_size = one_weight_file_size;
+  size_t k_size = single_proj_size * num_kv_heads,
+         v_size = single_proj_size * num_kv_heads;
+
+  size_t k_replicate_size = one_weight_file_size;
+  size_t v_replicate_size = one_weight_file_size;
 
-  size_t qo_size = single_proj_size * num_heads,
-         kv_size = single_proj_size * num_kv_heads;
-  size_t kv_replicate_size = qo_size;
   int replicate_num = num_heads / num_kv_heads;
-  // stride for q, k, v, o
-  size_t stride_size =
-      (qo_size + kv_replicate_size * 2 + qo_size) / tensor_parallelism_degree;
 
+  // stride for q, k, v, o
+  size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) /
+                       tensor_parallelism_degree;
   for (auto filename : weight_filenames) {
     std::cout << "Loading weight file " << filename << std::endl;
     std::string weight_filepath = join_path({weights_folder, filename});
-    size_t file_size = (file_index == 0 || file_index == 3) ? qo_size : kv_size;
+
+    int data_index = 0;
+    size_t partial_size = (file_index == 0 || file_index == 3)
+                              ? one_weight_file_size
+                              : single_proj_size * num_kv_heads;
+    size_t one_partition_size =
+        one_weight_file_size / tensor_parallelism_degree;
 
     std::ifstream in(weight_filepath, std::ios::in | std::ios::binary);
     if (!in.good()) {
       std::cout << "Could not open file: " << weight_filepath << std::endl;
     }
     assert(in.good() && "incorrect weight file path");
-    std::vector<DT> host_array(file_size);
-    size_t loaded_data_size = sizeof(DT) * file_size;
+    std::vector<DT> host_array(partial_size);
+    size_t loaded_data_size = sizeof(DT) * partial_size;
     in.seekg(0, in.end);
     in.seekg(0, in.beg);
     in.read((char *)host_array.data(), loaded_data_size);
     size_t in_get_size = in.gcount();
+
     if (in_get_size != loaded_data_size) {
       std::cout << "load attention data error " << in_get_size << ", "
                 << loaded_data_size << ", " << file_index << ", "
                 << weight_filepath << "\n";
       assert(false && "data size mismatch");
     }
-
+    // wq, wk, wo
     if (file_index == 0) {
-      // q, o
-      int one_partition_size = qo_size / tensor_parallelism_degree;
-      int data_index = 0;
       for (int i = 0; i < tensor_parallelism_degree; i++) {
         for (int j = 0; j < one_partition_size; j++) {
           ptr[base_index + i * stride_size + j] = host_array.at(data_index++);
         }
       }
-      base_index += one_partition_size;
-    } else if (file_index == 3) {
-      // o
-      int one_partition_size =
-          head_dim * (num_heads / tensor_parallelism_degree);
-      int data_index = 0;
-      for (int i = 0; i < qo_size; i++) {
-        int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
-        int block_num = (i / one_partition_size);
-        int offset =
-            block_num / tensor_parallelism_degree * one_partition_size +
-            (i % one_partition_size);
-        ptr[base_index + part_idx * stride_size + offset] =
-            host_array.at(data_index++);
-      }
-      base_index += one_partition_size;
     } else {
-      // k, v
       for (int i = 0; i < num_heads; i++) {
         int kv_idx = i / (num_heads / num_kv_heads);
         int head_idx = i % (num_heads / tensor_parallelism_degree);
@@ -297,11 +290,52 @@ void load_attention_weights_v2(DT *ptr,
               j] = host_array.at(kv_idx * single_proj_size + j);
         }
       }
-      base_index += kv_replicate_size / tensor_parallelism_degree;
     }
 
+    // assert(data_index == partial_size);
+    base_index += one_partition_size;
     file_index++;
+  }
+  assert(base_index == (q_size + k_replicate_size + v_replicate_size) /
+                           tensor_parallelism_degree);
+
+  {
+    std::cout << "Loading weight file " << o_file << std::endl;
+    std::string weight_filepath = join_path({weights_folder, o_file});
+
+    std::ifstream in(weight_filepath, std::ios::in | std::ios::binary);
+    if (!in.good()) {
+      std::cout << "Could not open file: " << weight_filepath << std::endl;
+    }
+    assert(in.good() && "incorrect weight file path");
+    std::vector<DT> host_array(one_weight_file_size);
+    size_t loaded_data_size = sizeof(DT) * one_weight_file_size;
+    in.seekg(0, in.end);
+    in.seekg(0, in.beg);
+    in.read((char *)host_array.data(), loaded_data_size);
+    size_t in_get_size = in.gcount();
+
+    if (in_get_size != loaded_data_size) {
+      std::cout << "load data error" << std::endl;
+      assert(false);
+    }
+    assert(one_weight_file_size == host_array.size());
+    int data_index = 0;
+
+    int one_partition_size =
+        head_dim * (num_heads / tensor_parallelism_degree);
+    for (int i = 0; i < one_weight_file_size; i++) {
+      int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
+      int block_num = (i / one_partition_size);
+      int offset = block_num / tensor_parallelism_degree * one_partition_size +
+                   (i % one_partition_size);
+      ptr[base_index + part_idx * stride_size + offset] =
+          host_array.at(data_index++);
+    }
+
     in.close();
+
+    assert(data_index == one_weight_file_size);
   }
 }
 
@@ -689,46 +723,58 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
 
   std::string weight_filename = removeGuidOperatorName(std::string(l->name));
 
-  if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
-      l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
-      l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
-    if (weight_idx == 0) {
-      load_attention_weights_v2(data,
-                                num_heads,
-                                num_kv_heads,
-                                hidden_dim,
-                                qkv_inner_dim,
-                                weight_filename,
-                                weights_folder,
-                                volume,
-                                tensor_parallelism_degree);
-    } else {
-      long long value;
-      l->get_int_property("final_bias", value);
-      bool final_bias = (bool)value;
-      load_attention_bias_v2(data,
-                             num_heads,
-                             num_kv_heads,
-                             hidden_dim,
-                             qkv_inner_dim,
-                             final_bias,
-                             weight_filename,
-                             weights_folder);
-    }
-  } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
-    assert(weight_idx >= 0 || weight_idx <= 2);
-    weight_filename += (weight_idx == 0)
-                           ? ".attn_bias"
-                           : ((weight_idx == 1) ? ".weight" : ".bias");
-    std::cout << "Loading weight file " << weight_filename << std::endl;
-    std::string weight_filepath = join_path({weights_folder, weight_filename});
-    load_from_file(data, volume, weight_filepath);
+  if (ff->config.benchmarking) {
+    std::cout << "Initializing weight " << weight_filename
+              << " with random data (benchmarking mode)" << std::endl;
+    // If benchmarking, we don't need to load the weights
+    // We can just fill the weight tensor with random data
   } else {
-    // default op
-    assert(weight_idx == 0 || weight_idx == 1);
-    // handle exception
-    if (weight_filename != "embed_tokens_weight_lm_head") {
-      weight_filename += weight_idx == 0 ? ".weight" : ".bias";
+    if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
+        l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
+        l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) {
+      if (weight_idx == 0) {
+        load_attention_weights_v2(data,
+                                  num_heads,
+                                  num_kv_heads,
+                                  hidden_dim,
+                                  head_dim,
+                                  weight_filename,
+                                  weights_folder,
+                                  volume,
+                                  tensor_parallelism_degree);
+      } else {
+        long long value;
+        l->get_int_property("final_bias", value);
+        bool final_bias = (bool)value;
+        load_attention_bias_v2(data,
+                               num_heads,
+                               num_kv_heads,
+                               hidden_dim,
+                               head_dim,
+                               final_bias,
+                               weight_filename,
+                               weights_folder);
+      }
+    } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) {
+      assert(weight_idx >= 0 || weight_idx <= 2);
+      weight_filename += (weight_idx == 0)
+                             ? ".attn_bias"
+                             : ((weight_idx == 1) ? ".weight" : ".bias");
+      std::cout << "Loading weight file " << weight_filename << std::endl;
+      std::string weight_filepath =
+          join_path({weights_folder, weight_filename});
+      load_from_file(data, volume, weight_filepath);
+    } else {
+      // default op
+      assert(weight_idx == 0 || weight_idx == 1);
+      // handle exception
+      if (weight_filename != "embed_tokens_weight_lm_head") {
+        weight_filename += weight_idx == 0 ? ".weight" : ".bias";
+      }
+      std::cout << "Loading weight file " << weight_filename << std::endl;
+      std::string weight_filepath =
+          join_path({weights_folder, weight_filename});
+      load_from_file(data, volume, weight_filepath);
     }
   }
 

From 7667483ddbff7eb8fae71c8d5eae3654dd31531a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 30 Oct 2024 22:57:46 -0700
Subject: [PATCH 595/667] fix: alignment issue

---
 inference/models/llama.cc      | 24 ++++++++++++------------
 src/runtime/file_loader.cc     |  3 +--
 src/runtime/request_manager.cu | 16 +++++++++-------
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 79ed850f2..988f8f4b5 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -217,18 +217,18 @@ void LLAMA::create_llama_model(FFModel &ff,
     Tensor multi =
         ff.sigmoid_silu_multi(w1, w3, llama_config.intermediate_size);
 
-    w2 =
-        ff.dense(multi,
-                 llama_config.hidden_size,
-                 AC_MODE_NONE,
-                 false,
-                 DT_NONE,
-                 nullptr,
-                 nullptr,
-                 nullptr,
-                 REG_MODE_NONE,
-                 0.0f,
-                 std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str());
+    w2 = ff.dense(
+        multi,
+        llama_config.hidden_size,
+        AC_MODE_NONE,
+        false,
+        DT_NONE,
+        nullptr,
+        nullptr,
+        nullptr,
+        REG_MODE_NONE,
+        0.0f,
+        std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str());
   }
   // final normalization and linear
   Tensor final_rms_norm_output[2] = {nullptr, nullptr};
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 097795521..05f3ec23f 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -322,8 +322,7 @@ void load_attention_weights_v2(DT *ptr,
     assert(one_weight_file_size == host_array.size());
     int data_index = 0;
 
-    int one_partition_size =
-        head_dim * (num_heads / tensor_parallelism_degree);
+    int one_partition_size = head_dim * (num_heads / tensor_parallelism_degree);
     for (int i = 0; i < one_weight_file_size; i++) {
       int part_idx = (i / one_partition_size) % tensor_parallelism_degree;
       int block_num = (i / one_partition_size);
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 68bfc0430..56437bd0c 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -185,15 +185,16 @@ __global__ void
     }
   }
 
+  BatchConfig::PerRequestInfo const &request_info =
+      request_infos[requext_idx_in_batch];
   BatchConfig::BitMask &causal_mask = causalMask[requext_idx_in_batch];
 
-  int const q_length = request_infos[requext_idx_in_batch].num_tokens_in_batch,
-            q_start = request_infos[requext_idx_in_batch]
-                          .first_token_index_in_request -
+  int const q_length = request_info.num_tokens_in_batch,
+            q_start = request_info.first_token_index_in_request -
                       causal_mask.non_tree_cache_size,
             non_tree_cache_size = causal_mask.non_tree_cache_size,
-            kv_len = causal_mask.non_tree_cache_size +
-                     causal_mask.tree_or_prompt_size;
+            kv_len = request_info.num_tokens_in_batch +
+                     request_info.first_token_index_in_request;
 
   uint8_t packed_bits = 0;
   for (int bit_idx = 0; bit_idx < 8; bit_idx++) {
@@ -244,8 +245,9 @@ void update_custom_mask(BatchConfig const *batch_config,
        req_idx++) {
     if (batch_config->request_available[req_idx]) {
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-      int kv_len = batch_config->causalMask[req_idx].non_tree_cache_size +
-                   batch_config->causalMask[req_idx].tree_or_prompt_size;
+      int kv_len =
+          batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+          batch_config->requestsInfo[req_idx].first_token_index_in_request;
       parallelism += (q_len * kv_len + 7) / 8;
     }
   }

From 7b7db5b5fd11e47362b29015c22fdd91170f4ca6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 1 Nov 2024 21:18:26 -0700
Subject: [PATCH 596/667] fix: use double for latencies

---
 inference/incr_decoding/incr_decoding.cc     | 18 +++++++++---------
 inference/spec_infer/spec_infer.cc           | 18 +++++++++---------
 inference/trace_generator/trace_generator.cc |  6 +++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 80b9f3c6f..2b2db8b95 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -54,9 +54,9 @@ void parse_input_args(char **argv,
                       int &sampling_seed,
                       bool &streaming_cache,
                       bool &slo_attainment_early_termination,
-                      int &baseline_latency_ms,
-                      int &ssm_spec_latency_ms,
-                      int &llm_verify_latency_ms,
+                      double &baseline_latency_ms,
+                      double &ssm_spec_latency_ms,
+                      double &llm_verify_latency_ms,
                       double &request_per_second,
                       std::string &emission_file_path,
                       bool &add_special_tokens) {
@@ -147,15 +147,15 @@ void parse_input_args(char **argv,
       continue;
     }
     if (!strcmp(argv[i], "--baseline-latency-ms")) {
-      baseline_latency_ms = std::stoi(argv[++i]);
+      baseline_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--ssm-spec-latency-ms")) {
-      ssm_spec_latency_ms = std::stoi(argv[++i]);
+      ssm_spec_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--llm-verify-latency-ms")) {
-      llm_verify_latency_ms = std::stoi(argv[++i]);
+      llm_verify_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--request-per-second")) {
@@ -209,9 +209,9 @@ void FlexFlow::top_level_task(Task const *task,
   int sampling_seed = 0;
   bool streaming_cache = false;
   bool slo_attainment_early_termination = false;
-  int baseline_latency_ms = 50;
-  int ssm_spec_latency_ms = 20;
-  int llm_verify_latency_ms = 50;
+  double baseline_latency_ms = 50;
+  double ssm_spec_latency_ms = 20;
+  double llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
   bool add_special_tokens = true;
   std::string emission_file_path;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 78fa85ab2..cde24f7b2 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -76,9 +76,9 @@ void parse_input_args(char **argv,
                       int &sampling_seed,
                       bool &streaming_cache,
                       bool &slo_attainment_early_termination,
-                      int &baseline_latency_ms,
-                      int &ssm_spec_latency_ms,
-                      int &llm_verify_latency_ms,
+                      double &baseline_latency_ms,
+                      double &ssm_spec_latency_ms,
+                      double &llm_verify_latency_ms,
                       double &request_per_second,
                       bool &spec_infer_old_version,
                       bool &greedy_schedule,
@@ -190,15 +190,15 @@ void parse_input_args(char **argv,
       continue;
     }
     if (!strcmp(argv[i], "--baseline-latency-ms")) {
-      baseline_latency_ms = std::stoi(argv[++i]);
+      baseline_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--ssm-spec-latency-ms")) {
-      ssm_spec_latency_ms = std::stoi(argv[++i]);
+      ssm_spec_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--llm-verify-latency-ms")) {
-      llm_verify_latency_ms = std::stoi(argv[++i]);
+      llm_verify_latency_ms = std::stod(argv[++i]);
       continue;
     }
     if (!strcmp(argv[i], "--request-per-second")) {
@@ -393,9 +393,9 @@ void FlexFlow::top_level_task(Task const *task,
   int sampling_seed = 0;
   bool streaming_cache = false;
   bool slo_attainment_early_termination = false;
-  int baseline_latency_ms = 50;
-  int ssm_spec_latency_ms = 20;
-  int llm_verify_latency_ms = 50;
+  double baseline_latency_ms = 50;
+  double ssm_spec_latency_ms = 20;
+  double llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
   bool spec_infer_old_version = false;
   bool greedy_schedule = false;
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 853487cc4..c45c0537f 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -331,9 +331,9 @@ void FlexFlow::top_level_task(Task const *task,
   int sampling_seed = 0;
   bool streaming_cache = false;
   bool slo_attainment_early_termination = false;
-  int baseline_latency_ms = 50;
-  int ssm_spec_latency_ms = 20;
-  int llm_verify_latency_ms = 50;
+  double baseline_latency_ms = 50;
+  double ssm_spec_latency_ms = 20;
+  double llm_verify_latency_ms = 50;
   double request_per_second = 1.0;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();

From da92d535648f3ee5e693e1d85d6db6dd6d24e542 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 3 Nov 2024 22:19:04 -0800
Subject: [PATCH 597/667] feat: modified the logic of distributing the budget
 across requests

---
 include/flexflow/request_manager.h |  4 +++-
 src/runtime/request_manager.cc     | 26 ++++++++++++++++++++------
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f4b7aede8..b76291129 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -537,7 +537,9 @@ class RequestManager {
   void prune_token_tree();
   void prune_token_tree_equal();
   void prune_token_tree_greedy();
-  void add_tokens_toward_slo(RequestGuid guid, int &budget);
+  void add_tokens_toward_slo(RequestGuid guid,
+                             int &budget,
+                             int num_req_with_slo);
   void add_tokens_toward_memory_occupancy(int budget);
   void add_tokens_toward_goodput(int budget);
   void add_tokens_toward_goodput_per_request(int budget, int request_index);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0397b96f4..53cd7ecc6 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2970,7 +2970,7 @@ void RequestManager::prune_token_tree() {
        spare_latency_2_request_index) {
     int request_index = spare_latency_request_index_pair.second;
     RequestGuid guid = guid_of_requests[request_index];
-    add_tokens_toward_slo(guid, budget);
+    add_tokens_toward_slo(guid, budget, spare_latency_2_request_index.size());
   }
 
   assert(budget >= 0);
@@ -3025,11 +3025,25 @@ void RequestManager::prune_token_tree_greedy() {
   }
 }
 
-void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
+void RequestManager::add_tokens_toward_slo(RequestGuid guid,
+                                           int &budget,
+                                           int num_req_with_slo) {
   Request &request = all_requests[guid];
-  double num_tokens_to_decode = (ssm_spec_latency_ms + llm_verify_latency_ms) *
-                                correction_factor /
-                                (baseline_latency_ms * request.get_slo_ratio());
+  double num_tokens_to_decode = 0.0;
+  double num_tokens_to_decode_per_step =
+      (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
+      (baseline_latency_ms * request.get_slo_ratio());
+  bool attained =
+      request.decode_latency_ms <= get_request_expected_latency(request);
+
+  if (attained) {
+    num_tokens_to_decode = num_tokens_to_decode_per_step;
+  } else {
+    num_tokens_to_decode = num_tokens_to_decode_per_step +
+                           request.decode_latency_ms /
+                               (baseline_latency_ms * request.get_slo_ratio()) -
+                           request.decode_length();
+  }
 
   // The root is already included
   // In function add_root_to_spec_token_tree
@@ -3037,7 +3051,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid, int &budget) {
 
   // The max token that can be added to the token tree when fulfilling the SLO
   int max_token_toward_slo =
-      int(get_max_tokens_per_batch() / get_num_active_requests());
+      int(get_max_tokens_per_batch() / num_req_with_slo * 1.1);
 
   while (budget > 0 and max_token_toward_slo > 0 and
          current_added < num_tokens_to_decode) {

From 832f5cb0d86703eda245abb1b891aa55ec707c51 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Mon, 4 Nov 2024 09:49:42 -0800
Subject: [PATCH 598/667] fix for merge

---
 include/flexflow/request_manager.h           | 2 --
 inference/trace_generator/trace_generator.cc | 1 +
 src/runtime/batch_config.cc                  | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index fa950e776..8403fb889 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -282,8 +282,6 @@ class RequestManager {
   int get_max_kv_cache_size();
   void set_max_output_length(int max_output_length);
   int get_max_output_length();
-  void set_max_kv_cache_size(int max_kv_cache_size);
-  int get_max_kv_cache_size();
   void set_decoding_mode(DecodingMode mode);
   void set_verbose(bool verbose_);
   int get_k();
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 2c37b336d..602f884b9 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -354,6 +354,7 @@ void FlexFlow::top_level_task(Task const *task,
                    verbose,
                    max_sequence_length,
                    max_output_length,
+                   max_kv_cache_size,
                    scaling_factor);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index cf6700934..ca5d08e98 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -110,6 +110,7 @@ int BatchConfig::max_output_length() {
 
 int BatchConfig::max_kv_cache_size() {
   return RequestManager::get_request_manager()->get_max_kv_cache_size();
+}
 bool BatchConfig::streaming_cache() {
   return RequestManager::get_request_manager()->get_streaming_cache();
 }

From 4a7162f9f1ae92480957a2c16cc9863e5495bc63 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Mon, 4 Nov 2024 12:48:54 -0800
Subject: [PATCH 599/667] init page manager at request manager init and clean
 the format

---
 include/flexflow/batch_config.h               |   6 +-
 include/flexflow/model.h                      |   6 +
 .../inc_multihead_self_attention_kernels.h    |  25 +-
 include/flexflow/page_manager.h               | 177 ++++++-----
 include/flexflow/request_manager.h            |   8 +-
 inference/incr_decoding/incr_decoding.cc      |   2 +-
 inference/models/falcon.cc                    |   5 +
 inference/models/falcon.h                     |   1 +
 inference/models/llama.cc                     |   5 +
 inference/models/llama.h                      |   1 +
 inference/models/mpt.cc                       |   4 +
 inference/models/mpt.h                        |   1 +
 inference/models/opt.cc                       |   4 +
 inference/models/opt.h                        |   1 +
 inference/models/starcoder.cc                 |   5 +
 inference/models/starcoder.h                  |   1 +
 src/ops/tree_inc_multihead_self_attention.cu  | 119 ++++----
 src/runtime/batch_config.cc                   |   2 +-
 src/runtime/inference_manager.cc              |  16 +
 src/runtime/page_manager.cc                   | 278 ++++++++++--------
 src/runtime/request_manager.cc                | 149 ++++++----
 src/runtime/request_manager.cu                |  41 ++-
 22 files changed, 514 insertions(+), 343 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index fccf2b6f0..d4c2e38e2 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -114,9 +114,9 @@ class BatchConfig {
     int first_token_index_in_request = -1;
     int first_token_offset_in_batch = -1;
     int num_tokens_in_batch = 0;
-    int padding = 0; // Padding for memory pointer alignment
-    int num_kv_pages; //number of kv pages used
-    int kv_last_page_len; //last page length of kv
+    int padding = 0;      // Padding for memory pointer alignment
+    int num_kv_pages;     // number of kv pages used
+    int kv_last_page_len; // last page length of kv
     RequestGuid request_guid;
   };
 
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 32177a383..bff5d2802 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -1081,6 +1081,10 @@ class FFModel {
                CompMode comp_mode = COMP_MODE_TRAINING);
   void compile_inference();
   void set_transformer_layer_id(int id);
+  void set_num_transformer_layers(int num_layers);
+  void set_num_kv_heads(int num_heads);
+  void set_qkv_dim(int qkv_dim);
+  void set_size_dt(int size_dt);
   void set_position_offset(int offset);
   void graph_optimize(size_t budget,
                       bool only_data_parallel,
@@ -1142,6 +1146,8 @@ class FFModel {
   size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
   size_t current_transformer_layer_id;
   // positional embedding start offset
+  int num_transformer_layers;
+  int num_kv_heads, qkv_dim, size_dt;
   int position_offset;
   FFConfig config;
   FFIterationConfig iter_config;
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index fe8d32387..919393985 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -15,20 +15,25 @@ namespace Kernels {
 namespace IncMultiHeadAttention {
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_k_entry_offset_verify(int const token_idx,
-                                                     int const page_idx,
-                                                     int const num_heads,
-                                                     int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_k_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) *
+                 head_dim * num_heads;
   return index;
 }
 
 // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim]
-__device__ __forceinline__ size_t get_v_entry_offset_verify(int const token_idx,
-                                                     int const page_idx,
-                                                     int const num_heads,
-                                                     int const head_dim) {
-  size_t index = ((page_idx) * kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * head_dim * num_heads;
+__device__ __forceinline__ size_t
+    get_v_entry_offset_verify(int const token_idx,
+                              int const page_idx,
+                              int const num_heads,
+                              int const head_dim) {
+  size_t index =
+      ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) *
+      head_dim * num_heads;
   return index;
 }
 
diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
index 54b661e02..87a544d81 100644
--- a/include/flexflow/page_manager.h
+++ b/include/flexflow/page_manager.h
@@ -1,14 +1,14 @@
 #pragma once
 
 #include "flexflow/batch_config.h"
+#include "flexflow/config.h"
 #include "flexflow/inference.h"
 #include "flexflow/model.h"
-#include "flexflow/config.h"
 #include "flexflow/utils/file_loader.h"
+#include <deque>
 #include <future>
 #include <mutex>
 #include <tokenizers_cpp.h>
-#include <deque>
 
 namespace FlexFlow {
 
@@ -20,118 +20,143 @@ using TokenId = BatchConfig::TokenId;
  */
 class LogicalTokenBlock {
 public:
-    using TokenId = BatchConfig::TokenId;
+  using TokenId = BatchConfig::TokenId;
 
-    // Constructor
-    LogicalTokenBlock(int block_number, uint32_t block_size);
+  // Constructor
+  LogicalTokenBlock(int block_number, uint32_t block_size);
 
-    // Method to check if the block is empty
-    bool is_empty() const;
+  // Method to check if the block is empty
+  bool is_empty() const;
 
-    // Method to check if the block is full
-    bool is_full() const;
+  // Method to check if the block is full
+  bool is_full() const;
 
-    // Method to get the number of empty slots
-    int get_num_empty_slots() const;
+  // Method to get the number of empty slots
+  int get_num_empty_slots() const;
 
-    // Method to get the number of allocated slots
-    int get_num_alloc_slots() const;
+  // Method to get the number of allocated slots
+  int get_num_alloc_slots() const;
 
-    // Used to clean up the spec tokens in a block since these spec tokens may not be committed after use
-    void reset_num_spec_tokens();
+  // Used to clean up the spec tokens in a block since these spec tokens may not
+  // be committed after use
+  void reset_num_spec_tokens();
 
-    // Method to append tokens
-    void append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed);
+  // Method to append tokens
+  void append_tokens(std::vector<TokenId> const &token_ids_to_append,
+                     bool committed);
 
-    int get_num_tokens() const { return num_tokens; }
-    int get_num_commit_tokens() const { return num_commit_tokens; }
-    int get_num_spec_tokens() const { return num_spec_tokens; }
+  int get_num_tokens() const {
+    return num_tokens;
+  }
+  int get_num_commit_tokens() const {
+    return num_commit_tokens;
+  }
+  int get_num_spec_tokens() const {
+    return num_spec_tokens;
+  }
 
-    std::vector<TokenId> get_token_ids() const;
+  std::vector<TokenId> get_token_ids() const;
 
 private:
-    int block_number; // the index of the logical token block
-    int block_size; // the size of the block
-    int num_tokens; // the number of tokens currently stored in the block
-    int num_commit_tokens; // the number of tokens inside this block that are already committed
-    int num_spec_tokens; // the number of tokens inside this block that are speculative tokens, which is stored temporarily
-    std::vector<TokenId> token_ids; //store the token ids in a order that corresponds to the inference sequence
+  int block_number;      // the index of the logical token block
+  int block_size;        // the size of the block
+  int num_tokens;        // the number of tokens currently stored in the block
+  int num_commit_tokens; // the number of tokens inside this block that are
+                         // already committed
+  int num_spec_tokens;   // the number of tokens inside this block that are
+                         // speculative tokens, which is stored temporarily
+  std::vector<TokenId> token_ids; // store the token ids in a order that
+                                  // corresponds to the inference sequence
 };
 
 /**
  * @class PhysicalTokenBlock
- * @brief A class to represent a physical block of tokens similar to physical memory address
- * It keeps track of the location of the tokens stored on GPU memory
+ * @brief A class to represent a physical block of tokens similar to physical
+ * memory address It keeps track of the location of the tokens stored on GPU
+ * memory
  */
 class PhysicalTokenBlock {
 public:
-    // Constructor
-    PhysicalTokenBlock(int block_number, int block_size);
-
-    // Method to get the block number
-    int get_block_number() const { return block_number; }
-    void incr_ref_count() { ref_count++; }
-    void decr_ref_count() { ref_count--; }
-    int ref_count; // reference count, TODO: move to private
+  // Constructor
+  PhysicalTokenBlock(int block_number, int block_size);
+
+  // Method to get the block number
+  int get_block_number() const {
+    return block_number;
+  }
+  void incr_ref_count() {
+    ref_count++;
+  }
+  void decr_ref_count() {
+    ref_count--;
+  }
+  int ref_count; // reference count, TODO: move to private
 
 private:
-    int block_number; // the index of the physical token block
-    int block_size; // the size of the block
+  int block_number; // the index of the physical token block
+  int block_size;   // the size of the block
 };
 
 /**
  * @class BlockAllocator
- * @brief A Block Manager that is reponsible for maintaining a pool of free blocks
+ * @brief A Block Manager that is reponsible for maintaining a pool of free
+ * blocks
  */
 class BlockAllocator {
 public:
-    // Constructor
-    BlockAllocator(int block_size, int num_total_blocks);
+  // Constructor
+  BlockAllocator(int block_size, int num_total_blocks);
 
-    // Allocate a block
-    PhysicalTokenBlock allocate();
+  // Allocate a block
+  PhysicalTokenBlock allocate();
 
-    // Free a block
-    void free(PhysicalTokenBlock& block);
+  // Free a block
+  void free(PhysicalTokenBlock &block);
 
-    // Get the number of free blocks
-    int get_num_free_blocks() const;
+  // Get the number of free blocks
+  int get_num_free_blocks() const;
 
 private:
-    int block_size;
-    int num_total_blocks;
-    std::deque<PhysicalTokenBlock> free_blocks;
+  int block_size;
+  int num_total_blocks;
+  std::deque<PhysicalTokenBlock> free_blocks;
 };
 
 /*
-* @class PageManager
-* @brief A wrapper class that manages the kv cache allocation status
-* notice that all the layers of model will share the same page manager because the position of kv cache will be the same
-*/
+ * @class PageManager
+ * @brief A wrapper class that manages the kv cache allocation status
+ * notice that all the layers of model will share the same page manager because
+ * the position of kv cache will be the same
+ */
 class PageManager {
 public:
-    // Get the singleton instance of the PageManager as it will be shared in multiple places
-    static PageManager *get_page_manager();
-    using BlockTable = std::vector<PhysicalTokenBlock>;
-    using RequestGuid = BatchConfig::RequestGuid;
-    PageManager(int block_size, int num_total_blocks);
-
-    int allocate_one_block(const RequestGuid& request_guid);
-    void free_request(const RequestGuid& request_guid);
-    //used for the case that we want to free the last num_blocks that stores spec tokens(which are the tokens are not yet committed)
-    void free_multiple_blocks(const RequestGuid& request_guid, int num_blocks);
-    std::vector<int> get_block_table_indices(const RequestGuid& request_guid) const;
-
-    
-    void free_block_table(BlockTable& block_table);
-private:
-    int block_size; // the size of the block
-    int num_total_blocks; // the total number of blocks
-    BlockAllocator block_allocator;
-    std::unordered_map<RequestGuid, BlockTable> block_tables;
+  // Get the singleton instance of the PageManager as it will be shared in
+  // multiple places
+  static PageManager *get_page_manager();
+  static PageManager *get_page_manager(FFModel *ff, int kv_cache_size);
+  using BlockTable = std::vector<PhysicalTokenBlock>;
+  using RequestGuid = BatchConfig::RequestGuid;
+  PageManager(int block_size, int num_total_blocks);
+  int allocate_one_block(RequestGuid const &request_guid);
+  void free_request(RequestGuid const &request_guid);
+  // used for the case that we want to free the last num_blocks that stores spec
+  // tokens(which are the tokens are not yet committed)
+  void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks);
+  std::vector<int>
+      get_block_table_indices(RequestGuid const &request_guid) const;
+
+  void free_block_table(BlockTable &block_table);
 
-    int get_num_total_free_blocks() const;
-    int get_num_allocated_blocks(const RequestGuid& request_guid) const;
+private:
+  int num_transformer_layers;
+  int total_kv_cache_size;
+  int block_size;       // the size of the block
+  int num_total_blocks; // the total number of blocks
+  BlockAllocator block_allocator;
+  std::unordered_map<RequestGuid, BlockTable> block_tables;
+
+  int get_num_total_free_blocks() const;
+  int get_num_allocated_blocks(RequestGuid const &request_guid) const;
 };
 
 }; // namespace FlexFlow
\ No newline at end of file
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 8403fb889..f7fe3c872 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -18,8 +18,8 @@
 #include "flexflow/batch_config.h"
 #include "flexflow/inference.h"
 #include "flexflow/model.h"
-#include "flexflow/utils/file_loader.h"
 #include "flexflow/page_manager.h"
+#include "flexflow/utils/file_loader.h"
 #include <condition_variable>
 #include <future>
 #include <mutex>
@@ -149,7 +149,8 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  //page attention, page_last_committed should be -1 because there are no blocks at the beginning
+  // page attention, page_last_committed should be -1 because there are no
+  // blocks at the beginning
   int page_last_committed = -1;
   std::vector<LogicalTokenBlock> blocks;
 
@@ -539,8 +540,7 @@ class RequestManager {
   int get_len_last_block(Request &request) const;
   int get_idx_last_logical_token(Request &request) const;
   int idx_logical_to_physical(Request &request, int idx_logical);
-  void _append_block_to_request(
-    Request &request, bool is_commit);
+  void _append_block_to_request(Request &request, bool is_commit);
   int append_token_to_block(Request &request, TokenId token, bool is_commit);
   void reset_block_table(Request &request);
   void print_num_tokens(Request &request);
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 1f7947a8c..5a18daab4 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -209,7 +209,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
   int max_output_length = 512;
-  int max_kv_cache_size = -1; //if -1, then use the default value
+  int max_kv_cache_size = -1; // if -1, then use the default value
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;
diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc
index 28bd7d574..9049b3885 100644
--- a/inference/models/falcon.cc
+++ b/inference/models/falcon.cc
@@ -63,6 +63,11 @@ void FALCON::create_falcon_model(FFModel &ff,
   Tensor mha = nullptr, mlp_output = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
 
+  ff.set_num_transformer_layers(falcon_config.n_layer);
+  ff.set_num_kv_heads(falcon_config.n_head_kv);
+  ff.set_qkv_dim(falcon_config.hidden_size / falcon_config.n_head * 2);
+  ff.set_size_dt(data_type_size(input->data_type));
+
   for (int i = 0; i < falcon_config.n_layer; i++) {
     // set transformer layer id
     ff.set_transformer_layer_id(i);
diff --git a/inference/models/falcon.h b/inference/models/falcon.h
index 393462633..a15c28991 100644
--- a/inference/models/falcon.h
+++ b/inference/models/falcon.h
@@ -16,6 +16,7 @@
 
 // #include "file_loader.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
 #include "flexflow/request_manager.h"
 #include <nlohmann/json.hpp>
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 988f8f4b5..92f1cdf76 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -64,6 +64,11 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor w2 = nullptr;
 
+  ff.set_num_transformer_layers(llama_config.num_hidden_layers);
+  ff.set_num_kv_heads(llama_config.num_key_value_heads);
+  ff.set_qkv_dim(llama_config.hidden_size / llama_config.num_attention_heads *
+                 2);
+  ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < llama_config.num_hidden_layers; i++) {
     // set transformer layer id
     ff.set_transformer_layer_id(i);
diff --git a/inference/models/llama.h b/inference/models/llama.h
index 3f11ca96d..cd6f9c5cc 100644
--- a/inference/models/llama.h
+++ b/inference/models/llama.h
@@ -16,6 +16,7 @@
 
 // #include "file_loader.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
 #include "flexflow/request_manager.h"
 #include <nlohmann/json.hpp>
diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc
index a7bf79f6b..b95cb5c91 100644
--- a/inference/models/mpt.cc
+++ b/inference/models/mpt.cc
@@ -64,6 +64,10 @@ void MPT::create_mpt_model(FFModel &ff,
   Tensor intermediate_output = nullptr, layernorm_output = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
 
+  ff.set_num_transformer_layers(mpt_config.n_layers);
+  ff.set_num_kv_heads(mpt_config.n_heads);
+  ff.set_qkv_dim(mpt_config.hidden_size / mpt_config.n_heads * 2);
+  ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < mpt_config.n_layers; i++) {
     ff.set_transformer_layer_id(i);
 
diff --git a/inference/models/mpt.h b/inference/models/mpt.h
index bd7a9410f..8466ea1cb 100644
--- a/inference/models/mpt.h
+++ b/inference/models/mpt.h
@@ -16,6 +16,7 @@
 
 // #include "file_loader.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
 #include "flexflow/request_manager.h"
 #include <nlohmann/json.hpp>
diff --git a/inference/models/opt.cc b/inference/models/opt.cc
index 25f9833a1..352809ede 100644
--- a/inference/models/opt.cc
+++ b/inference/models/opt.cc
@@ -77,6 +77,10 @@ void OPT::create_opt_model(FFModel &ff,
   Tensor fc2 = nullptr, added = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
 
+  ff.set_num_transformer_layers(opt_config.num_hidden_layers);
+  ff.set_num_kv_heads(opt_config.num_attention_heads);
+  ff.set_qkv_dim(opt_config.hidden_size / opt_config.num_attention_heads * 2);
+  ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < opt_config.num_hidden_layers; i++) {
     // set transformer layer id
     ff.set_transformer_layer_id(i);
diff --git a/inference/models/opt.h b/inference/models/opt.h
index 90443e872..23ba8888b 100644
--- a/inference/models/opt.h
+++ b/inference/models/opt.h
@@ -16,6 +16,7 @@
 
 // #include "file_loader.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
 #include "flexflow/request_manager.h"
 #include <nlohmann/json.hpp>
diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc
index 31505b0ba..401a754d0 100644
--- a/inference/models/starcoder.cc
+++ b/inference/models/starcoder.cc
@@ -82,6 +82,11 @@ void STARCODER::create_starcoder_model(
   Tensor residual = nullptr, c_proj = nullptr;
   Tensor res_ln_outputs[2] = {nullptr, nullptr};
 
+  ff.set_num_transformer_layers(startcoder_config.num_hidden_layers);
+  ff.set_num_kv_heads(startcoder_config.num_attention_heads);
+  ff.set_qkv_dim(startcoder_config.hidden_size /
+                 startcoder_config.num_attention_heads * 2);
+  ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < startcoder_config.num_hidden_layers; i++) {
     // set transformer layer id
     ff.set_transformer_layer_id(i);
diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h
index 7241acde3..57e1229f1 100644
--- a/inference/models/starcoder.h
+++ b/inference/models/starcoder.h
@@ -16,6 +16,7 @@
 
 // #include "file_loader.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
 #include "flexflow/request_manager.h"
 #include <nlohmann/json.hpp>
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 5ec718586..d1e916461 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -48,19 +48,19 @@ using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
 template <typename DT>
-__global__ void
-    update_qkv_in_batch_verify_kernel(DT *qkv_proj_array,
-                               half *qTmp_ptr,
-                               half *kvCache_ptr,
-                               int32_t *kv_indptr,
-                               int32_t *kv_page_indices,
-                               bool const *request_available,
-                               BatchConfig::PerTokenInfo const *tokenInfos,
-                               int const max_num_pages,
-                               int num_q_heads,
-                               int num_kv_heads,
-                               int head_dim,
-                               int num_new_tokens) {
+__global__ void update_qkv_in_batch_verify_kernel(
+    DT *qkv_proj_array,
+    half *qTmp_ptr,
+    half *kvCache_ptr,
+    int32_t *kv_indptr,
+    int32_t *kv_page_indices,
+    bool const *request_available,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    int const max_num_pages,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int num_new_tokens) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
@@ -68,7 +68,6 @@ __global__ void
   int const token_idx = thread_idx / q_hidden_size;
   int const offset = thread_idx % q_hidden_size;
 
-
   if (token_idx >= num_new_tokens) {
     return;
   }
@@ -76,7 +75,6 @@ __global__ void
   int const req_idx = tokenInfos[token_idx].request_index;
   int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-
   // calculate the compact request index in the easiest way
   // TODO: recheck
   int req_idx_compact = -1;
@@ -98,13 +96,12 @@ __global__ void
     int start = kv_indptr[req_idx_compact];
     int end = kv_indptr[req_idx_compact + 1] - 1;
     assert(start <= end && "Invalid kv_indptr");
-    assert(start + (token_abs_idx / kPagesize) <= end &&
-           "Invalid page index");
+    assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index");
     int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
     size_t to_k_idx = get_k_entry_offset_verify(
-           token_abs_idx, page_idx, num_kv_heads, head_dim),
+               token_abs_idx, page_idx, num_kv_heads, head_dim),
            to_v_idx = get_v_entry_offset_verify(
-           token_abs_idx, page_idx, num_kv_heads, head_dim);
+               token_abs_idx, page_idx, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
@@ -119,8 +116,8 @@ __global__ void
 
 template <typename DT>
 void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream) {
+                                BatchConfig const *bc,
+                                cudaStream_t stream) {
   // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
@@ -131,20 +128,21 @@ void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
   update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
-                               min(CUDA_NUM_THREADS, parallelism),
-                               0,
-                               stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                         static_cast<half *>(m->queryTmp),
-                                         static_cast<half *>(m->kvCache),
-                                         m->handle.tree_verify_attention_metadata->kv_indptr,
-                                         m->handle.tree_verify_attention_metadata->kv_indices,
-                                         m->request_available,
-                                         m->token_infos,
-                                         max_num_pages,
-                                         m->num_q_heads,
-                                         m->num_kv_heads,
-                                         m->qk_dim,
-                                         num_new_tokens);
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->kvCache),
+      m->handle.tree_verify_attention_metadata->kv_indptr,
+      m->handle.tree_verify_attention_metadata->kv_indices,
+      m->request_available,
+      m->token_infos,
+      max_num_pages,
+      m->num_q_heads,
+      m->num_kv_heads,
+      m->qk_dim,
+      num_new_tokens);
   // cudaStreamSynchronize(stream);
   // printf("exited update_qkv_in_batch_verify\n");
 }
@@ -187,16 +185,29 @@ __global__ void commit_tokens_kernel(
       // int const req_id = committedTokenInfos[i].request_index;
       // int const tok_id = committedTokenInfos[i].token_depth;
       int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
-      int const page_from_idx = committedTokenInfos[i].index_in_kv_cache / kPagesize;
+      int const page_from_idx =
+          committedTokenInfos[i].index_in_kv_cache / kPagesize;
 
       size_t from_k_idx = get_k_entry_offset_verify(
-                  committedTokenInfos[i].index_in_kv_cache, page_from_idx, num_kv_heads, head_dim),
+                 committedTokenInfos[i].index_in_kv_cache,
+                 page_from_idx,
+                 num_kv_heads,
+                 head_dim),
              from_v_idx = get_v_entry_offset_verify(
-                  committedTokenInfos[i].index_in_kv_cache, page_from_idx, num_kv_heads, head_dim);
-      size_t to_k_idx = get_k_entry_offset_verify(
-                 committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim),
-             to_v_idx = get_v_entry_offset_verify(
-                 committedTokenInfos[i].token_depth, page_to_idx, num_kv_heads, head_dim);
+                 committedTokenInfos[i].index_in_kv_cache,
+                 page_from_idx,
+                 num_kv_heads,
+                 head_dim);
+      size_t to_k_idx =
+                 get_k_entry_offset_verify(committedTokenInfos[i].token_depth,
+                                           page_to_idx,
+                                           num_kv_heads,
+                                           head_dim),
+             to_v_idx =
+                 get_v_entry_offset_verify(committedTokenInfos[i].token_depth,
+                                           page_to_idx,
+                                           num_kv_heads,
+                                           head_dim);
 
       kCache_ptr[to_k_idx + offset] = kCache_ptr[from_k_idx + offset];
       kCache_ptr[to_v_idx + offset] = kCache_ptr[from_v_idx + offset];
@@ -220,16 +231,17 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
-                         stream>>>(static_cast<half *>(m->kvCache),
-                                   m->handle.tree_verify_attention_metadata->kv_indptr,
-                                   m->handle.tree_verify_attention_metadata->kv_indices,
-                                   m->committed_token_infos,
-                                   m->request_available,
-                                   num_requests,
-                                   m->num_kv_heads,
-                                   m->qk_dim,
-                                   m->num_tokens_to_commit,
-                                   max_num_pages);
+                         stream>>>(
+      static_cast<half *>(m->kvCache),
+      m->handle.tree_verify_attention_metadata->kv_indptr,
+      m->handle.tree_verify_attention_metadata->kv_indices,
+      m->committed_token_infos,
+      m->request_available,
+      num_requests,
+      m->num_kv_heads,
+      m->qk_dim,
+      m->num_tokens_to_commit,
+      max_num_pages);
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
   //   float elapsed = 0;
@@ -611,7 +623,8 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   // }
   // cudaError_t err = cudaDeviceSynchronize();
   // if (err != cudaSuccess) {
-  //     std::cerr << "Kernel launch failed with error: " << cudaGetErrorString(err) << std::endl;
+  //     std::cerr << "Kernel launch failed with error: " <<
+  //     cudaGetErrorString(err) << std::endl;
   // }
 }
 
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index ca5d08e98..426f848d9 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -311,7 +311,7 @@ void StreamingCacheInfo::reset_cache() {
   total_len = 0;
 }
 
-//page attention: TODO: I think we just need to change the index
+// page attention: TODO: I think we just need to change the index
 
 int StreamingCacheInfo::global_2_cache_index(int global_index) {
   if (global_index < sink_cache_size) {
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index dd13bb2e0..31c2a51cd 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -493,6 +493,22 @@ void FFModel::set_transformer_layer_id(int id) {
   assert(id < MAX_NUM_TRANSFORMER_LAYERS);
 }
 
+void FFModel::set_num_transformer_layers(int num_layers) {
+  num_transformer_layers = num_layers;
+}
+
+void FFModel::set_num_kv_heads(int num_heads) {
+  num_kv_heads = num_heads;
+}
+
+void FFModel::set_qkv_dim(int qkv_dim) {
+  qkv_dim = qkv_dim;
+}
+
+void FFModel::set_size_dt(int size_dt) {
+  size_dt = size_dt;
+}
+
 void FFModel::set_position_offset(int offset) {
   assert(offset == 0 || offset == 2);
   position_offset = offset;
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index f59837f30..c88c4f6ce 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -17,202 +17,234 @@
 
 namespace FlexFlow {
 
-// For all runtime functions, they share a single page manager for pages information
+// For all runtime functions, they share a single page manager for pages
+// information
 PageManager *page_manager_singleton = nullptr;
 
 // the interface of logicaltokenblock
 LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size)
-    : block_number(block_number), block_size(block_size), num_tokens(0), num_commit_tokens(0), num_spec_tokens(0) {
-    }
+    : block_number(block_number), block_size(block_size), num_tokens(0),
+      num_commit_tokens(0), num_spec_tokens(0) {}
 
 bool LogicalTokenBlock::is_empty() const {
-    assert(num_spec_tokens == 0 && num_commit_tokens == 0);
-    assert(num_tokens <= block_size);
-    return num_tokens == 0;
+  assert(num_spec_tokens == 0 && num_commit_tokens == 0);
+  assert(num_tokens <= block_size);
+  return num_tokens == 0;
 }
 
 bool LogicalTokenBlock::is_full() const {
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
-    return num_tokens == block_size;
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return num_tokens == block_size;
 }
 
 int LogicalTokenBlock::get_num_empty_slots() const {
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
-    return block_size - num_tokens;
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return block_size - num_tokens;
 }
 
 int LogicalTokenBlock::get_num_alloc_slots() const {
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
-    return num_tokens;
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return num_tokens;
 }
 
-void LogicalTokenBlock::reset_num_spec_tokens(){
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
+void LogicalTokenBlock::reset_num_spec_tokens() {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
 
-    num_tokens -= num_spec_tokens;
-    num_spec_tokens = 0;
+  num_tokens -= num_spec_tokens;
+  num_spec_tokens = 0;
 
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
 }
 
-void LogicalTokenBlock::append_tokens(const std::vector<TokenId>& token_ids_to_append, bool committed) {
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
-    if (num_tokens + token_ids_to_append.size() > block_size) {
-        printf("block is full! Cannot append more tokens\n");
-        throw std::runtime_error("Block is full! Cannot append more tokens.");
-    }
-    token_ids.insert(token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end());
-    num_tokens += token_ids_to_append.size();
-    if (committed) {
-        num_commit_tokens += token_ids_to_append.size();
-    }else{
-        num_spec_tokens += token_ids_to_append.size();
-    }
-    assert(num_spec_tokens + num_commit_tokens == num_tokens);
-    assert(num_tokens <= block_size);
+void LogicalTokenBlock::append_tokens(
+    std::vector<TokenId> const &token_ids_to_append, bool committed) {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  if (num_tokens + token_ids_to_append.size() > block_size) {
+    printf("block is full! Cannot append more tokens\n");
+    throw std::runtime_error("Block is full! Cannot append more tokens.");
+  }
+  token_ids.insert(
+      token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end());
+  num_tokens += token_ids_to_append.size();
+  if (committed) {
+    num_commit_tokens += token_ids_to_append.size();
+  } else {
+    num_spec_tokens += token_ids_to_append.size();
+  }
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
 }
 
 std::vector<TokenId> LogicalTokenBlock::get_token_ids() const {
-    return token_ids;
+  return token_ids;
 }
 
 PhysicalTokenBlock::PhysicalTokenBlock(int block_number, int block_size)
     : block_number(block_number), block_size(block_size), ref_count(0) {}
 
 BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) {
-    for (int block_number = 0; block_number < num_total_blocks; ++block_number) {
-        free_blocks.push_back(PhysicalTokenBlock(block_number, block_size));
-    }
-    num_total_blocks = num_total_blocks;
+  for (int block_number = 0; block_number < num_total_blocks; ++block_number) {
+    free_blocks.push_back(PhysicalTokenBlock(block_number, block_size));
+  }
+  num_total_blocks = num_total_blocks;
 }
 
 // Allocate a block
 PhysicalTokenBlock BlockAllocator::allocate() {
-    if (free_blocks.empty()) {
-        printf("no free blocks are available\n");
-        throw std::runtime_error("Out of memory! No free blocks are available.");
-    }
-    PhysicalTokenBlock block = free_blocks.front();
-    free_blocks.pop_front();
-    block.incr_ref_count();
-    return block;
+  if (free_blocks.empty()) {
+    printf("no free blocks are available\n");
+    throw std::runtime_error("Out of memory! No free blocks are available.");
+  }
+  PhysicalTokenBlock block = free_blocks.front();
+  free_blocks.pop_front();
+  block.incr_ref_count();
+  return block;
 }
 
 // Free a block
-void BlockAllocator::free(PhysicalTokenBlock& block) {
-    if (block.ref_count == 0) {
-        printf("block is already freed\n");
-        throw std::runtime_error("Double free! Block is already freed.");
-    }
-    block.decr_ref_count();
-    if (block.ref_count == 0) {
-        // printf("put block number: %d back to free_blocks\n", block.get_block_number());
-        free_blocks.push_back(block);
-    }else{
-        // in current implementation this should not be the case
-        printf("block is not freed. Ref count: %d\n", block.ref_count);
-        throw std::runtime_error("Block is not freed. Ref count: " + std::to_string(block.ref_count));
-    }
+void BlockAllocator::free(PhysicalTokenBlock &block) {
+  if (block.ref_count == 0) {
+    printf("block is already freed\n");
+    throw std::runtime_error("Double free! Block is already freed.");
+  }
+  block.decr_ref_count();
+  if (block.ref_count == 0) {
+    // printf("put block number: %d back to free_blocks\n",
+    // block.get_block_number());
+    free_blocks.push_back(block);
+  } else {
+    // in current implementation this should not be the case
+    printf("block is not freed. Ref count: %d\n", block.ref_count);
+    throw std::runtime_error("Block is not freed. Ref count: " +
+                             std::to_string(block.ref_count));
+  }
 }
 
 int BlockAllocator::get_num_free_blocks() const {
-    return free_blocks.size();
+  return free_blocks.size();
 }
 
 PageManager::PageManager(int block_size, int num_total_blocks)
     : block_size(block_size), num_total_blocks(num_total_blocks),
-      block_allocator(block_size, num_total_blocks) {
-      }
+      block_allocator(block_size, num_total_blocks) {}
 
-//return the physical number of this block
-int PageManager::allocate_one_block(const RequestGuid& request_guid) {
-    BlockTable& block_table = block_tables[request_guid];
+// return the physical number of this block
+int PageManager::allocate_one_block(RequestGuid const &request_guid) {
+  BlockTable &block_table = block_tables[request_guid];
 
-    PhysicalTokenBlock block = block_allocator.allocate();
-    block_table.push_back(block);
-    block_tables[request_guid] = block_table;
-    return block.get_block_number();
+  PhysicalTokenBlock block = block_allocator.allocate();
+  block_table.push_back(block);
+  block_tables[request_guid] = block_table;
+  return block.get_block_number();
 }
 
-void PageManager::free_block_table(BlockTable& block_table) {
-    // make it reverse order to free the last allocated block first
-    BlockTable::reverse_iterator rit = block_table.rbegin();
-    for (; rit != block_table.rend(); ++rit) {
-        block_allocator.free(*rit);
-    }
-    return;
+void PageManager::free_block_table(BlockTable &block_table) {
+  // make it reverse order to free the last allocated block first
+  BlockTable::reverse_iterator rit = block_table.rbegin();
+  for (; rit != block_table.rend(); ++rit) {
+    block_allocator.free(*rit);
+  }
+  return;
 }
 
-void PageManager::free_request(const RequestGuid& request_guid) {
-    //we only free the blocks that are already used
-    assert(block_tables.find(request_guid) != block_tables.end());
-    BlockTable block_table = block_tables[request_guid];
-    free_block_table(block_table);
-    block_tables.erase(request_guid);
-    return;
+void PageManager::free_request(RequestGuid const &request_guid) {
+  // we only free the blocks that are already used
+  assert(block_tables.find(request_guid) != block_tables.end());
+  BlockTable block_table = block_tables[request_guid];
+  free_block_table(block_table);
+  block_tables.erase(request_guid);
+  return;
 }
 
 // delete the last num_blocks in the request_guid
-void PageManager::free_multiple_blocks(const RequestGuid& request_guid, int num_blocks) {
-    assert(block_tables.find(request_guid) != block_tables.end());
-    auto& block_table = block_tables[request_guid];
-    assert(num_blocks <= block_table.size());
-    int num_blocks_allocated = block_table.size();
-    for (int i = 0; i < num_blocks; i++) {
-        block_allocator.free(block_table[num_blocks_allocated - i - 1]);
-    }
-    // only keep the first num_blocks_allocated - num_blocks blocks
-    block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks, block_table.end());
-    block_tables[request_guid] = block_table;
-    return;
+void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
+                                       int num_blocks) {
+  assert(block_tables.find(request_guid) != block_tables.end());
+  auto &block_table = block_tables[request_guid];
+  assert(num_blocks <= block_table.size());
+  int num_blocks_allocated = block_table.size();
+  for (int i = 0; i < num_blocks; i++) {
+    block_allocator.free(block_table[num_blocks_allocated - i - 1]);
+  }
+  // only keep the first num_blocks_allocated - num_blocks blocks
+  block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks,
+                    block_table.end());
+  block_tables[request_guid] = block_table;
+  return;
 }
 
-// int PageManager::get_index_last_block(const RequestGuid& request_guid) const {
+// int PageManager::get_index_last_block(const RequestGuid& request_guid) const
+// {
 //     const auto& block_table = block_tables.at(request_guid);
 //     return block_table.back.get_block_number();
 // }
 
-std::vector<int> PageManager::get_block_table_indices(const RequestGuid& request_guid) const {
-    std::vector<int> indices;
-    const auto& it = block_tables.find(request_guid);
-    if (it == block_tables.end()) {
-        return indices;
-    }
-    const auto& block_table = it->second;
-    for (const auto& block : block_table) {
-        indices.push_back(block.get_block_number());
-    }
+std::vector<int> PageManager::get_block_table_indices(
+    RequestGuid const &request_guid) const {
+  std::vector<int> indices;
+  auto const &it = block_tables.find(request_guid);
+  if (it == block_tables.end()) {
     return indices;
+  }
+  auto const &block_table = it->second;
+  for (auto const &block : block_table) {
+    indices.push_back(block.get_block_number());
+  }
+  return indices;
 }
 
 int PageManager::get_num_total_free_blocks() const {
-    return block_allocator.get_num_free_blocks();
+  return block_allocator.get_num_free_blocks();
+}
+
+int PageManager::get_num_allocated_blocks(
+    RequestGuid const &request_guid) const {
+  auto it = block_tables.find(request_guid);
+  if (it == block_tables.end()) {
+    return 0;
+  } else {
+    return it->second.size();
+  }
 }
 
-int PageManager::get_num_allocated_blocks(const RequestGuid& request_guid) const {
-    auto it = block_tables.find(request_guid);
-    if (it == block_tables.end()) {
-        return 0;
-    }else{
-        return it->second.size();
+PageManager *PageManager::get_page_manager(FFModel *ff,
+                                           int total_kv_cache_size) {
+  int num_kv_heads = ff->num_kv_heads;
+  int size_dt = ff->size_dt;
+  int qkv_dim = ff->qkv_dim;
+  int num_transformer_layers = ff->num_transformer_layers;
+  int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
+  if (page_manager_singleton == nullptr) {
+    int num_total_blocks = 0;
+    if (total_kv_cache_size == -1) {
+      num_total_blocks = (BatchConfig::max_spec_tree_token_num() +
+                          BatchConfig::max_sequence_length() + kPagesize - 1) /
+                         kPagesize * BatchConfig::max_requests_per_batch();
+    } else {
+      num_total_blocks =
+          total_kv_cache_size * 1024 * 1024 / size_dt / qkv_dim / num_kv_heads /
+          (num_transformer_layers / pipeline_parallelism_degree) / 2;
     }
+    page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
+  }
+  return page_manager_singleton;
 }
 
 PageManager *PageManager::get_page_manager() {
   if (page_manager_singleton == nullptr) {
-    int num_total_blocks = (BatchConfig::max_spec_tree_token_num() +
-        BatchConfig::max_sequence_length() + kPagesize - 1) /
+    int num_total_blocks =
+        (BatchConfig::max_spec_tree_token_num() +
+         BatchConfig::max_sequence_length() + kPagesize - 1) /
         kPagesize * BatchConfig::max_requests_per_batch();
     page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
   }
   return page_manager_singleton;
 }
 
-
-}; //FlexFlow
\ No newline at end of file
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 83fdc5f1f..301d41a2a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -20,6 +20,9 @@
 #include <bitset>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <execinfo.h>
 #include <filesystem>
 #include <future>
 #include <iomanip>
@@ -30,9 +33,6 @@
 #include <thread>
 #include <unordered_map>
 #include <vector>
-#include <exception>
-#include <cstdlib>
-#include <execinfo.h> 
 
 namespace FlexFlow {
 
@@ -42,9 +42,10 @@ using tokenizers::Tokenizer;
 Legion::Logger log_req_mgr("RequestManager");
 
 void printStackTrace() {
-    void *array[10];
-    size_t size = backtrace(array, 10);   // Get stack frames
-    backtrace_symbols_fd(array, size, STDERR_FILENO);  // Print stack trace to stderr
+  void *array[10];
+  size_t size = backtrace(array, 10); // Get stack frames
+  backtrace_symbols_fd(
+      array, size, STDERR_FILENO); // Print stack trace to stderr
 }
 
 bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
@@ -220,7 +221,6 @@ int RequestManager::get_max_kv_cache_size() {
   return max_kv_cache_size;
 }
 
-
 void RequestManager::set_decoding_mode(DecodingMode mode) {
   assert(mode == INCREMENTAL_DECODING || mode == SPECULATIVE_DECODING);
   decoding_mode = mode;
@@ -1175,19 +1175,22 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
           request->tokens[request->llm_prefill_len + idx];
 
       assert(request->llm_prefill_len + idx < request->tokens.size());
-      append_token_to_block(*request, request->tokens[request->llm_prefill_len + idx], true);
+      append_token_to_block(
+          *request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
-    //update related page info in batch config
-    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(*request);
+    // update related page info in batch config
+    bc.requestsInfo[request_index].num_kv_pages =
+        get_num_blocks_allocated(*request);
     if (bc.requestsInfo[request_index].num_kv_pages == 0) {
       // turn this request into not available for one round
       bc.request_available[request_index] = false;
     }
-    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(*request);
+    bc.requestsInfo[request_index].kv_last_page_len =
+        get_len_last_block(*request);
     bc.requestsInfo[request_index].request_guid = request->guid;
   }
   bc.num_tokens = num_tokens;
@@ -1577,11 +1580,12 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
-    //before commit token, reset the pages assigned by cleaning all the tokens
-    std::vector<int> block_table_before_commit = page_manager->get_block_table_indices(guid);
+    // before commit token, reset the pages assigned by cleaning all the tokens
+    std::vector<int> block_table_before_commit =
+        page_manager->get_block_table_indices(guid);
     // also need to reset the pages
     reset_block_table(request);
-    
+
     int token_offset = request.first_token_offset_in_batch;
 
     // 1. Maintain requestsInfo
@@ -1602,21 +1606,25 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
          committed_token_index++) {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
-      
-      int idx_to_physical = append_token_to_block(request, committed_token.token_id, true);
+
+      int idx_to_physical =
+          append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index;
       assert(idx_from_logical >= 0);
       assert(idx_from_logical / kPagesize < block_table_before_commit.size());
-      int idx_from_physical = block_table_before_commit[idx_from_logical / kPagesize] * kPagesize + committed_token.from_index % kPagesize;
-
+      int idx_from_physical =
+          block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
+          committed_token.from_index % kPagesize;
 
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache = idx_from_physical;
-      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth = idx_to_physical;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
+          idx_from_physical;
+      new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
+          idx_to_physical;
       new_bc.num_tokens_to_commit++;
       // also append the token to the block
-      }
+    }
 
     // Load the tokens on the token tree that are not yet pruned to
     // BatchConfig.tokensInfo.
@@ -1654,9 +1662,11 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
 
     // page attention information
-    new_bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
+    new_bc.requestsInfo[request_index].num_kv_pages =
+        get_num_blocks_allocated(request);
     assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
-    new_bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    new_bc.requestsInfo[request_index].kv_last_page_len =
+        get_len_last_block(request);
     assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
     new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
@@ -1978,7 +1988,9 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 /* --------- Page Attention Related Functions --------- */
 int RequestManager::get_num_blocks_allocated(Request &request) const {
   // needs some assertion
-  assert(request.blocks.size() == PageManager::get_page_manager()->get_block_table_indices(request.guid).size());
+  assert(request.blocks.size() == PageManager::get_page_manager()
+                                      ->get_block_table_indices(request.guid)
+                                      .size());
   return request.blocks.size();
 }
 
@@ -1995,57 +2007,68 @@ int RequestManager::get_idx_last_logical_token(Request &request) const {
   if (request.blocks.empty()) {
     printf("Error: request.blocks is empty\n");
     return -1;
-  }else{
-    return (request.blocks.size() - 1) * kPagesize + request.blocks.back().get_num_tokens() - 1;
+  } else {
+    return (request.blocks.size() - 1) * kPagesize +
+           request.blocks.back().get_num_tokens() - 1;
   }
 }
 
 int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   // get physical indices
   PageManager *page_manager = PageManager::get_page_manager();
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
   if (request.blocks.size() != block_table_indices.size()) {
     assert(request.blocks.size() == block_table_indices.size());
   }
-  return block_table_indices[idx_logical / kPagesize] * kPagesize + idx_logical % kPagesize;
+  return block_table_indices[idx_logical / kPagesize] * kPagesize +
+         idx_logical % kPagesize;
 }
 
 // this will allocate one logical block and one physical block to the request
-void RequestManager::_append_block_to_request(
-    Request &request, bool is_commit) {
+void RequestManager::_append_block_to_request(Request &request,
+                                              bool is_commit) {
   PageManager *page_manager = PageManager::get_page_manager();
   assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
+  assert(request.blocks.size() ==
+         page_manager->get_block_table_indices(request.guid).size());
   // Append the logical block to the request
-  // page attention: in this function we need to remember the last logical block number that still contains committed tokens
-  LogicalTokenBlock block(request.blocks.size(),
-                                  kPagesize);
+  // page attention: in this function we need to remember the last logical block
+  // number that still contains committed tokens
+  LogicalTokenBlock block(request.blocks.size(), kPagesize);
   request.blocks.push_back(block);
   page_manager->allocate_one_block(request.guid);
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
-  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
+  assert(request.blocks.size() ==
+         page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
     request.page_last_committed++;
     int size_blocks = request.blocks.size();
-    assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+    assert(request.page_last_committed <
+           static_cast<int>(request.blocks.size()));
   }
 }
 
-//this function is used for appending a token to the last logical block and also the last physical block
-//it will return the physical position of this token
-int RequestManager::append_token_to_block(Request &request, TokenId token, bool is_commit) {
+// this function is used for appending a token to the last logical block and
+// also the last physical block it will return the physical position of this
+// token
+int RequestManager::append_token_to_block(Request &request,
+                                          TokenId token,
+                                          bool is_commit) {
   PageManager *page_manager = PageManager::get_page_manager();
-  if (request.blocks.empty() ||
-      request.blocks.back().is_full()) {
+  if (request.blocks.empty() || request.blocks.back().is_full()) {
     // Append a new logical block
     _append_block_to_request(request, is_commit);
-    assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
+    assert(request.blocks.size() ==
+           page_manager->get_block_table_indices(request.guid).size());
     // also allocate one physical page
   }
   // insert token to both logical block and physical block
   request.blocks.back().append_tokens({token}, is_commit);
-  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
+  assert(request.blocks.size() ==
+         page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
   assert(idx_logical >= 0);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
@@ -2053,36 +2076,47 @@ int RequestManager::append_token_to_block(Request &request, TokenId token, bool
   return idx_physical;
 }
 
-void RequestManager::reset_block_table(Request &request){
+void RequestManager::reset_block_table(Request &request) {
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
   assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  assert(request.blocks.size() ==
+         page_manager->get_block_table_indices(request.guid).size());
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
   assert(block_table_indices.size() > request.page_last_committed);
-  page_manager->free_multiple_blocks(request.guid, block_table_indices.size() - request.page_last_committed - 1);
+  page_manager->free_multiple_blocks(request.guid,
+                                     block_table_indices.size() -
+                                         request.page_last_committed - 1);
   // reset this request's logical block table
   if (request.page_last_committed < static_cast<int>(request.blocks.size())) {
-    request.blocks.erase(request.blocks.begin() + request.page_last_committed + 1, request.blocks.end());
+    request.blocks.erase(request.blocks.begin() + request.page_last_committed +
+                             1,
+                         request.blocks.end());
   }
   request.blocks.back().reset_num_spec_tokens();
   // the indices of block table should be the same as the number of blocks
-  std::vector<int> block_table = page_manager->get_block_table_indices(request.guid);
+  std::vector<int> block_table =
+      page_manager->get_block_table_indices(request.guid);
 
-  assert(request.blocks.size() == page_manager->get_block_table_indices(request.guid).size());
+  assert(request.blocks.size() ==
+         page_manager->get_block_table_indices(request.guid).size());
   return;
 }
 
 // debug function
 void RequestManager::print_num_tokens(Request &request) {
   PageManager *page_manager = PageManager::get_page_manager();
-  std::vector<int> block_table_indices = page_manager->get_block_table_indices(request.guid);
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
   printf("number of blocks: %d", request.blocks.size());
   printf(" number of pages allocated: %d", block_table_indices.size());
   printf(" last page length: %d", request.blocks.back().get_num_tokens());
-  printf(" last page spec tokens: %d", request.blocks.back().get_num_spec_tokens());
-  printf(" last page commit tokens: %d\n", request.blocks.back().get_num_commit_tokens());
+  printf(" last page spec tokens: %d",
+         request.blocks.back().get_num_spec_tokens());
+  printf(" last page commit tokens: %d\n",
+         request.blocks.back().get_num_commit_tokens());
 }
 
 /* --------- Bitmask Related Functions --------- */
@@ -2490,6 +2524,8 @@ void RequestManager::background_serving_task(
     Runtime *runtime) {
   RequestManager *rm = RequestManager::get_request_manager();
   FFModel *llm = *(FFModel **)task->args;
+  printf("start background serving task and llm has %d num_transfor_layers\n",
+         llm->num_transformer_layers);
   {
     // Update FFModel's lg_hlr and lg_ctx to the current
     // task's runtime and ctx, since all future legion tasks are
@@ -2504,6 +2540,8 @@ void RequestManager::background_serving_task(
       ssm->config.lg_ctx = ctx;
     }
   }
+  // page attention: initalize the page manager here
+  PageManager::get_page_manager(llm, rm->get_max_kv_cache_size());
   if (rm->decoding_mode == INCREMENTAL_DECODING) {
     // No SSMs: perform incremental decoding
     rm->serve_decoding(llm);
@@ -2759,9 +2797,6 @@ void RequestManager::terminate_background_server() {
       }
     }
 
-
-
-
     std::string latency_per_request_ms = "\n latency_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double latency_ms = (profiling_info.second.finish_time -
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index db2718b9b..b762fedd2 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -94,29 +94,37 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
   qk_indptr_h[0] = 0;
   int q_lens = 0, qk_lens = 0;
   int indices_offset = 0, indices_lens = 0;
-  for (int req_idx = 0, indptr_idx = 0; req_idx < batch_config->max_requests_per_batch(); req_idx++) {
+  for (int req_idx = 0, indptr_idx = 0;
+       req_idx < batch_config->max_requests_per_batch();
+       req_idx++) {
     if (batch_config->request_available[req_idx]) {
       int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-      int kv_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                  batch_config->requestsInfo[req_idx].first_token_index_in_request;
-      
+      int kv_len =
+          batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+          batch_config->requestsInfo[req_idx].first_token_index_in_request;
+
       q_lens += q_len;
       qk_lens += (q_len * kv_len + 7) / 8;
       indices_offset = indices_lens;
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
       q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-      kv_indptr_h[indptr_idx + 1] = batch_config->requestsInfo[req_idx].num_kv_pages + kv_indptr_h[indptr_idx];
+      kv_indptr_h[indptr_idx + 1] =
+          batch_config->requestsInfo[req_idx].num_kv_pages +
+          kv_indptr_h[indptr_idx];
       assert(kv_indptr_h[indptr_idx] >= 0);
 
-      assert(batch_config->requestsInfo[req_idx].num_kv_pages == (kv_len + kPagesize - 1) / kPagesize);
+      assert(batch_config->requestsInfo[req_idx].num_kv_pages ==
+             (kv_len + kPagesize - 1) / kPagesize);
       assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= kPagesize);
-      std::vector<int32_t> kv_indices = pm -> get_block_table_indices(batch_config->requestsInfo[req_idx].request_guid);
+      std::vector<int32_t> kv_indices = pm->get_block_table_indices(
+          batch_config->requestsInfo[req_idx].request_guid);
       assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
       for (int i = indices_offset; i < indices_lens; i++) {
         kv_indices_h[i] = kv_indices[i - indices_offset];
       }
       qk_indptr_h[indptr_idx + 1] = qk_lens;
-      kv_last_page_len_h[indptr_idx] = batch_config->requestsInfo[req_idx].kv_last_page_len;
+      kv_last_page_len_h[indptr_idx] =
+          batch_config->requestsInfo[req_idx].kv_last_page_len;
       indptr_idx++;
     }
   }
@@ -127,11 +135,12 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
                             sizeof(int32_t) * batch_size * max_num_pages,
                             cudaMemcpyHostToDevice,
                             stream));
-  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_last_page_len,
-                            kv_last_page_len_h,
-                            sizeof(int32_t) * batch_size,
-                            cudaMemcpyHostToDevice,
-                            stream));
+  checkCUDA(
+      cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_last_page_len,
+                      kv_last_page_len_h,
+                      sizeof(int32_t) * batch_size,
+                      cudaMemcpyHostToDevice,
+                      stream));
   checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->q_indptr,
                             q_indptr_h,
                             sizeof(int32_t) * (batch_size + 1),
@@ -675,8 +684,10 @@ void RequestManager::load_batch_config_task(
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
     PageManager *pm = PageManager::get_page_manager();
-    static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1], kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
-    static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS * BatchConfig::MAX_NUM_TOKENS];
+    static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+        kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+    static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS *
+                                BatchConfig::MAX_NUM_TOKENS];
     static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
 

From 6b74f93177fd56c047eff9ca8c60238f14ce8c29 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Tue, 5 Nov 2024 06:09:23 -0800
Subject: [PATCH 600/667] ckpt

---
 include/flexflow/model.h                |  4 ++-
 inference/models/llama.cc               |  7 +++--
 inference/spec_infer/spec_infer.cc      |  6 ++++
 src/ops/inc_multihead_self_attention.cu | 40 +++++++++++++++++++++++--
 src/runtime/inference_manager.cc        |  9 +++---
 src/runtime/page_manager.cc             | 22 +++++++++-----
 src/runtime/request_manager.cc          |  2 ++
 7 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index bff5d2802..8823a6f25 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -1147,7 +1147,9 @@ class FFModel {
   size_t current_transformer_layer_id;
   // positional embedding start offset
   int num_transformer_layers;
-  int num_kv_heads, qkv_dim, size_dt;
+  int num_kv_heads;
+  int qkv_dim;
+  int size_dt;
   int position_offset;
   FFConfig config;
   FFIterationConfig iter_config;
diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index 92f1cdf76..cd9fce238 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -64,10 +64,12 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor w2 = nullptr;
 
+  printf("we are in llama model\n");
   ff.set_num_transformer_layers(llama_config.num_hidden_layers);
   ff.set_num_kv_heads(llama_config.num_key_value_heads);
-  ff.set_qkv_dim(llama_config.hidden_size / llama_config.num_attention_heads *
-                 2);
+  int qkv_dim = llama_config.hidden_size / llama_config.num_attention_heads * 2;
+  printf("qkv_dim: %d\n", qkv_dim);
+  ff.set_qkv_dim(qkv_dim);
   ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < llama_config.num_hidden_layers; i++) {
     // set transformer layer id
@@ -294,6 +296,7 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
+  printf("llama qkv dim in the end: %d\n", ff.qkv_dim);
 }
 
 }; // namespace FlexFlow
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index dba03e3f8..596df29a1 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -88,6 +88,7 @@ void parse_input_args(char **argv,
                       bool &add_special_tokens) {
   for (int i = 1; i < argc; i++) {
     // llm model name
+    printf("argv[i]: %s\n", argv[i]);
     if (!strcmp(argv[i], "-llm-model")) {
       model_names.llm_model_name = std::string(argv[++i]);
       for (char &c : model_names.llm_model_name) {
@@ -241,6 +242,7 @@ void parse_input_args(char **argv,
   wordexp(paths.cache_folder_path.c_str(), &p, 0);
   paths.cache_folder_path = p.we_wordv[0];
   wordfree(&p);
+  printf("argv parsed end\n");
 }
 
 void get_model_meta(FilePaths &file_paths,
@@ -412,6 +414,7 @@ void FlexFlow::top_level_task(Task const *task,
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
   int argc = command_args.argc;
+  printf("begin parsing input args\n");
   parse_input_args(argv,
                    argc,
                    file_paths,
@@ -526,6 +529,9 @@ void FlexFlow::top_level_task(Task const *task,
     assert(false && "Invalid LLM model type passed (or no type was passed).");
   }
 
+  printf("after creating llm model we have tree model qkv_dim: %d\n",
+         tree_model.qkv_dim);
+
   // Create SSM models
   int num_ssms = model_metadata.ssm_model_types.size();
   std::vector<int> ssm_model_ids;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 7472b61f0..44bb694e5 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -494,10 +494,45 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t max_num_pages =
         round_up_pages(BatchConfig::max_sequence_length() +
                        BatchConfig::max_spec_tree_token_num());
+    int total_kv_cache_size = BatchConfig::max_kv_cache_size();
     switch (infer_mode) {
-      case INC_DECODING_MODE:
-      case TREE_SEARCH_MODE:
       case TREE_VERIFY_MODE: {
+        query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
+        // a K-ary tree max node is (k^n - 1) / 2
+        if (total_kv_cache_size == -1){
+          key_cache_size = num_kv_heads * qk_dim *
+                          BatchConfig::max_requests_per_batch() * max_num_pages *
+                          kPagesize;
+          value_cache_size = num_kv_heads * v_dim *
+                            BatchConfig::max_requests_per_batch() *
+                            max_num_pages * kPagesize;
+        }else{
+          key_cache_size = total_kv_cache_size / 2;
+          value_cache_size = total_kv_cache_size / 2;
+        }
+        // if (streaming_cache) {
+        //   size_t max_post_pos_enc_pages =
+        //       round_up_pages(BatchConfig::MAX_STREAMING_POS -
+        //                      BatchConfig::get_max_tree_depth() +
+        //                      max(max_tokens_per_batch,
+        //                          BatchConfig::max_spec_tree_token_num()));
+        //   key_cache_size = num_kv_heads * qk_dim *
+        //                    BatchConfig::max_requests_per_batch() *
+        //                    max_post_pos_enc_pages * kPagesize;
+        //   value_cache_size = num_kv_heads * v_dim *
+        //                      BatchConfig::max_requests_per_batch() *
+        //                      max_post_pos_enc_pages * kPagesize;
+        //   streaming_pre_pos_enc_size =
+        //       num_kv_heads * (qk_dim + v_dim) *
+        //       BatchConfig::max_requests_per_batch() *
+        //       round_up_pages(BatchConfig::MAX_STREAMING_POS -
+        //                      BatchConfig::get_max_tree_depth()) *
+        //       kPagesize;
+        // }
+        break;
+      }
+      case TREE_SEARCH_MODE:
+      case INC_DECODING_MODE:
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
@@ -526,7 +561,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
               kPagesize;
         }
         break;
-      }
       default:
         assert(false && "Unkown inference mode");
     }
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 31c2a51cd..21403c473 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -501,12 +501,13 @@ void FFModel::set_num_kv_heads(int num_heads) {
   num_kv_heads = num_heads;
 }
 
-void FFModel::set_qkv_dim(int qkv_dim) {
-  qkv_dim = qkv_dim;
+void FFModel::set_qkv_dim(int qkv) {
+  qkv_dim = qkv;
 }
 
-void FFModel::set_size_dt(int size_dt) {
-  size_dt = size_dt;
+void FFModel::set_size_dt(int dt) {
+  printf("Setting size_dt to %d\n", dt);
+  size_dt = dt;
 }
 
 void FFModel::set_position_offset(int offset) {
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index c88c4f6ce..3f22e0f4a 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -220,6 +220,13 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
   int qkv_dim = ff->qkv_dim;
   int num_transformer_layers = ff->num_transformer_layers;
   int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
+  printf("num_kv_heads: %d, size_dt: %d, qkv_dim: %d, num_transformer_layers: "
+         "%d, pipeline_parallelism_degree: %d\n",
+         num_kv_heads, size_dt, qkv_dim, num_transformer_layers,
+         pipeline_parallelism_degree);
+  assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
+         num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized
+  printf("page manager singleton is initialized\n");
   if (page_manager_singleton == nullptr) {
     int num_total_blocks = 0;
     if (total_kv_cache_size == -1) {
@@ -237,13 +244,14 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
 }
 
 PageManager *PageManager::get_page_manager() {
-  if (page_manager_singleton == nullptr) {
-    int num_total_blocks =
-        (BatchConfig::max_spec_tree_token_num() +
-         BatchConfig::max_sequence_length() + kPagesize - 1) /
-        kPagesize * BatchConfig::max_requests_per_batch();
-    page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
-  }
+  // if (page_manager_singleton == nullptr) {
+  //   int num_total_blocks =
+  //       (BatchConfig::max_spec_tree_token_num() +
+  //        BatchConfig::max_sequence_length() + kPagesize - 1) /
+  //       kPagesize * BatchConfig::max_requests_per_batch();
+  //   page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
+  // }
+  assert(page_manager_singleton != nullptr);
   return page_manager_singleton;
 }
 
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 301d41a2a..417a95046 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2526,6 +2526,8 @@ void RequestManager::background_serving_task(
   FFModel *llm = *(FFModel **)task->args;
   printf("start background serving task and llm has %d num_transfor_layers\n",
          llm->num_transformer_layers);
+  printf("qkv dim: %d, num_heads: %d\n",
+         llm->qkv_dim, llm->num_kv_heads);
   {
     // Update FFModel's lg_hlr and lg_ctx to the current
     // task's runtime and ctx, since all future legion tasks are

From 20cb7144e208a58a1bb55b296541e43629787784 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Tue, 5 Nov 2024 12:59:09 -0800
Subject: [PATCH 601/667] refactor and add kv cache flag via page manager

---
 include/flexflow/batch_config.h         |  2 +-
 include/flexflow/page_manager.h         | 10 ++++-----
 include/flexflow/request_manager.h      |  6 ++---
 inference/spec_infer/spec_infer.cc      |  4 ++--
 src/ops/inc_multihead_self_attention.cu | 29 +++++--------------------
 src/runtime/batch_config.cc             |  2 +-
 src/runtime/page_manager.cc             | 18 ++++++++++-----
 src/runtime/request_manager.cc          | 27 ++++-------------------
 8 files changed, 34 insertions(+), 64 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index d4c2e38e2..9ef037c2f 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -77,7 +77,7 @@ class BatchConfig {
   static int max_spec_tree_token_num();
   static int max_sequence_length();
   static int max_output_length();
-  static int max_kv_cache_size();
+  static size_t max_kv_cache_size();
   static bool streaming_cache();
   static int get_max_tree_depth();
   friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
index 87a544d81..c0d6df085 100644
--- a/include/flexflow/page_manager.h
+++ b/include/flexflow/page_manager.h
@@ -118,7 +118,7 @@ class BlockAllocator {
 
 private:
   int block_size;
-  int num_total_blocks;
+  size_t num_total_blocks;
   std::deque<PhysicalTokenBlock> free_blocks;
 };
 
@@ -133,10 +133,11 @@ class PageManager {
   // Get the singleton instance of the PageManager as it will be shared in
   // multiple places
   static PageManager *get_page_manager();
-  static PageManager *get_page_manager(FFModel *ff, int kv_cache_size);
+  static PageManager *get_page_manager(FFModel *ff, size_t kv_cache_size);
+  size_t get_kv_cache_size_per_layer();
   using BlockTable = std::vector<PhysicalTokenBlock>;
   using RequestGuid = BatchConfig::RequestGuid;
-  PageManager(int block_size, int num_total_blocks);
+  PageManager(int block_size, size_t num_total_blocks);
   int allocate_one_block(RequestGuid const &request_guid);
   void free_request(RequestGuid const &request_guid);
   // used for the case that we want to free the last num_blocks that stores spec
@@ -148,8 +149,7 @@ class PageManager {
   void free_block_table(BlockTable &block_table);
 
 private:
-  int num_transformer_layers;
-  int total_kv_cache_size;
+  size_t kv_cache_size_per_layer;
   int block_size;       // the size of the block
   int num_total_blocks; // the total number of blocks
   BlockAllocator block_allocator;
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f7fe3c872..16aec6f6c 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -279,8 +279,8 @@ class RequestManager {
   int get_max_spec_tree_token_num();
   void set_max_sequence_length(int max_seq_length);
   int get_max_sequence_length();
-  void set_max_kv_cache_size(int max_kv_cache_size);
-  int get_max_kv_cache_size();
+  void set_max_kv_cache_size(size_t max_kv_cache_size);
+  size_t get_max_kv_cache_size();
   void set_max_output_length(int max_output_length);
   int get_max_output_length();
   void set_decoding_mode(DecodingMode mode);
@@ -398,7 +398,7 @@ class RequestManager {
   int max_spec_tree_token_num;
   int max_sequence_length;
   int max_output_length;
-  int max_kv_cache_size;
+  size_t max_kv_cache_size;
   int max_tree_depth;
   int max_tree_width;
   int k;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 596df29a1..be256f080 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -68,7 +68,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &max_output_length,
-                      int &max_kv_cache_size,
+                      size_t &max_kv_cache_size,
                       int &max_tree_width,
                       int &max_tree_depth,
                       int &expansion_degree,
@@ -390,7 +390,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 512;
   int max_output_length = 512;
-  int max_kv_cache_size = -1; // if -1, then use the default value
+  size_t max_kv_cache_size = 0; // if 0, then use the default value
   int expansion_degree = 3;
   int max_tree_depth = 8;
   int max_tree_width = 16;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 44bb694e5..885b23d94 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -15,6 +15,7 @@
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
+#include "flexflow/page_manager.h"
 #include "flashinfer/decode_attention_decl.cuh"
 #include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/ffconst_utils.h"
@@ -494,12 +495,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t max_num_pages =
         round_up_pages(BatchConfig::max_sequence_length() +
                        BatchConfig::max_spec_tree_token_num());
-    int total_kv_cache_size = BatchConfig::max_kv_cache_size();
+    PageManager *pm = PageManager::get_page_manager();
+    size_t total_kv_cache_size_per_layer = pm->get_kv_cache_size_per_layer();
     switch (infer_mode) {
       case TREE_VERIFY_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
-        if (total_kv_cache_size == -1){
+        if (total_kv_cache_size_per_layer == 0){
           key_cache_size = num_kv_heads * qk_dim *
                           BatchConfig::max_requests_per_batch() * max_num_pages *
                           kPagesize;
@@ -507,28 +509,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                             BatchConfig::max_requests_per_batch() *
                             max_num_pages * kPagesize;
         }else{
-          key_cache_size = total_kv_cache_size / 2;
-          value_cache_size = total_kv_cache_size / 2;
+          key_cache_size = total_kv_cache_size_per_layer / 2;
+          value_cache_size = total_kv_cache_size_per_layer / 2;
         }
-        // if (streaming_cache) {
-        //   size_t max_post_pos_enc_pages =
-        //       round_up_pages(BatchConfig::MAX_STREAMING_POS -
-        //                      BatchConfig::get_max_tree_depth() +
-        //                      max(max_tokens_per_batch,
-        //                          BatchConfig::max_spec_tree_token_num()));
-        //   key_cache_size = num_kv_heads * qk_dim *
-        //                    BatchConfig::max_requests_per_batch() *
-        //                    max_post_pos_enc_pages * kPagesize;
-        //   value_cache_size = num_kv_heads * v_dim *
-        //                      BatchConfig::max_requests_per_batch() *
-        //                      max_post_pos_enc_pages * kPagesize;
-        //   streaming_pre_pos_enc_size =
-        //       num_kv_heads * (qk_dim + v_dim) *
-        //       BatchConfig::max_requests_per_batch() *
-        //       round_up_pages(BatchConfig::MAX_STREAMING_POS -
-        //                      BatchConfig::get_max_tree_depth()) *
-        //       kPagesize;
-        // }
         break;
       }
       case TREE_SEARCH_MODE:
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index 426f848d9..bb581dd23 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -108,7 +108,7 @@ int BatchConfig::max_output_length() {
   return RequestManager::get_request_manager()->get_max_output_length();
 }
 
-int BatchConfig::max_kv_cache_size() {
+size_t BatchConfig::max_kv_cache_size() {
   return RequestManager::get_request_manager()->get_max_kv_cache_size();
 }
 bool BatchConfig::streaming_cache() {
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 3f22e0f4a..62a3ae11a 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -130,7 +130,7 @@ int BlockAllocator::get_num_free_blocks() const {
   return free_blocks.size();
 }
 
-PageManager::PageManager(int block_size, int num_total_blocks)
+PageManager::PageManager(int block_size, size_t num_total_blocks)
     : block_size(block_size), num_total_blocks(num_total_blocks),
       block_allocator(block_size, num_total_blocks) {}
 
@@ -214,7 +214,7 @@ int PageManager::get_num_allocated_blocks(
 }
 
 PageManager *PageManager::get_page_manager(FFModel *ff,
-                                           int total_kv_cache_size) {
+                                           size_t total_kv_cache_size) {
   int num_kv_heads = ff->num_kv_heads;
   int size_dt = ff->size_dt;
   int qkv_dim = ff->qkv_dim;
@@ -226,23 +226,29 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
          pipeline_parallelism_degree);
   assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
          num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized
-  printf("page manager singleton is initialized\n");
   if (page_manager_singleton == nullptr) {
-    int num_total_blocks = 0;
-    if (total_kv_cache_size == -1) {
+    size_t num_total_blocks = 0;
+    if (total_kv_cache_size == 0) {
       num_total_blocks = (BatchConfig::max_spec_tree_token_num() +
                           BatchConfig::max_sequence_length() + kPagesize - 1) /
                          kPagesize * BatchConfig::max_requests_per_batch();
     } else {
       num_total_blocks =
           total_kv_cache_size * 1024 * 1024 / size_dt / qkv_dim / num_kv_heads /
-          (num_transformer_layers / pipeline_parallelism_degree) / 2;
+          num_transformer_layers / kPagesize;
     }
+    printf("page manager singleton is initialized with %d blocks\n",
+           num_total_blocks);
     page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
+    page_manager_singleton->kv_cache_size_per_layer = total_kv_cache_size * 1024 * 1024 / num_transformer_layers;
   }
   return page_manager_singleton;
 }
 
+size_t PageManager::get_kv_cache_size_per_layer() {
+  return kv_cache_size_per_layer;
+}
+
 PageManager *PageManager::get_page_manager() {
   // if (page_manager_singleton == nullptr) {
   //   int num_total_blocks =
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 417a95046..e03f0ad6e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -213,11 +213,11 @@ int RequestManager::get_max_output_length() {
   return max_output_length;
 }
 
-void RequestManager::set_max_kv_cache_size(int max_kv_cache_size) {
+void RequestManager::set_max_kv_cache_size(size_t max_kv_cache_size) {
   this->max_kv_cache_size = max_kv_cache_size;
 }
 
-int RequestManager::get_max_kv_cache_size() {
+size_t RequestManager::get_max_kv_cache_size() {
   return max_kv_cache_size;
 }
 
@@ -703,9 +703,6 @@ void RequestManager::request_update_attainment(int batch_index, bool attained) {
 
 void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
-  if (profiling_requests[guid].finish_time != 0) {
-    printf("some request has been completed!!\n");
-  }
 
   profiling_requests[guid].finish_time =
       Realm::Clock::current_time_in_microseconds();
@@ -2105,20 +2102,6 @@ void RequestManager::reset_block_table(Request &request) {
   return;
 }
 
-// debug function
-void RequestManager::print_num_tokens(Request &request) {
-  PageManager *page_manager = PageManager::get_page_manager();
-  std::vector<int> block_table_indices =
-      page_manager->get_block_table_indices(request.guid);
-  printf("number of blocks: %d", request.blocks.size());
-  printf(" number of pages allocated: %d", block_table_indices.size());
-  printf(" last page length: %d", request.blocks.back().get_num_tokens());
-  printf(" last page spec tokens: %d",
-         request.blocks.back().get_num_spec_tokens());
-  printf(" last page commit tokens: %d\n",
-         request.blocks.back().get_num_commit_tokens());
-}
-
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {
@@ -2524,10 +2507,6 @@ void RequestManager::background_serving_task(
     Runtime *runtime) {
   RequestManager *rm = RequestManager::get_request_manager();
   FFModel *llm = *(FFModel **)task->args;
-  printf("start background serving task and llm has %d num_transfor_layers\n",
-         llm->num_transformer_layers);
-  printf("qkv dim: %d, num_heads: %d\n",
-         llm->qkv_dim, llm->num_kv_heads);
   {
     // Update FFModel's lg_hlr and lg_ctx to the current
     // task's runtime and ctx, since all future legion tasks are
@@ -2543,6 +2522,8 @@ void RequestManager::background_serving_task(
     }
   }
   // page attention: initalize the page manager here
+  int kv_cache_size = rm->get_max_kv_cache_size();
+  printf("KV cache size: %d\n", kv_cache_size);
   PageManager::get_page_manager(llm, rm->get_max_kv_cache_size());
   if (rm->decoding_mode == INCREMENTAL_DECODING) {
     // No SSMs: perform incremental decoding

From 311c45017dd293ed963888d7aedc7893cd08e5f3 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Tue, 5 Nov 2024 13:48:45 -0800
Subject: [PATCH 602/667] ckpt for performance issue

---
 include/flexflow/request_manager.h      |  2 ++
 src/ops/inc_multihead_self_attention.cu |  4 +--
 src/runtime/page_manager.cc             | 24 +++----------
 src/runtime/request_manager.cc          | 48 ++++++++++++-------------
 4 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 16aec6f6c..cfd9a2c77 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -490,6 +490,8 @@ class RequestManager {
     std::vector<int> generated_tokens_per_step;
     // To calculate the E2E time of serving
     long long server_start_time = 0;
+    // added for seeing how many things are disabled
+    int num_disabled = 0;
   };
 
   ProfileInfo profiling;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 885b23d94..c0561f11b 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -509,8 +509,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
                             BatchConfig::max_requests_per_batch() *
                             max_num_pages * kPagesize;
         }else{
-          key_cache_size = total_kv_cache_size_per_layer / 2;
-          value_cache_size = total_kv_cache_size_per_layer / 2;
+          key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
+          value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
         }
         break;
       }
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 62a3ae11a..a8559c816 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -155,7 +155,8 @@ void PageManager::free_block_table(BlockTable &block_table) {
 
 void PageManager::free_request(RequestGuid const &request_guid) {
   // we only free the blocks that are already used
-  assert(block_tables.find(request_guid) != block_tables.end());
+  // assert(block_tables.find(request_guid) != block_tables.end());
+  printf("free the blocks for request %d\n", request_guid);
   BlockTable block_table = block_tables[request_guid];
   free_block_table(block_table);
   block_tables.erase(request_guid);
@@ -165,9 +166,9 @@ void PageManager::free_request(RequestGuid const &request_guid) {
 // delete the last num_blocks in the request_guid
 void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
                                        int num_blocks) {
-  assert(block_tables.find(request_guid) != block_tables.end());
+  // assert(block_tables.find(request_guid) != block_tables.end());
   auto &block_table = block_tables[request_guid];
-  assert(num_blocks <= block_table.size());
+  // assert(num_blocks <= block_table.size());
   int num_blocks_allocated = block_table.size();
   for (int i = 0; i < num_blocks; i++) {
     block_allocator.free(block_table[num_blocks_allocated - i - 1]);
@@ -179,12 +180,6 @@ void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
   return;
 }
 
-// int PageManager::get_index_last_block(const RequestGuid& request_guid) const
-// {
-//     const auto& block_table = block_tables.at(request_guid);
-//     return block_table.back.get_block_number();
-// }
-
 std::vector<int> PageManager::get_block_table_indices(
     RequestGuid const &request_guid) const {
   std::vector<int> indices;
@@ -220,10 +215,6 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
   int qkv_dim = ff->qkv_dim;
   int num_transformer_layers = ff->num_transformer_layers;
   int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
-  printf("num_kv_heads: %d, size_dt: %d, qkv_dim: %d, num_transformer_layers: "
-         "%d, pipeline_parallelism_degree: %d\n",
-         num_kv_heads, size_dt, qkv_dim, num_transformer_layers,
-         pipeline_parallelism_degree);
   assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
          num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized
   if (page_manager_singleton == nullptr) {
@@ -250,13 +241,6 @@ size_t PageManager::get_kv_cache_size_per_layer() {
 }
 
 PageManager *PageManager::get_page_manager() {
-  // if (page_manager_singleton == nullptr) {
-  //   int num_total_blocks =
-  //       (BatchConfig::max_spec_tree_token_num() +
-  //        BatchConfig::max_sequence_length() + kPagesize - 1) /
-  //       kPagesize * BatchConfig::max_requests_per_batch();
-  //   page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
-  // }
   assert(page_manager_singleton != nullptr);
   return page_manager_singleton;
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e03f0ad6e..bc23375f5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1184,6 +1184,7 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
         get_num_blocks_allocated(*request);
     if (bc.requestsInfo[request_index].num_kv_pages == 0) {
       // turn this request into not available for one round
+      profiling.num_disabled++;
       bc.request_available[request_index] = false;
     }
     bc.requestsInfo[request_index].kv_last_page_len =
@@ -1607,8 +1608,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       int idx_to_physical =
           append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index;
-      assert(idx_from_logical >= 0);
-      assert(idx_from_logical / kPagesize < block_table_before_commit.size());
+      // assert(idx_from_logical >= 0);
+      // assert(idx_from_logical / kPagesize < block_table_before_commit.size());
       int idx_from_physical =
           block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
           committed_token.from_index % kPagesize;
@@ -1661,10 +1662,10 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // page attention information
     new_bc.requestsInfo[request_index].num_kv_pages =
         get_num_blocks_allocated(request);
-    assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
+    // assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
     new_bc.requestsInfo[request_index].kv_last_page_len =
         get_len_last_block(request);
-    assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
+    // assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
     new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
@@ -1985,9 +1986,6 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
 /* --------- Page Attention Related Functions --------- */
 int RequestManager::get_num_blocks_allocated(Request &request) const {
   // needs some assertion
-  assert(request.blocks.size() == PageManager::get_page_manager()
-                                      ->get_block_table_indices(request.guid)
-                                      .size());
   return request.blocks.size();
 }
 
@@ -2016,7 +2014,7 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
   if (request.blocks.size() != block_table_indices.size()) {
-    assert(request.blocks.size() == block_table_indices.size());
+    // assert(request.blocks.size() == block_table_indices.size());
   }
   return block_table_indices[idx_logical / kPagesize] * kPagesize +
          idx_logical % kPagesize;
@@ -2026,9 +2024,9 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
 void RequestManager::_append_block_to_request(Request &request,
                                               bool is_commit) {
   PageManager *page_manager = PageManager::get_page_manager();
-  assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  assert(request.blocks.size() ==
-         page_manager->get_block_table_indices(request.guid).size());
+  // assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+  // assert(request.blocks.size() ==
+  //        page_manager->get_block_table_indices(request.guid).size());
   // Append the logical block to the request
   // page attention: in this function we need to remember the last logical block
   // number that still contains committed tokens
@@ -2037,14 +2035,14 @@ void RequestManager::_append_block_to_request(Request &request,
   page_manager->allocate_one_block(request.guid);
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
-  assert(request.blocks.size() ==
-         page_manager->get_block_table_indices(request.guid).size());
+  // assert(request.blocks.size() ==
+  //        page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
     request.page_last_committed++;
     int size_blocks = request.blocks.size();
-    assert(request.page_last_committed <
-           static_cast<int>(request.blocks.size()));
+    // assert(request.page_last_committed <
+    //        static_cast<int>(request.blocks.size()));
   }
 }
 
@@ -2058,14 +2056,14 @@ int RequestManager::append_token_to_block(Request &request,
   if (request.blocks.empty() || request.blocks.back().is_full()) {
     // Append a new logical block
     _append_block_to_request(request, is_commit);
-    assert(request.blocks.size() ==
-           page_manager->get_block_table_indices(request.guid).size());
+    // assert(request.blocks.size() ==
+    //        page_manager->get_block_table_indices(request.guid).size());
     // also allocate one physical page
   }
   // insert token to both logical block and physical block
   request.blocks.back().append_tokens({token}, is_commit);
-  assert(request.blocks.size() ==
-         page_manager->get_block_table_indices(request.guid).size());
+  // assert(request.blocks.size() ==
+  //        page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
   assert(idx_logical >= 0);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
@@ -2077,12 +2075,12 @@ void RequestManager::reset_block_table(Request &request) {
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
   assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  assert(request.blocks.size() ==
-         page_manager->get_block_table_indices(request.guid).size());
+  // assert(request.blocks.size() ==
+  //        page_manager->get_block_table_indices(request.guid).size());
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
-  assert(block_table_indices.size() > request.page_last_committed);
+  // assert(block_table_indices.size() > request.page_last_committed);
   page_manager->free_multiple_blocks(request.guid,
                                      block_table_indices.size() -
                                          request.page_last_committed - 1);
@@ -2097,8 +2095,8 @@ void RequestManager::reset_block_table(Request &request) {
   std::vector<int> block_table =
       page_manager->get_block_table_indices(request.guid);
 
-  assert(request.blocks.size() ==
-         page_manager->get_block_table_indices(request.guid).size());
+  // assert(request.blocks.size() ==
+  //        page_manager->get_block_table_indices(request.guid).size());
   return;
 }
 
@@ -2884,6 +2882,8 @@ void RequestManager::terminate_background_server() {
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
 
+    printf("there are %d requests disabled\n", profiling.num_disabled);
+
     std::string mean_generated_tokens_per_step =
         "\n mean_generated_tokens_per_step( ";
     double mean_generated_tokens =

From a493f2ae096f51741552e666174e9d7e19c98d9e Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Tue, 5 Nov 2024 15:45:25 -0800
Subject: [PATCH 603/667] first attempt in incr decoding with page attention

---
 src/ops/inc_multihead_self_attention.cu       |  2 +-
 .../inc_multihead_self_attention_kernels.cu   | 82 ++++++++++++-------
 src/ops/tree_inc_multihead_self_attention.cu  |  2 -
 src/runtime/request_manager.cc                |  8 +-
 src/runtime/request_manager.cu                | 75 +++++++++--------
 5 files changed, 98 insertions(+), 71 deletions(-)

diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index c0561f11b..c20ea2346 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -498,6 +498,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     PageManager *pm = PageManager::get_page_manager();
     size_t total_kv_cache_size_per_layer = pm->get_kv_cache_size_per_layer();
     switch (infer_mode) {
+      case INC_DECODING_MODE:
       case TREE_VERIFY_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
@@ -515,7 +516,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         break;
       }
       case TREE_SEARCH_MODE:
-      case INC_DECODING_MODE:
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index ea65b1fce..1a697fbda 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -443,22 +443,26 @@ void apply_pos_encoding_to_streaming_proj(
 }
 
 template <typename DT>
-__global__ void
-    update_qkv_in_batch_kernel(DT *qkv_proj_array,
-                               half *qTmp_ptr,
-                               half *kvCache_ptr,
-                               BatchConfig::PerTokenInfo const *tokenInfos,
-                               int const max_num_pages,
-                               int num_q_heads,
-                               int num_kv_heads,
-                               int head_dim,
-                               int num_new_tokens) {
+__global__ void update_qkv_in_batch_kernel(
+    DT *qkv_proj_array,
+    half *qTmp_ptr,
+    half *kvCache_ptr,
+    int32_t *kv_indptr,
+    int32_t *kv_page_indices,
+    bool const *request_available,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    int const max_num_pages,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int num_new_tokens) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int const token_idx = thread_idx / q_hidden_size;
   int const offset = thread_idx % q_hidden_size;
+
   if (token_idx >= num_new_tokens) {
     return;
   }
@@ -466,15 +470,33 @@ __global__ void
   int const req_idx = tokenInfos[token_idx].request_index;
   int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
+  // calculate the compact request index in the easiest way
+  // TODO: recheck
+  int req_idx_compact = -1;
+  int cnt = 0;
+  while (cnt < req_idx + 1) {
+    if (request_available[cnt]) {
+      req_idx_compact++;
+    }
+    cnt++;
+  }
+
+  assert(req_idx_compact >= 0 && "Invalid request index");
+
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
-    size_t to_k_idx = get_k_entry_offset(
-               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
-           to_v_idx = get_v_entry_offset(
-               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
+    int start = kv_indptr[req_idx_compact];
+    int end = kv_indptr[req_idx_compact + 1] - 1;
+    assert(start <= end && "Invalid kv_indptr");
+    assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index");
+    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
+    size_t to_k_idx = get_k_entry_offset_verify(
+               token_abs_idx, page_idx, num_kv_heads, head_dim),
+           to_v_idx = get_v_entry_offset_verify(
+               token_abs_idx, page_idx, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
@@ -489,8 +511,8 @@ __global__ void
 
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream) {
+                                BatchConfig const *bc,
+                                cudaStream_t stream) {
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
     return;
@@ -499,18 +521,22 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
   int const max_num_pages =
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
-  update_qkv_in_batch_kernel<<<GET_BLOCKS(parallelism),
-                               min(CUDA_NUM_THREADS, parallelism),
-                               0,
-                               stream>>>(static_cast<DT *>(m->devQKVProjArray),
-                                         static_cast<half *>(m->queryTmp),
-                                         static_cast<half *>(m->kvCache),
-                                         m->token_infos,
-                                         max_num_pages,
-                                         m->num_q_heads,
-                                         m->num_kv_heads,
-                                         m->qk_dim,
-                                         num_new_tokens);
+  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->kvCache),
+      m->handle.tree_verify_attention_metadata->kv_indptr,
+      m->handle.tree_verify_attention_metadata->kv_indices,
+      m->request_available,
+      m->token_infos,
+      max_num_pages,
+      m->num_q_heads,
+      m->num_kv_heads,
+      m->qk_dim,
+      num_new_tokens);
 }
 
 __global__ void update_kv_in_streaming_cache_kernel(
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index d1e916461..ad3f0e91f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -143,8 +143,6 @@ void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
       m->num_kv_heads,
       m->qk_dim,
       num_new_tokens);
-  // cudaStreamSynchronize(stream);
-  // printf("exited update_qkv_in_batch_verify\n");
 }
 
 __global__ void commit_tokens_kernel(
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index bc23375f5..d156c6253 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1306,6 +1306,10 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
+    // append the token here
+    int idx_to_physical = append_token_to_block(request, request.tokens.back(), true);
+    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
+    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
 
     bc.num_tokens++;
 
@@ -1608,8 +1612,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       int idx_to_physical =
           append_token_to_block(request, committed_token.token_id, true);
       int idx_from_logical = committed_token.from_index;
-      // assert(idx_from_logical >= 0);
-      // assert(idx_from_logical / kPagesize < block_table_before_commit.size());
       int idx_from_physical =
           block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
           committed_token.from_index % kPagesize;
@@ -1662,10 +1664,8 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // page attention information
     new_bc.requestsInfo[request_index].num_kv_pages =
         get_num_blocks_allocated(request);
-    // assert(new_bc.requestsInfo[request_index].num_kv_pages > 0);
     new_bc.requestsInfo[request_index].kv_last_page_len =
         get_len_last_block(request);
-    // assert(new_bc.requestsInfo[request_index].kv_last_page_len > 0);
     new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index b762fedd2..78c63fd6a 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -437,6 +437,13 @@ void RequestManager::load_batch_config_task(
 
   // load attention metadata
   if (batch_config->get_mode() == INC_DECODING_MODE) {
+    PageManager *pm = PageManager::get_page_manager();
+    static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+        kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+    static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS *
+                                BatchConfig::MAX_NUM_TOKENS];
+    static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
+    static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
     if (handle.incr_attention_metadata->enabled()) {
       // calculate the attention meta data
       {
@@ -453,46 +460,43 @@ void RequestManager::load_batch_config_task(
             round_up_pages(BatchConfig::max_sequence_length() +
                            BatchConfig::max_spec_tree_token_num());
 
-        int parallelism = batch_size;
-        prepare_inference_params_kernel<<<GET_BLOCKS(parallelism),
-                                          min(CUDA_NUM_THREADS, parallelism),
-                                          0,
-                                          stream>>>(
-            batch_size,
-            request_infos,
-            request_available,
-            max_num_pages,
-            handle.incr_attention_metadata->q_indptr,
-            handle.incr_attention_metadata->kv_indptr,
-            handle.incr_attention_metadata->kv_indices,
-            handle.incr_attention_metadata->kv_last_page_len,
-            handle.incr_attention_metadata->qk_indptr);
+        // int parallelism = batch_size;
+        prepare_inference_params_kernel_h(batch_config,
+                                          pm,
+                                          handle,
+                                          stream,
+                                          max_num_pages,
+                                          q_indptr_h,
+                                          kv_indptr_h,
+                                          kv_indices_h,
+                                          kv_last_page_len_h,
+                                          qk_indptr_h);
       }
 
       // prepare attention forward handler
       {
         int batch_size = batch_config->num_active_requests();
-        static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-            kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-            kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
-        q_indptr_h[0] = 0;
-        kv_indptr_h[0] = 0;
-        for (int req_idx = 0, indptr_idx = 0;
-             req_idx < batch_config->max_requests_per_batch();
-             req_idx++) {
-          if (batch_config->request_available[req_idx]) {
-            int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-            int kv_len =
-                batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-                batch_config->requestsInfo[req_idx]
-                    .first_token_index_in_request;
-            q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-            kv_indptr_h[indptr_idx + 1] =
-                kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
-            kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
-            indptr_idx++;
-          }
-        }
+        // static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+        //     kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
+        //     kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
+        // q_indptr_h[0] = 0;
+        // kv_indptr_h[0] = 0;
+        // for (int req_idx = 0, indptr_idx = 0;
+        //      req_idx < batch_config->max_requests_per_batch();
+        //      req_idx++) {
+        //   if (batch_config->request_available[req_idx]) {
+        //     int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
+        //     int kv_len =
+        //         batch_config->requestsInfo[req_idx].num_tokens_in_batch +
+        //         batch_config->requestsInfo[req_idx]
+        //             .first_token_index_in_request;
+        //     q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
+        //     kv_indptr_h[indptr_idx + 1] =
+        //         kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
+        //     kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
+        //     indptr_idx++;
+        //   }
+        // }
 
         if (!batch_config->prompt_phase) {
           BatchDecodeHandler *handler = nullptr;
@@ -690,7 +694,6 @@ void RequestManager::load_batch_config_task(
                                 BatchConfig::MAX_NUM_TOKENS];
     static int32_t qk_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
-
     if (handle.tree_verify_attention_metadata->enabled()) {
       // calculate the attention meta data
       {

From 5250a3b78ca1238e9ac7c08a0985cefbf0148de6 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Tue, 5 Nov 2024 20:27:39 -0800
Subject: [PATCH 604/667] ckpt for nothing

---
 src/ops/kernels/inc_multihead_self_attention_kernels.cu | 2 +-
 src/runtime/request_manager.cc                          | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 1a697fbda..f6c3da319 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -521,7 +521,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
   int const max_num_pages =
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
-  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
+  update_qkv_in_batch_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
                                       stream>>>(
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d156c6253..44e719e66 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1310,6 +1310,9 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     int idx_to_physical = append_token_to_block(request, request.tokens.back(), true);
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    bc.requestsInfo[request_index].request_guid = request.guid;
+    printf("Request %d, token %d, idx_to_physical %d\n", request.guid, request.tokens.back(), idx_to_physical);
+    printf("Request %d, num_kv_pages %d, kv_last_page_len %d\n", request.guid, bc.requestsInfo[request_index].num_kv_pages, bc.requestsInfo[request_index].kv_last_page_len);
 
     bc.num_tokens++;
 
@@ -2521,7 +2524,6 @@ void RequestManager::background_serving_task(
   }
   // page attention: initalize the page manager here
   int kv_cache_size = rm->get_max_kv_cache_size();
-  printf("KV cache size: %d\n", kv_cache_size);
   PageManager::get_page_manager(llm, rm->get_max_kv_cache_size());
   if (rm->decoding_mode == INCREMENTAL_DECODING) {
     // No SSMs: perform incremental decoding
@@ -2738,6 +2740,10 @@ void RequestManager::terminate_background_server_at_exit() {
 
 void RequestManager::terminate_background_server() {
   if (is_background_server_serving()) {
+    printf("profiling llm step times size: %ld\n",
+           profiling.llm_step_times.size());
+    printf("profiling requests per step size: %ld\n",
+           profiling.requests_per_step.size());
     assert(profiling.llm_step_times.size() ==
            profiling.requests_per_step.size());
     // Write the last profiling statistics to output file

From 18d6d456ad2d458efa8f102b25305f790e987308 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 7 Nov 2024 07:38:41 -0800
Subject: [PATCH 605/667] feat: modify the logic of the scheduler

---
 src/runtime/request_manager.cc | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 53cd7ecc6..43513627a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -3029,21 +3029,17 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
                                            int &budget,
                                            int num_req_with_slo) {
   Request &request = all_requests[guid];
-  double num_tokens_to_decode = 0.0;
   double num_tokens_to_decode_per_step =
       (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
       (baseline_latency_ms * request.get_slo_ratio());
-  bool attained =
-      request.decode_latency_ms <= get_request_expected_latency(request);
+  double expected_num_tokens_decoded =
+      request.decode_latency_ms /
+      (baseline_latency_ms * request.get_slo_ratio());
 
-  if (attained) {
-    num_tokens_to_decode = num_tokens_to_decode_per_step;
-  } else {
-    num_tokens_to_decode = num_tokens_to_decode_per_step +
-                           request.decode_latency_ms /
-                               (baseline_latency_ms * request.get_slo_ratio()) -
-                           request.decode_length();
-  }
+  double num_tokens_to_decode =
+      max(1.0,
+          num_tokens_to_decode_per_step + expected_num_tokens_decoded -
+              request.decode_length());
 
   // The root is already included
   // In function add_root_to_spec_token_tree
@@ -3051,7 +3047,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
 
   // The max token that can be added to the token tree when fulfilling the SLO
   int max_token_toward_slo =
-      int(get_max_tokens_per_batch() / num_req_with_slo * 1.1);
+      int(get_max_tokens_per_batch() / num_available_requests);
 
   while (budget > 0 and max_token_toward_slo > 0 and
          current_added < num_tokens_to_decode) {

From 810983e859924844aeaf20a485563321ccfefa89 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 11:36:26 -0800
Subject: [PATCH 606/667] fix compilation error

---
 .../inc_multihead_self_attention_kernels.h    |   1 +
 .../inc_multihead_self_attention_kernels.cu   | 108 ++++++++++++++++++
 src/ops/tree_inc_multihead_self_attention.cu  |  98 ----------------
 3 files changed, 109 insertions(+), 98 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 919393985..1cbd383ca 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -112,6 +112,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream);
 
+template <typename DT>
 void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                                 BatchConfig const *bc,
                                 cudaStream_t stream);
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index f6c3da319..2167c45a6 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -539,6 +539,104 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
       num_new_tokens);
 }
 
+template <typename DT>
+__global__ void update_qkv_in_batch_verify_kernel(
+    DT *qkv_proj_array,
+    half *qTmp_ptr,
+    half *kvCache_ptr,
+    int32_t *kv_indptr,
+    int32_t *kv_page_indices,
+    bool const *request_available,
+    BatchConfig::PerTokenInfo const *tokenInfos,
+    int const max_num_pages,
+    int num_q_heads,
+    int num_kv_heads,
+    int head_dim,
+    int num_new_tokens) {
+  int const q_hidden_size = num_q_heads * head_dim;
+  int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
+  int const kv_hidden_size = num_kv_heads * head_dim;
+  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int const token_idx = thread_idx / q_hidden_size;
+  int const offset = thread_idx % q_hidden_size;
+
+  if (token_idx >= num_new_tokens) {
+    return;
+  }
+
+  int const req_idx = tokenInfos[token_idx].request_index;
+  int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
+
+  // calculate the compact request index in the easiest way
+  // TODO: recheck
+  int req_idx_compact = -1;
+  int cnt = 0;
+  while (cnt < req_idx + 1) {
+    if (request_available[cnt]) {
+      req_idx_compact++;
+    }
+    cnt++;
+  }
+
+  assert(req_idx_compact >= 0 && "Invalid request index");
+
+  size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
+  qTmp_ptr[token_idx * q_hidden_size + offset] =
+      static_cast<half>(qkv_proj_array[from_idx + offset]);
+
+  if (offset < kv_hidden_size) {
+    int start = kv_indptr[req_idx_compact];
+    int end = kv_indptr[req_idx_compact + 1] - 1;
+    assert(start <= end && "Invalid kv_indptr");
+    assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index");
+    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
+    size_t to_k_idx = get_k_entry_offset_verify(
+               token_abs_idx, page_idx, num_kv_heads, head_dim),
+           to_v_idx = get_v_entry_offset_verify(
+               token_abs_idx, page_idx, num_kv_heads, head_dim);
+    // key and value cache should be stored interleaved
+    int const stride = num_q_heads / num_kv_heads;
+    int const kv_offset =
+        offset / head_dim * stride * head_dim + offset % head_dim;
+    kvCache_ptr[to_k_idx + offset] =
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
+    kvCache_ptr[to_v_idx + offset] =
+        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
+                                         temp_kv_hidden_size + kv_offset]);
+  }
+}
+
+template <typename DT>
+void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
+                                BatchConfig const *bc,
+                                cudaStream_t stream) {
+  // printf("entered update_qkv_in_batch_verify\n");
+  int num_new_tokens = bc->num_active_tokens();
+  if (num_new_tokens == 0) {
+    return;
+  }
+  int parallelism = m->local_hidden_size * num_new_tokens;
+  int const max_num_pages =
+      round_up_pages(BatchConfig::max_sequence_length() +
+                     BatchConfig::max_spec_tree_token_num());
+  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
+                                      min(CUDA_NUM_THREADS, parallelism),
+                                      0,
+                                      stream>>>(
+      static_cast<DT *>(m->devQKVProjArray),
+      static_cast<half *>(m->queryTmp),
+      static_cast<half *>(m->kvCache),
+      m->handle.tree_verify_attention_metadata->kv_indptr,
+      m->handle.tree_verify_attention_metadata->kv_indices,
+      m->request_available,
+      m->token_infos,
+      max_num_pages,
+      m->num_q_heads,
+      m->num_kv_heads,
+      m->qk_dim,
+      num_new_tokens);
+}
+
 __global__ void update_kv_in_streaming_cache_kernel(
     half *pre_pos_enc_buf,
     half *kv_cache,
@@ -969,6 +1067,16 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     BatchConfig const *bc,
     cudaStream_t stream);
 
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<float>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<half>(
+    IncMultiHeadSelfAttentionMeta const *m,
+    BatchConfig const *bc,
+    cudaStream_t stream);
+
 template void
     Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache<half>(
         IncMultiHeadSelfAttentionMeta const *m,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index ad3f0e91f..905df573a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -47,104 +47,6 @@ using flashinfer::PageStorage;
 using flashinfer::PosEncodingMode;
 using flashinfer::QKVLayout;
 
-template <typename DT>
-__global__ void update_qkv_in_batch_verify_kernel(
-    DT *qkv_proj_array,
-    half *qTmp_ptr,
-    half *kvCache_ptr,
-    int32_t *kv_indptr,
-    int32_t *kv_page_indices,
-    bool const *request_available,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    int const max_num_pages,
-    int num_q_heads,
-    int num_kv_heads,
-    int head_dim,
-    int num_new_tokens) {
-  int const q_hidden_size = num_q_heads * head_dim;
-  int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
-  int const kv_hidden_size = num_kv_heads * head_dim;
-  int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int const token_idx = thread_idx / q_hidden_size;
-  int const offset = thread_idx % q_hidden_size;
-
-  if (token_idx >= num_new_tokens) {
-    return;
-  }
-
-  int const req_idx = tokenInfos[token_idx].request_index;
-  int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
-
-  // calculate the compact request index in the easiest way
-  // TODO: recheck
-  int req_idx_compact = -1;
-  int cnt = 0;
-  while (cnt < req_idx + 1) {
-    if (request_available[cnt]) {
-      req_idx_compact++;
-    }
-    cnt++;
-  }
-
-  assert(req_idx_compact >= 0 && "Invalid request index");
-
-  size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
-  qTmp_ptr[token_idx * q_hidden_size + offset] =
-      static_cast<half>(qkv_proj_array[from_idx + offset]);
-
-  if (offset < kv_hidden_size) {
-    int start = kv_indptr[req_idx_compact];
-    int end = kv_indptr[req_idx_compact + 1] - 1;
-    assert(start <= end && "Invalid kv_indptr");
-    assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index");
-    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
-    size_t to_k_idx = get_k_entry_offset_verify(
-               token_abs_idx, page_idx, num_kv_heads, head_dim),
-           to_v_idx = get_v_entry_offset_verify(
-               token_abs_idx, page_idx, num_kv_heads, head_dim);
-    // key and value cache should be stored interleaved
-    int const stride = num_q_heads / num_kv_heads;
-    int const kv_offset =
-        offset / head_dim * stride * head_dim + offset % head_dim;
-    kvCache_ptr[to_k_idx + offset] =
-        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size + kv_offset]);
-    kvCache_ptr[to_v_idx + offset] =
-        static_cast<half>(qkv_proj_array[from_idx + q_hidden_size +
-                                         temp_kv_hidden_size + kv_offset]);
-  }
-}
-
-template <typename DT>
-void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
-                                BatchConfig const *bc,
-                                cudaStream_t stream) {
-  // printf("entered update_qkv_in_batch_verify\n");
-  int num_new_tokens = bc->num_active_tokens();
-  if (num_new_tokens == 0) {
-    return;
-  }
-  int parallelism = m->local_hidden_size * num_new_tokens;
-  int const max_num_pages =
-      round_up_pages(BatchConfig::max_sequence_length() +
-                     BatchConfig::max_spec_tree_token_num());
-  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<half *>(m->queryTmp),
-      static_cast<half *>(m->kvCache),
-      m->handle.tree_verify_attention_metadata->kv_indptr,
-      m->handle.tree_verify_attention_metadata->kv_indices,
-      m->request_available,
-      m->token_infos,
-      max_num_pages,
-      m->num_q_heads,
-      m->num_kv_heads,
-      m->qk_dim,
-      num_new_tokens);
-}
-
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
     int32_t *kv_indptr,

From f7656be8ef6ef53ee87b631107953b952108e1b8 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 12:03:05 -0800
Subject: [PATCH 607/667] all good for spec, now test incr

---
 .../inc_multihead_self_attention_kernels.h    |  2 +-
 src/ops/inc_multihead_self_attention.cu       |  2 +-
 .../inc_multihead_self_attention_kernels.cu   | 99 +++++++------------
 3 files changed, 38 insertions(+), 65 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 1cbd383ca..9ca48f69f 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -115,7 +115,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
 template <typename DT>
 void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                                 BatchConfig const *bc,
-                                cudaStream_t stream);
+                                cudaStream_t stream, bool is_spec = true);
 
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index c20ea2346..dfe0ad7ec 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -275,7 +275,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
     apply_pos_encoding_to_tokens_in_batch(
         m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
     // Move the batch qkv values to where took by attention
-    update_qkv_in_batch<DT>(m, bc, stream);
+    update_qkv_in_batch_verify<DT>(m, bc, stream, false);
   }
 
   // phase 4: Attention computation
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 2167c45a6..dcd2bf8e9 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -443,26 +443,22 @@ void apply_pos_encoding_to_streaming_proj(
 }
 
 template <typename DT>
-__global__ void update_qkv_in_batch_kernel(
-    DT *qkv_proj_array,
-    half *qTmp_ptr,
-    half *kvCache_ptr,
-    int32_t *kv_indptr,
-    int32_t *kv_page_indices,
-    bool const *request_available,
-    BatchConfig::PerTokenInfo const *tokenInfos,
-    int const max_num_pages,
-    int num_q_heads,
-    int num_kv_heads,
-    int head_dim,
-    int num_new_tokens) {
+__global__ void
+    update_qkv_in_batch_kernel(DT *qkv_proj_array,
+                               half *qTmp_ptr,
+                               half *kvCache_ptr,
+                               BatchConfig::PerTokenInfo const *tokenInfos,
+                               int const max_num_pages,
+                               int num_q_heads,
+                               int num_kv_heads,
+                               int head_dim,
+                               int num_new_tokens) {
   int const q_hidden_size = num_q_heads * head_dim;
   int const temp_kv_hidden_size = num_q_heads * head_dim; // temporary hard code
   int const kv_hidden_size = num_kv_heads * head_dim;
   int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int const token_idx = thread_idx / q_hidden_size;
   int const offset = thread_idx % q_hidden_size;
-
   if (token_idx >= num_new_tokens) {
     return;
   }
@@ -470,33 +466,15 @@ __global__ void update_qkv_in_batch_kernel(
   int const req_idx = tokenInfos[token_idx].request_index;
   int token_abs_idx = tokenInfos[token_idx].abs_index_in_request;
 
-  // calculate the compact request index in the easiest way
-  // TODO: recheck
-  int req_idx_compact = -1;
-  int cnt = 0;
-  while (cnt < req_idx + 1) {
-    if (request_available[cnt]) {
-      req_idx_compact++;
-    }
-    cnt++;
-  }
-
-  assert(req_idx_compact >= 0 && "Invalid request index");
-
   size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2);
   qTmp_ptr[token_idx * q_hidden_size + offset] =
       static_cast<half>(qkv_proj_array[from_idx + offset]);
 
   if (offset < kv_hidden_size) {
-    int start = kv_indptr[req_idx_compact];
-    int end = kv_indptr[req_idx_compact + 1] - 1;
-    assert(start <= end && "Invalid kv_indptr");
-    assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page index");
-    int page_idx = kv_page_indices[start + (token_abs_idx / kPagesize)];
-    size_t to_k_idx = get_k_entry_offset_verify(
-               token_abs_idx, page_idx, num_kv_heads, head_dim),
-           to_v_idx = get_v_entry_offset_verify(
-               token_abs_idx, page_idx, num_kv_heads, head_dim);
+    size_t to_k_idx = get_k_entry_offset(
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim),
+           to_v_idx = get_v_entry_offset(
+               req_idx, token_abs_idx, max_num_pages, num_kv_heads, head_dim);
     // key and value cache should be stored interleaved
     int const stride = num_q_heads / num_kv_heads;
     int const kv_offset =
@@ -511,8 +489,8 @@ __global__ void update_qkv_in_batch_kernel(
 
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                BatchConfig const *bc,
-                                cudaStream_t stream) {
+                         BatchConfig const *bc,
+                         cudaStream_t stream) {
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
     return;
@@ -522,21 +500,17 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
       round_up_pages(BatchConfig::max_sequence_length() +
                      BatchConfig::max_spec_tree_token_num());
   update_qkv_in_batch_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
-      static_cast<DT *>(m->devQKVProjArray),
-      static_cast<half *>(m->queryTmp),
-      static_cast<half *>(m->kvCache),
-      m->handle.tree_verify_attention_metadata->kv_indptr,
-      m->handle.tree_verify_attention_metadata->kv_indices,
-      m->request_available,
-      m->token_infos,
-      max_num_pages,
-      m->num_q_heads,
-      m->num_kv_heads,
-      m->qk_dim,
-      num_new_tokens);
+                               min(CUDA_NUM_THREADS, parallelism),
+                               0,
+                               stream>>>(static_cast<DT *>(m->devQKVProjArray),
+                                         static_cast<half *>(m->queryTmp),
+                                         static_cast<half *>(m->kvCache),
+                                         m->token_infos,
+                                         max_num_pages,
+                                         m->num_q_heads,
+                                         m->num_kv_heads,
+                                         m->qk_dim,
+                                         num_new_tokens);
 }
 
 template <typename DT>
@@ -548,7 +522,6 @@ __global__ void update_qkv_in_batch_verify_kernel(
     int32_t *kv_page_indices,
     bool const *request_available,
     BatchConfig::PerTokenInfo const *tokenInfos,
-    int const max_num_pages,
     int num_q_heads,
     int num_kv_heads,
     int head_dim,
@@ -609,16 +582,17 @@ __global__ void update_qkv_in_batch_verify_kernel(
 template <typename DT>
 void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                                 BatchConfig const *bc,
-                                cudaStream_t stream) {
+                                cudaStream_t stream, bool is_spec) {
   // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
     return;
   }
   int parallelism = m->local_hidden_size * num_new_tokens;
-  int const max_num_pages =
-      round_up_pages(BatchConfig::max_sequence_length() +
-                     BatchConfig::max_spec_tree_token_num());
+  int32_t *kv_indptr = is_spec ? m->handle.tree_verify_attention_metadata->kv_indptr
+                               : m->handle.incr_attention_metadata->kv_indptr;
+  int32_t *kv_indices = is_spec ? m->handle.tree_verify_attention_metadata->kv_indices
+                                : m->handle.incr_attention_metadata->kv_indices;
   update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
@@ -626,11 +600,10 @@ void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
       static_cast<DT *>(m->devQKVProjArray),
       static_cast<half *>(m->queryTmp),
       static_cast<half *>(m->kvCache),
-      m->handle.tree_verify_attention_metadata->kv_indptr,
-      m->handle.tree_verify_attention_metadata->kv_indices,
+      kv_indptr,
+      kv_indices,
       m->request_available,
       m->token_infos,
-      max_num_pages,
       m->num_q_heads,
       m->num_kv_heads,
       m->qk_dim,
@@ -1070,12 +1043,12 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
 template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
-    cudaStream_t stream);
+    cudaStream_t stream, bool is_spec);
 
 template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
-    cudaStream_t stream);
+    cudaStream_t stream, bool is_spec);
 
 template void
     Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache<half>(

From 8c203ecdb22dc24c5e9926891b4dedf4000a69a6 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 12:27:38 -0800
Subject: [PATCH 608/667] typo

---
 inference/incr_decoding/incr_decoding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 5a18daab4..ccbb85c59 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -51,7 +51,7 @@ void parse_input_args(char **argv,
                       int &max_tokens_per_prefilling_batch,
                       int &max_sequence_length,
                       int &max_output_length,
-                      int &max_kv_cache_size,
+                      size_t &max_kv_cache_size,
                       int &sampling_seed,
                       bool &streaming_cache,
                       bool &slo_attainment_early_termination,
@@ -209,7 +209,7 @@ void FlexFlow::top_level_task(Task const *task,
   int max_tokens_per_prefilling_batch = -1;
   int max_sequence_length = 256;
   int max_output_length = 512;
-  int max_kv_cache_size = -1; // if -1, then use the default value
+  size_t max_kv_cache_size = 0; // if -1, then use the default value
   RequestManager::DecodingMode decoding_mode =
       RequestManager::INCREMENTAL_DECODING;
   int sampling_seed = 0;

From 3c158f8501ac7e5a865871c32dbfbd3d13f32b30 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 12:35:55 -0800
Subject: [PATCH 609/667] workable incrdecoding!

---
 src/runtime/request_manager.cc |  2 --
 src/runtime/request_manager.cu | 16 ++++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 44e719e66..e44dcda12 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1311,8 +1311,6 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
     bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
     bc.requestsInfo[request_index].request_guid = request.guid;
-    printf("Request %d, token %d, idx_to_physical %d\n", request.guid, request.tokens.back(), idx_to_physical);
-    printf("Request %d, num_kv_pages %d, kv_last_page_len %d\n", request.guid, bc.requestsInfo[request_index].num_kv_pages, bc.requestsInfo[request_index].kv_last_page_len);
 
     bc.num_tokens++;
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 78c63fd6a..f6935dc0a 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -79,7 +79,7 @@ void RequestManager::load_tokens_task(
 
 void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
                                        PageManager *pm,
-                                       FFHandler handle,
+                                       AttentionMetaData *attention_metadata,
                                        cudaStream_t stream,
                                        uint32_t const max_num_pages,
                                        int32_t *q_indptr_h,
@@ -130,28 +130,28 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
   }
 
   // do the copy
-  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_indices,
+  checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indices,
                             kv_indices_h,
                             sizeof(int32_t) * batch_size * max_num_pages,
                             cudaMemcpyHostToDevice,
                             stream));
   checkCUDA(
-      cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_last_page_len,
+      cudaMemcpyAsync(attention_metadata->kv_last_page_len,
                       kv_last_page_len_h,
                       sizeof(int32_t) * batch_size,
                       cudaMemcpyHostToDevice,
                       stream));
-  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->q_indptr,
+  checkCUDA(cudaMemcpyAsync(attention_metadata->q_indptr,
                             q_indptr_h,
                             sizeof(int32_t) * (batch_size + 1),
                             cudaMemcpyHostToDevice,
                             stream));
-  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->kv_indptr,
+  checkCUDA(cudaMemcpyAsync(attention_metadata->kv_indptr,
                             kv_indptr_h,
                             sizeof(int32_t) * (batch_size + 1),
                             cudaMemcpyHostToDevice,
                             stream));
-  checkCUDA(cudaMemcpyAsync(handle.tree_verify_attention_metadata->qk_indptr,
+  checkCUDA(cudaMemcpyAsync(attention_metadata->qk_indptr,
                             qk_indptr_h,
                             sizeof(int32_t) * (batch_size + 1),
                             cudaMemcpyHostToDevice,
@@ -463,7 +463,7 @@ void RequestManager::load_batch_config_task(
         // int parallelism = batch_size;
         prepare_inference_params_kernel_h(batch_config,
                                           pm,
-                                          handle,
+                                          handle.incr_attention_metadata,
                                           stream,
                                           max_num_pages,
                                           q_indptr_h,
@@ -726,7 +726,7 @@ void RequestManager::load_batch_config_task(
         // int parallelism = batch_size;
         prepare_inference_params_kernel_h(batch_config,
                                           pm,
-                                          handle,
+                                          handle.tree_verify_attention_metadata,
                                           stream,
                                           max_num_pages,
                                           q_indptr_h,

From 7d612f7bf1a997f46b4206a099965e7faa188c41 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 16:57:53 -0800
Subject: [PATCH 610/667] refactor

---
 .../ops/kernels/inc_multihead_self_attention_kernels.h |  4 ++--
 src/ops/inc_multihead_self_attention.cu                |  2 +-
 .../kernels/inc_multihead_self_attention_kernels.cu    | 10 +++++-----
 src/ops/tree_inc_multihead_self_attention.cu           |  2 +-
 src/runtime/request_manager.cu                         |  1 -
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 9ca48f69f..2c73da124 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -113,9 +113,9 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          cudaStream_t stream);
 
 template <typename DT>
-void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                                 BatchConfig const *bc,
-                                cudaStream_t stream, bool is_spec = true);
+                                cudaStream_t stream, bool is_spec);
 
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index dfe0ad7ec..dfa3e140e 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -275,7 +275,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
     apply_pos_encoding_to_tokens_in_batch(
         m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
     // Move the batch qkv values to where took by attention
-    update_qkv_in_batch_verify<DT>(m, bc, stream, false);
+    update_qkv_in_batch<DT>(m, bc, stream, false);
   }
 
   // phase 4: Attention computation
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index dcd2bf8e9..63472bcb0 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -514,7 +514,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
 }
 
 template <typename DT>
-__global__ void update_qkv_in_batch_verify_kernel(
+__global__ void update_qkv_in_batch_paged_kernel(
     DT *qkv_proj_array,
     half *qTmp_ptr,
     half *kvCache_ptr,
@@ -580,7 +580,7 @@ __global__ void update_qkv_in_batch_verify_kernel(
 }
 
 template <typename DT>
-void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                                 BatchConfig const *bc,
                                 cudaStream_t stream, bool is_spec) {
   // printf("entered update_qkv_in_batch_verify\n");
@@ -593,7 +593,7 @@ void update_qkv_in_batch_verify(IncMultiHeadSelfAttentionMeta const *m,
                                : m->handle.incr_attention_metadata->kv_indptr;
   int32_t *kv_indices = is_spec ? m->handle.tree_verify_attention_metadata->kv_indices
                                 : m->handle.incr_attention_metadata->kv_indices;
-  update_qkv_in_batch_verify_kernel<<<GET_BLOCKS(parallelism),
+  update_qkv_in_batch_paged_kernel<<<GET_BLOCKS(parallelism),
                                       min(CUDA_NUM_THREADS, parallelism),
                                       0,
                                       stream>>>(
@@ -1040,12 +1040,12 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     BatchConfig const *bc,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<float>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream, bool is_spec);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_verify<half>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream, bool is_spec);
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 905df573a..6846c048a 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -433,7 +433,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventRecord(t_start, stream);
 
   // Update key-val cache, compact q array
-  update_qkv_in_batch_verify<DT>(m, bc, stream);
+  update_qkv_in_batch<DT>(m, bc, stream, true);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index f6935dc0a..ed44e8944 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -460,7 +460,6 @@ void RequestManager::load_batch_config_task(
             round_up_pages(BatchConfig::max_sequence_length() +
                            BatchConfig::max_spec_tree_token_num());
 
-        // int parallelism = batch_size;
         prepare_inference_params_kernel_h(batch_config,
                                           pm,
                                           handle.incr_attention_metadata,

From 07ec33e5017d374f3e0bbd2068ec17fbe7d0fb5e Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 17:14:57 -0800
Subject: [PATCH 611/667] some format

---
 .../inc_multihead_self_attention_kernels.h    |  5 +--
 src/ops/inc_multihead_self_attention.cu       | 14 ++++----
 .../inc_multihead_self_attention_kernels.cu   | 27 ++++++++-------
 src/runtime/page_manager.cc                   | 12 ++++---
 src/runtime/request_manager.cc                | 13 +++++---
 src/runtime/request_manager.cu                | 33 +++----------------
 6 files changed, 46 insertions(+), 58 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 2c73da124..9f886ffec 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -114,8 +114,9 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                BatchConfig const *bc,
-                                cudaStream_t stream, bool is_spec);
+                         BatchConfig const *bc,
+                         cudaStream_t stream,
+                         bool is_spec);
 
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index dfa3e140e..b959620ce 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -15,7 +15,6 @@
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include "cuComplex.h"
 #endif
-#include "flexflow/page_manager.h"
 #include "flashinfer/decode_attention_decl.cuh"
 #include "flashinfer/prefill_attention_decl.cuh"
 #include "flexflow/ffconst_utils.h"
@@ -23,6 +22,7 @@
 #include "flexflow/ops/kernels/decompress_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
+#include "flexflow/page_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 #include <math_constants.h>
 
@@ -502,14 +502,14 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       case TREE_VERIFY_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
-        if (total_kv_cache_size_per_layer == 0){
+        if (total_kv_cache_size_per_layer == 0) {
           key_cache_size = num_kv_heads * qk_dim *
-                          BatchConfig::max_requests_per_batch() * max_num_pages *
-                          kPagesize;
+                           BatchConfig::max_requests_per_batch() *
+                           max_num_pages * kPagesize;
           value_cache_size = num_kv_heads * v_dim *
-                            BatchConfig::max_requests_per_batch() *
-                            max_num_pages * kPagesize;
-        }else{
+                             BatchConfig::max_requests_per_batch() *
+                             max_num_pages * kPagesize;
+        } else {
           key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
           value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
         }
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 63472bcb0..a1f179d79 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -581,22 +581,25 @@ __global__ void update_qkv_in_batch_paged_kernel(
 
 template <typename DT>
 void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
-                                BatchConfig const *bc,
-                                cudaStream_t stream, bool is_spec) {
+                         BatchConfig const *bc,
+                         cudaStream_t stream,
+                         bool is_spec) {
   // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
     return;
   }
   int parallelism = m->local_hidden_size * num_new_tokens;
-  int32_t *kv_indptr = is_spec ? m->handle.tree_verify_attention_metadata->kv_indptr
-                               : m->handle.incr_attention_metadata->kv_indptr;
-  int32_t *kv_indices = is_spec ? m->handle.tree_verify_attention_metadata->kv_indices
-                                : m->handle.incr_attention_metadata->kv_indices;
+  int32_t *kv_indptr = is_spec
+                           ? m->handle.tree_verify_attention_metadata->kv_indptr
+                           : m->handle.incr_attention_metadata->kv_indptr;
+  int32_t *kv_indices =
+      is_spec ? m->handle.tree_verify_attention_metadata->kv_indices
+              : m->handle.incr_attention_metadata->kv_indices;
   update_qkv_in_batch_paged_kernel<<<GET_BLOCKS(parallelism),
-                                      min(CUDA_NUM_THREADS, parallelism),
-                                      0,
-                                      stream>>>(
+                                     min(CUDA_NUM_THREADS, parallelism),
+                                     0,
+                                     stream>>>(
       static_cast<DT *>(m->devQKVProjArray),
       static_cast<half *>(m->queryTmp),
       static_cast<half *>(m->kvCache),
@@ -1043,12 +1046,14 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
 template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
-    cudaStream_t stream, bool is_spec);
+    cudaStream_t stream,
+    bool is_spec);
 
 template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
-    cudaStream_t stream, bool is_spec);
+    cudaStream_t stream,
+    bool is_spec);
 
 template void
     Kernels::IncMultiHeadAttention::update_kv_in_streaming_cache<half>(
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index a8559c816..55dcad23b 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -216,7 +216,9 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
   int num_transformer_layers = ff->num_transformer_layers;
   int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
   assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
-         num_transformer_layers > 0 && pipeline_parallelism_degree > 0); //needs to make sure that the model is initialized
+         num_transformer_layers > 0 &&
+         pipeline_parallelism_degree >
+             0); // needs to make sure that the model is initialized
   if (page_manager_singleton == nullptr) {
     size_t num_total_blocks = 0;
     if (total_kv_cache_size == 0) {
@@ -224,14 +226,14 @@ PageManager *PageManager::get_page_manager(FFModel *ff,
                           BatchConfig::max_sequence_length() + kPagesize - 1) /
                          kPagesize * BatchConfig::max_requests_per_batch();
     } else {
-      num_total_blocks =
-          total_kv_cache_size * 1024 * 1024 / size_dt / qkv_dim / num_kv_heads /
-          num_transformer_layers / kPagesize;
+      num_total_blocks = total_kv_cache_size * 1024 * 1024 / size_dt / qkv_dim /
+                         num_kv_heads / num_transformer_layers / kPagesize;
     }
     printf("page manager singleton is initialized with %d blocks\n",
            num_total_blocks);
     page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
-    page_manager_singleton->kv_cache_size_per_layer = total_kv_cache_size * 1024 * 1024 / num_transformer_layers;
+    page_manager_singleton->kv_cache_size_per_layer =
+        total_kv_cache_size * 1024 * 1024 / num_transformer_layers;
   }
   return page_manager_singleton;
 }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 1c2698afc..57f3d8645 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1307,9 +1307,12 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
     // append the token here
-    int idx_to_physical = append_token_to_block(request, request.tokens.back(), true);
-    bc.requestsInfo[request_index].num_kv_pages = get_num_blocks_allocated(request);
-    bc.requestsInfo[request_index].kv_last_page_len = get_len_last_block(request);
+    int idx_to_physical =
+        append_token_to_block(request, request.tokens.back(), true);
+    bc.requestsInfo[request_index].num_kv_pages =
+        get_num_blocks_allocated(request);
+    bc.requestsInfo[request_index].kv_last_page_len =
+        get_len_last_block(request);
     bc.requestsInfo[request_index].request_guid = request.guid;
 
     bc.num_tokens++;
@@ -2025,8 +2028,8 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
 void RequestManager::_append_block_to_request(Request &request,
                                               bool is_commit) {
   PageManager *page_manager = PageManager::get_page_manager();
-  // assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  // assert(request.blocks.size() ==
+  // assert(request.page_last_committed <
+  // static_cast<int>(request.blocks.size())); assert(request.blocks.size() ==
   //        page_manager->get_block_table_indices(request.guid).size());
   // Append the logical block to the request
   // page attention: in this function we need to remember the last logical block
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index ed44e8944..903dadfcf 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -135,12 +135,11 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
                             sizeof(int32_t) * batch_size * max_num_pages,
                             cudaMemcpyHostToDevice,
                             stream));
-  checkCUDA(
-      cudaMemcpyAsync(attention_metadata->kv_last_page_len,
-                      kv_last_page_len_h,
-                      sizeof(int32_t) * batch_size,
-                      cudaMemcpyHostToDevice,
-                      stream));
+  checkCUDA(cudaMemcpyAsync(attention_metadata->kv_last_page_len,
+                            kv_last_page_len_h,
+                            sizeof(int32_t) * batch_size,
+                            cudaMemcpyHostToDevice,
+                            stream));
   checkCUDA(cudaMemcpyAsync(attention_metadata->q_indptr,
                             q_indptr_h,
                             sizeof(int32_t) * (batch_size + 1),
@@ -475,28 +474,6 @@ void RequestManager::load_batch_config_task(
       // prepare attention forward handler
       {
         int batch_size = batch_config->num_active_requests();
-        // static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-        //     kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
-        //     kv_last_page_len_h[BatchConfig::MAX_NUM_REQUESTS];
-        // q_indptr_h[0] = 0;
-        // kv_indptr_h[0] = 0;
-        // for (int req_idx = 0, indptr_idx = 0;
-        //      req_idx < batch_config->max_requests_per_batch();
-        //      req_idx++) {
-        //   if (batch_config->request_available[req_idx]) {
-        //     int q_len = batch_config->requestsInfo[req_idx].num_tokens_in_batch;
-        //     int kv_len =
-        //         batch_config->requestsInfo[req_idx].num_tokens_in_batch +
-        //         batch_config->requestsInfo[req_idx]
-        //             .first_token_index_in_request;
-        //     q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-        //     kv_indptr_h[indptr_idx + 1] =
-        //         kv_indptr_h[indptr_idx] + round_up_pages(kv_len);
-        //     kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
-        //     indptr_idx++;
-        //   }
-        // }
-
         if (!batch_config->prompt_phase) {
           BatchDecodeHandler *handler = nullptr;
           if (handle.incr_attention_metadata->decode_handler_collections.count(

From dad3d0f64eaeaefa8c1f79979d5681bb1d0c7a44 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:29:49 -0500
Subject: [PATCH 612/667] Update request_manager.h

---
 include/flexflow/request_manager.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index cfd9a2c77..16aec6f6c 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -490,8 +490,6 @@ class RequestManager {
     std::vector<int> generated_tokens_per_step;
     // To calculate the E2E time of serving
     long long server_start_time = 0;
-    // added for seeing how many things are disabled
-    int num_disabled = 0;
   };
 
   ProfileInfo profiling;

From 1693455e87e9a91188689418391a174559e90f56 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:31:16 -0500
Subject: [PATCH 613/667] Update llama.cc

---
 inference/models/llama.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index cd9fce238..c48077de7 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -64,11 +64,10 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor w2 = nullptr;
 
-  printf("we are in llama model\n");
+  //metadata that needs to be sent to page manager in order to calculate the kv cache per layer
   ff.set_num_transformer_layers(llama_config.num_hidden_layers);
   ff.set_num_kv_heads(llama_config.num_key_value_heads);
   int qkv_dim = llama_config.hidden_size / llama_config.num_attention_heads * 2;
-  printf("qkv_dim: %d\n", qkv_dim);
   ff.set_qkv_dim(qkv_dim);
   ff.set_size_dt(data_type_size(input->data_type));
   for (int i = 0; i < llama_config.num_hidden_layers; i++) {
@@ -296,7 +295,6 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   InferenceManager *im = InferenceManager::get_inference_manager();
   im->register_model_weights_loader(&ff, fileloader);
-  printf("llama qkv dim in the end: %d\n", ff.qkv_dim);
 }
 
 }; // namespace FlexFlow

From a17c130f57a96638b90dfe01fd36351bba5e1579 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:32:07 -0500
Subject: [PATCH 614/667] Update spec_infer.cc

---
 inference/spec_infer/spec_infer.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index be256f080..e6bf0cdd5 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -88,7 +88,6 @@ void parse_input_args(char **argv,
                       bool &add_special_tokens) {
   for (int i = 1; i < argc; i++) {
     // llm model name
-    printf("argv[i]: %s\n", argv[i]);
     if (!strcmp(argv[i], "-llm-model")) {
       model_names.llm_model_name = std::string(argv[++i]);
       for (char &c : model_names.llm_model_name) {
@@ -242,7 +241,6 @@ void parse_input_args(char **argv,
   wordexp(paths.cache_folder_path.c_str(), &p, 0);
   paths.cache_folder_path = p.we_wordv[0];
   wordfree(&p);
-  printf("argv parsed end\n");
 }
 
 void get_model_meta(FilePaths &file_paths,
@@ -414,7 +412,6 @@ void FlexFlow::top_level_task(Task const *task,
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
   char **argv = command_args.argv;
   int argc = command_args.argc;
-  printf("begin parsing input args\n");
   parse_input_args(argv,
                    argc,
                    file_paths,
@@ -529,9 +526,6 @@ void FlexFlow::top_level_task(Task const *task,
     assert(false && "Invalid LLM model type passed (or no type was passed).");
   }
 
-  printf("after creating llm model we have tree model qkv_dim: %d\n",
-         tree_model.qkv_dim);
-
   // Create SSM models
   int num_ssms = model_metadata.ssm_model_types.size();
   std::vector<int> ssm_model_ids;

From 0f16daf659575994c6847fc5da2ace968e815f3e Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:32:58 -0500
Subject: [PATCH 615/667] Update trace_generator.cc

---
 inference/trace_generator/trace_generator.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index 602f884b9..f1ebf2927 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -93,7 +93,7 @@ void parse_input_args(char **argv,
                       bool &verbose,
                       int &max_sequence_length,
                       int &max_output_length,
-                      int &max_kv_cache_size,
+                      size_t &max_kv_cache_size,
                       double &scaling_factor) {
   for (int i = 1; i < argc; i++) {
     // llm model name
@@ -320,7 +320,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool verbose = false;
   int max_sequence_length = 256;
   int max_output_length = 512;
-  int max_kv_cache_size = -1;
+  size_t max_kv_cache_size = 0;
   double scaling_factor = 1.0;
 
   int max_requests_per_batch = 8;

From ff7de091d69e6c9a89eae0cd52a61db92b9f2bcf Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:36:30 -0500
Subject: [PATCH 616/667] Update tree_inc_multihead_self_attention.cu

---
 src/ops/tree_inc_multihead_self_attention.cu | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6846c048a..f804f849f 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -521,11 +521,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
 
   //   delete[] temp_output;
   // }
-  // cudaError_t err = cudaDeviceSynchronize();
-  // if (err != cudaSuccess) {
-  //     std::cerr << "Kernel launch failed with error: " <<
-  //     cudaGetErrorString(err) << std::endl;
-  // }
 }
 
 } // namespace TreeIncMultiHeadAttention

From e3815a9cb022b8c01584ad088c0e70a4f4a6dd2b Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:37:04 -0500
Subject: [PATCH 617/667] Update tree_inc_multihead_self_attention.cu

---
 src/ops/tree_inc_multihead_self_attention.cu | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index f804f849f..aae1d2f11 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -72,9 +72,6 @@ __global__ void commit_tokens_kernel(
     }
   }
 
-  // int start = kv_indptr[requext_idx_in_batch];
-  // int end = kv_indptr[requext_idx_in_batch + 1] - 1;
-
   for (int i = 0; i < *num_committed_tokens; i++) {
     if (committedTokenInfos[i].request_index == requext_idx_in_batch) {
       int const index_in_kv_cache = committedTokenInfos[i].index_in_kv_cache;

From 38f6ef8cd0d379d5021ef364d171426a0cc25d10 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:37:40 -0500
Subject: [PATCH 618/667] Update tree_inc_multihead_self_attention.cu

---
 src/ops/tree_inc_multihead_self_attention.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index aae1d2f11..a5c98e414 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -79,8 +79,6 @@ __global__ void commit_tokens_kernel(
         continue;
       }
 
-      // int const req_id = committedTokenInfos[i].request_index;
-      // int const tok_id = committedTokenInfos[i].token_depth;
       int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
       int const page_from_idx =
           committedTokenInfos[i].index_in_kv_cache / kPagesize;

From 80ea225b3f9e356b2c6592177ade96d93b0b7bc5 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:39:11 -0500
Subject: [PATCH 619/667] Update page_manager.cc

---
 src/runtime/page_manager.cc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
index 55dcad23b..7fbb16bcd 100644
--- a/src/runtime/page_manager.cc
+++ b/src/runtime/page_manager.cc
@@ -115,8 +115,6 @@ void BlockAllocator::free(PhysicalTokenBlock &block) {
   }
   block.decr_ref_count();
   if (block.ref_count == 0) {
-    // printf("put block number: %d back to free_blocks\n",
-    // block.get_block_number());
     free_blocks.push_back(block);
   } else {
     // in current implementation this should not be the case
@@ -155,8 +153,6 @@ void PageManager::free_block_table(BlockTable &block_table) {
 
 void PageManager::free_request(RequestGuid const &request_guid) {
   // we only free the blocks that are already used
-  // assert(block_tables.find(request_guid) != block_tables.end());
-  printf("free the blocks for request %d\n", request_guid);
   BlockTable block_table = block_tables[request_guid];
   free_block_table(block_table);
   block_tables.erase(request_guid);
@@ -247,4 +243,4 @@ PageManager *PageManager::get_page_manager() {
   return page_manager_singleton;
 }
 
-}; // namespace FlexFlow
\ No newline at end of file
+}; // namespace FlexFlow

From 5fe3a8a532211fbfec3a092fd58e4804226e8022 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:45:45 -0500
Subject: [PATCH 620/667] Update request_manager.cc

---
 src/runtime/request_manager.cc | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 57f3d8645..68addf25b 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -20,9 +20,6 @@
 #include <bitset>
 #include <cmath>
 #include <cstdio>
-#include <cstdlib>
-#include <exception>
-#include <execinfo.h>
 #include <filesystem>
 #include <future>
 #include <iomanip>
@@ -41,13 +38,6 @@ using tokenizers::Tokenizer;
 
 Legion::Logger log_req_mgr("RequestManager");
 
-void printStackTrace() {
-  void *array[10];
-  size_t size = backtrace(array, 10); // Get stack frames
-  backtrace_symbols_fd(
-      array, size, STDERR_FILENO); // Print stack trace to stderr
-}
-
 bool operator<(std::shared_ptr<TokenTreeNode> const &lhs,
                std::shared_ptr<TokenTreeNode> const &rhs) {
   if (lhs->gumbel) {
@@ -131,7 +121,7 @@ RequestManager::RequestManager()
   max_spec_tree_token_num = -1;
   max_sequence_length = -1;
   max_output_length = -1;
-  max_kv_cache_size = -1;
+  max_kv_cache_size = 0;
   max_tree_depth = -1;
   max_tree_width = -1;
   k = -1;
@@ -2017,9 +2007,6 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
   PageManager *page_manager = PageManager::get_page_manager();
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
-  if (request.blocks.size() != block_table_indices.size()) {
-    // assert(request.blocks.size() == block_table_indices.size());
-  }
   return block_table_indices[idx_logical / kPagesize] * kPagesize +
          idx_logical % kPagesize;
 }
@@ -2028,9 +2015,6 @@ int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
 void RequestManager::_append_block_to_request(Request &request,
                                               bool is_commit) {
   PageManager *page_manager = PageManager::get_page_manager();
-  // assert(request.page_last_committed <
-  // static_cast<int>(request.blocks.size())); assert(request.blocks.size() ==
-  //        page_manager->get_block_table_indices(request.guid).size());
   // Append the logical block to the request
   // page attention: in this function we need to remember the last logical block
   // number that still contains committed tokens
@@ -2039,14 +2023,10 @@ void RequestManager::_append_block_to_request(Request &request,
   page_manager->allocate_one_block(request.guid);
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
-  // assert(request.blocks.size() ==
-  //        page_manager->get_block_table_indices(request.guid).size());
   // update page_id_commit
   if (is_commit) {
     request.page_last_committed++;
     int size_blocks = request.blocks.size();
-    // assert(request.page_last_committed <
-    //        static_cast<int>(request.blocks.size()));
   }
 }
 
@@ -2060,14 +2040,10 @@ int RequestManager::append_token_to_block(Request &request,
   if (request.blocks.empty() || request.blocks.back().is_full()) {
     // Append a new logical block
     _append_block_to_request(request, is_commit);
-    // assert(request.blocks.size() ==
-    //        page_manager->get_block_table_indices(request.guid).size());
     // also allocate one physical page
   }
   // insert token to both logical block and physical block
   request.blocks.back().append_tokens({token}, is_commit);
-  // assert(request.blocks.size() ==
-  //        page_manager->get_block_table_indices(request.guid).size());
   int idx_logical = get_idx_last_logical_token(request);
   assert(idx_logical >= 0);
   int idx_physical = idx_logical_to_physical(request, idx_logical);
@@ -2079,12 +2055,9 @@ void RequestManager::reset_block_table(Request &request) {
   // get the indices of original physical block table for request
   PageManager *page_manager = PageManager::get_page_manager();
   assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
-  // assert(request.blocks.size() ==
-  //        page_manager->get_block_table_indices(request.guid).size());
   std::vector<int> block_table_indices =
       page_manager->get_block_table_indices(request.guid);
   // reset the block table according to the request's page_last_commit
-  // assert(block_table_indices.size() > request.page_last_committed);
   page_manager->free_multiple_blocks(request.guid,
                                      block_table_indices.size() -
                                          request.page_last_committed - 1);
@@ -2098,9 +2071,6 @@ void RequestManager::reset_block_table(Request &request) {
   // the indices of block table should be the same as the number of blocks
   std::vector<int> block_table =
       page_manager->get_block_table_indices(request.guid);
-
-  // assert(request.blocks.size() ==
-  //        page_manager->get_block_table_indices(request.guid).size());
   return;
 }
 
@@ -2889,8 +2859,6 @@ void RequestManager::terminate_background_server() {
     generated_tokens_per_step += ")";
     str += generated_tokens_per_step;
 
-    printf("there are %d requests disabled\n", profiling.num_disabled);
-
     std::string mean_generated_tokens_per_step =
         "\n mean_generated_tokens_per_step( ";
     double mean_generated_tokens =

From a721926becbfa60875c6cea6e06f731bbc332856 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:48:46 -0500
Subject: [PATCH 621/667] Update request_manager.cc

---
 src/runtime/request_manager.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 68addf25b..ffb5b8d71 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2466,7 +2466,6 @@ void RequestManager::start_background_server(FFModel *model) {
   {
     std::set_terminate([]() {
       RequestManager::terminate_background_server_at_exit();
-      printStackTrace();
       std::abort();
     });
   }
@@ -2711,10 +2710,6 @@ void RequestManager::terminate_background_server_at_exit() {
 
 void RequestManager::terminate_background_server() {
   if (is_background_server_serving()) {
-    printf("profiling llm step times size: %ld\n",
-           profiling.llm_step_times.size());
-    printf("profiling requests per step size: %ld\n",
-           profiling.requests_per_step.size());
     assert(profiling.llm_step_times.size() ==
            profiling.requests_per_step.size());
     // Write the last profiling statistics to output file

From 1e7e2ec3330d7b9b47129012602184eb2e4df2b2 Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:49:34 -0500
Subject: [PATCH 622/667] Update request_manager.cc

---
 src/runtime/request_manager.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index ffb5b8d71..f076817e2 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2740,8 +2740,6 @@ void RequestManager::terminate_background_server() {
            std::to_string(total_tokens / (total_time / 1e6)) + ")";
 
     double average_latency_per_request = 0;
-
-    // information dump
     for (auto const &profiling_info : profiling_requests) {
       int request_id = profiling_info.first;
       Request &request = all_requests[request_id];

From 1792981ac43678cb9dba2a0edcad15469b8ec92c Mon Sep 17 00:00:00 2001
From: Bob Chen <70640928+Bob-Chen222@users.noreply.github.com>
Date: Thu, 7 Nov 2024 20:51:12 -0500
Subject: [PATCH 623/667] Update request_manager.cc

---
 src/runtime/request_manager.cc | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index f076817e2..8b4cccafb 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2740,14 +2740,6 @@ void RequestManager::terminate_background_server() {
            std::to_string(total_tokens / (total_time / 1e6)) + ")";
 
     double average_latency_per_request = 0;
-    for (auto const &profiling_info : profiling_requests) {
-      int request_id = profiling_info.first;
-      Request &request = all_requests[request_id];
-      if (request.status != Request::COMPLETED) {
-        continue;
-      }
-    }
-
     std::string latency_per_request_ms = "\n latency_per_request_ms( ";
     for (auto const &profiling_info : profiling_requests) {
       double latency_ms = (profiling_info.second.finish_time -

From 95023e6e775c8ef575fd1f858cccc59f7cec4641 Mon Sep 17 00:00:00 2001
From: Bob-Chen222 <qinghanc@andrew.cmu.edu>
Date: Thu, 7 Nov 2024 18:06:08 -0800
Subject: [PATCH 624/667] final update

---
 inference/models/llama.cc      | 3 ++-
 src/runtime/request_manager.cc | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/models/llama.cc b/inference/models/llama.cc
index c48077de7..414306877 100644
--- a/inference/models/llama.cc
+++ b/inference/models/llama.cc
@@ -64,7 +64,8 @@ void LLAMA::create_llama_model(FFModel &ff,
 
   Tensor w2 = nullptr;
 
-  //metadata that needs to be sent to page manager in order to calculate the kv cache per layer
+  // metadata that needs to be sent to page manager in order to calculate the kv
+  // cache per layer
   ff.set_num_transformer_layers(llama_config.num_hidden_layers);
   ff.set_num_kv_heads(llama_config.num_key_value_heads);
   int qkv_dim = llama_config.hidden_size / llama_config.num_attention_heads * 2;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 8b4cccafb..6f1370aff 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1174,7 +1174,6 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
         get_num_blocks_allocated(*request);
     if (bc.requestsInfo[request_index].num_kv_pages == 0) {
       // turn this request into not available for one round
-      profiling.num_disabled++;
       bc.request_available[request_index] = false;
     }
     bc.requestsInfo[request_index].kv_last_page_len =

From 9ce11b2b91d4302a6223c698fd6f1d29d398d143 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sat, 9 Nov 2024 03:38:47 +0000
Subject: [PATCH 625/667] feat: load weights in parallel

---
 include/flexflow/model.h             |   3 +
 include/flexflow/utils/file_loader.h |  28 ++++++++
 inference/python/chat.py             | 103 +++++++++++++++++++++++++++
 src/c/flexflow_c.cc                  |   5 +-
 src/mapper/mapper.cc                 |   6 ++
 src/runtime/file_loader.cc           |  86 ++++++++++++++++++++++
 src/runtime/model.cc                 |  45 ++++++++++++
 src/runtime/request_manager.cc       |   6 +-
 8 files changed, 278 insertions(+), 4 deletions(-)
 create mode 100644 inference/python/chat.py

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 32177a383..09a8dafc7 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -253,6 +253,9 @@ enum TaskIDs {
   RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
   RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
   RM_BACKGROUND_SERVING_TASK_ID,
+  LOAD_FLOAT_WEIGHT_TASK_ID,
+  LOAD_HALF_WEIGHT_TASK_ID,
+  LOAD_QUANT_WEIGHT_TASK_ID,
   // Custom tasks
   CUSTOM_GPU_TASK_ID_FIRST,
   CUSTOM_GPU_TASK_ID_1,
diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h
index 8861cfc48..a6771ee6a 100644
--- a/include/flexflow/utils/file_loader.h
+++ b/include/flexflow/utils/file_loader.h
@@ -39,7 +39,26 @@ class FileDataLoader {
   void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx);
 
   void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx);
+#ifdef DEADCODE
   void load_weights(FFModel *ff);
+#endif
+
+  static void
+      load_float_weight_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+                             Legion::Context ctx,
+                             Legion::Runtime *runtime);
+  static void
+      load_half_weight_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+                            Legion::Context ctx,
+                            Legion::Runtime *runtime);
+  static void
+      load_quant_weight_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+                             Legion::Context ctx,
+                             Legion::Runtime *runtime);
+  void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime);
 
   void load_positions(FFModel *ff,
                       Tensor pt,
@@ -54,3 +73,12 @@ class FileDataLoader {
   std::string weights_folder;
   bool use_full_precision;
 };
+
+struct WeightLoadTaskArgs {
+  FFModel *ff;
+  FileDataLoader *loader;
+  Layer *layer;
+  int weight_idx;
+  WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx)
+      : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {}
+};
diff --git a/inference/python/chat.py b/inference/python/chat.py
new file mode 100644
index 000000000..95132443a
--- /dev/null
+++ b/inference/python/chat.py
@@ -0,0 +1,103 @@
+# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import flexflow.serve as ff
+import argparse, json, os
+from types import SimpleNamespace
+
+
+def get_configs():
+    # Define sample configs
+    ff_init_configs = {
+        # required parameters
+        "num_gpus": 8,
+        "memory_per_gpu": 34000,
+        "zero_copy_memory_per_node": 200000,
+        # optional parameters
+        "num_cpus": 16,
+        "legion_utility_processors": 16,
+        "data_parallelism_degree": 1,
+        "tensor_parallelism_degree": 8,
+        "pipeline_parallelism_degree": 1,
+        "offload": False,
+        "offload_reserve_space_size": 8 * 1024,  # 8GB
+        "use_4bit_quantization": False,
+        "use_8bit_quantization": False,
+        "enable_peft": False,
+        "peft_activation_reserve_space_size": 1024,  # 1GB
+        "profiling": False,
+        "benchmarking": False,
+        "inference_debugging": False,
+        "fusion": True,
+    }
+    llm_configs = {
+        # required parameters
+        "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+        # optional parameters
+        "cache_path": os.environ.get("FF_CACHE_PATH", ""),
+        "refresh_cache": False,
+        "full_precision": False,
+    }
+    # Merge dictionaries
+    ff_init_configs.update(llm_configs)
+    return ff_init_configs
+
+
+def main():
+    configs_dict = get_configs()
+    configs = SimpleNamespace(**configs_dict)
+
+    # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
+    ff.init(configs_dict)
+
+    # Create the FlexFlow LLM
+    ff_data_type = (
+        ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
+    )
+    llm = ff.LLM(
+        configs.llm_model,
+        data_type=ff_data_type,
+        cache_path=configs.cache_path,
+        refresh_cache=configs.refresh_cache,
+    )
+
+    # Compile the LLM for inference and load the weights into memory
+    generation_config = ff.GenerationConfig(
+        do_sample=False, temperature=0.9, topp=0.8, topk=1
+    )
+    llm.compile(
+        generation_config,
+        max_requests_per_batch=1,
+        max_seq_length=2048,
+        max_tokens_per_batch=256,
+    )
+
+    llm.start_server()
+
+    nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature."
+    llama_generic_system = "You are a helpful an honest programming assistant."
+
+
+    messages=[
+        {"role": "system", "content": nemotron_system},
+        {"role": "user", "content": "Is Rust better than Python?"},
+    ]
+    llm.generate(messages, max_new_tokens=1024)
+    
+    llm.stop_server()
+
+
+if __name__ == "__main__":
+    print("flexflow inference example (incremental decoding)")
+    main()
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index b81552043..39d7b1ec5 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2831,5 +2831,8 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
                                             flexflow_model_t model_handle_) {
   FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_);
   FFModel *model = FFCObjectWrapper::unwrap(model_handle_);
-  handle->load_weights(model);
+  // handle->load_weights(model);
+  Context ctx = model->config.lg_ctx;
+  Runtime *runtime = model->config.lg_hlr;
+  handle->load_weights_parallel(model, ctx, runtime);
 }
diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index 037ad1819..5314e9dfe 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -292,6 +292,12 @@ void FFMapper::select_task_options(MapperContext const ctx,
     output.initial_proc = all_cpus[0];
     return;
   }
+  if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) ||
+      (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) ||
+      (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) {
+    output.initial_proc = all_cpus[0];
+    return;
+  }
   if (task.task_id == TOP_LEVEL_TASK_ID) {
     output.initial_proc = all_cpus[0];
     // control replicate top level task
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index 05f3ec23f..bd6a862d0 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -16,6 +16,7 @@
 #include "flexflow/utils/file_loader.h"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/inference.h"
+#include "flexflow/model.h"
 
 #include <vector>
 using namespace std;
@@ -786,6 +787,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
   free(data);
 }
 
+#ifdef DEADCODE
 void FileDataLoader::load_weights(FFModel *ff) {
   for (Layer *l : ff->layers) {
     if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) {
@@ -814,3 +816,87 @@ void FileDataLoader::load_weights(FFModel *ff) {
     }
   }
 }
+#endif
+
+void FileDataLoader::load_float_weight_task(
+    Legion::Task const *task,
+    std::vector<Legion::PhysicalRegion> const &regions,
+    Legion::Context ctx,
+    Legion::Runtime *runtime) {
+  WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
+  args->loader->load_single_weight_tensor<float>(
+      args->ff, args->layer, args->weight_idx);
+}
+
+void FileDataLoader::load_half_weight_task(
+    Legion::Task const *task,
+    std::vector<Legion::PhysicalRegion> const &regions,
+    Legion::Context ctx,
+    Legion::Runtime *runtime) {
+  WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
+  args->loader->load_single_weight_tensor<half>(
+      args->ff, args->layer, args->weight_idx);
+}
+
+void FileDataLoader::load_quant_weight_task(
+    Legion::Task const *task,
+    std::vector<Legion::PhysicalRegion> const &regions,
+    Legion::Context ctx,
+    Legion::Runtime *runtime) {
+  WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
+  args->loader->load_quantization_weight(
+      args->ff, args->layer, args->weight_idx);
+}
+
+void FileDataLoader::load_weights_parallel(FFModel *ff,
+                                           Context ctx,
+                                           Runtime *runtime) {
+  std::vector<Future> futures;
+
+  for (Layer *l : ff->layers) {
+    if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) {
+      continue;
+    }
+    for (int i = 0; i < l->numWeights; i++) {
+      Tensor weight = l->weights[i];
+      if (weight == NULL) {
+        continue;
+      }
+
+      // Create task arguments
+      WeightLoadTaskArgs args(ff, this, l, i);
+
+      switch (weight->data_type) {
+        case DT_HALF: {
+          TaskLauncher launcher(
+              LOAD_HALF_WEIGHT_TASK_ID,
+              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
+          futures.push_back(runtime->execute_task(ctx, launcher));
+          break;
+        }
+        case DT_FLOAT: {
+          TaskLauncher launcher(
+              LOAD_FLOAT_WEIGHT_TASK_ID,
+              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
+          futures.push_back(runtime->execute_task(ctx, launcher));
+          break;
+        }
+        case DT_INT4:
+        case DT_INT8: {
+          TaskLauncher launcher(
+              LOAD_QUANT_WEIGHT_TASK_ID,
+              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
+          futures.push_back(runtime->execute_task(ctx, launcher));
+          break;
+        }
+        default:
+          assert(false && "Unsupported data type");
+      }
+    }
+  }
+
+  // Wait for all tasks to complete
+  for (Future &f : futures) {
+    f.get_void_result();
+  }
+}
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index ecd92700d..f48fab25b 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -4533,6 +4533,51 @@ void register_flexflow_internal_tasks(Runtime *runtime,
           registrar);
     }
   }
+  {
+    TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID,
+                                   "load_float_weight_task");
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+    if (pre_register) {
+      Runtime::preregister_task_variant<FileDataLoader::load_float_weight_task>(
+          registrar, "load_float_weight_task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<FileDataLoader::load_float_weight_task>(
+          registrar);
+    }
+  }
+  {
+    TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID,
+                                   "load_half_weight_task");
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+    if (pre_register) {
+      Runtime::preregister_task_variant<FileDataLoader::load_half_weight_task>(
+          registrar, "load_half_weight_task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<FileDataLoader::load_half_weight_task>(
+          registrar);
+    }
+  }
+  {
+    TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID,
+                                   "load_quant_weight_task");
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+    if (pre_register) {
+      Runtime::preregister_task_variant<FileDataLoader::load_quant_weight_task>(
+          registrar, "load_quant_weight_task");
+    } else {
+      if (enable_control_replication) {
+        registrar.global_registration = false;
+      }
+      runtime->register_task_variant<FileDataLoader::load_quant_weight_task>(
+          registrar);
+    }
+  }
   // ElementUnary task
   {
     TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 43513627a..61d3b25f6 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2347,7 +2347,7 @@ void RequestManager::serve_decoding(FFModel *llm) {
   assert(im->model_weights_loaders.find(llm) !=
          im->model_weights_loaders.end());
   // Load model weights
-  im->model_weights_loaders[llm]->load_weights(llm);
+  im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime);
   // init operators
   im->init_operators_inference(llm);
   // Legion futures for inc_decoding and spec_infer
@@ -2401,7 +2401,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     assert(im->model_weights_loaders.find(llm) !=
            im->model_weights_loaders.end());
     // Load model weights
-    im->model_weights_loaders[llm]->load_weights(llm);
+    im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime);
     // init operators
     im->init_operators_inference(llm);
   }
@@ -2412,7 +2412,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) {
     assert(im->model_weights_loaders.find(ssm) !=
            im->model_weights_loaders.end());
     // Load model weights
-    im->model_weights_loaders[ssm]->load_weights(ssm);
+    im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime);
     // init operators
     im->init_operators_inference(ssm);
   }

From b885c63f216a55047fa6486a86e7592112256b60 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 14 Nov 2024 07:53:40 -0800
Subject: [PATCH 626/667] fix: compile bug

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 61d3b25f6..55ee6ea5e 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2475,7 +2475,7 @@ void RequestManager::serve_spec_infer_sync(FFModel *llm) {
     assert(im->model_weights_loaders.find(llm) !=
            im->model_weights_loaders.end());
     // Load model weights
-    im->model_weights_loaders[llm]->load_weights(llm);
+    im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime);
     // init operators
     im->init_operators_inference(llm);
   }
@@ -2486,7 +2486,7 @@ void RequestManager::serve_spec_infer_sync(FFModel *llm) {
     assert(im->model_weights_loaders.find(ssm) !=
            im->model_weights_loaders.end());
     // Load model weights
-    im->model_weights_loaders[ssm]->load_weights(ssm);
+    im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime);
     // init operators
     im->init_operators_inference(ssm);
   }

From 9e062be7d5b565de4a53702e46120d5d90f600fe Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 14 Nov 2024 08:28:00 -0800
Subject: [PATCH 627/667] feat: upgrade to llama3 rope

---
 .../inc_multihead_self_attention_kernels.cu   | 40 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index ea65b1fce..28257edbd 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -358,7 +358,6 @@ void apply_pos_encoding_to_tokens_in_batch(
       m->local_hidden_size);
 }
 
-// TODO: upgrade to llama3 rope, same as apply_pos_encoding_to_tokens_in_batch
 __global__ void apply_pos_encoding_to_streaming_proj_kernel(
     half *kv_cache,
     BatchConfig::PerRequestInfo const *requestInfos,
@@ -366,6 +365,12 @@ __global__ void apply_pos_encoding_to_streaming_proj_kernel(
     int const max_num_pages,
     int num_kv_heads,
     int head_dim,
+    float rope_theta,
+    bool llama3_rope,
+    float factor,
+    float low_freq_factor,
+    float high_freq_factor,
+    int original_max_position_embeddings,
     StreamingCacheInfo const *streaming_cache_infos,
     uint32_t const max_num_requests) {
   int const kv_hidden_size = num_kv_heads * head_dim;
@@ -398,7 +403,27 @@ __global__ void apply_pos_encoding_to_streaming_proj_kernel(
   // Apply the rotary position encoding.
   cuFloatComplex cii = {kv_cache[real_part_idx], kv_cache[complex_part_idx]};
   size_t pos = token_idx;
-  float freq = pos * (1.0 / pow(10000.0, (float)2 * offset_in_head / head_dim));
+  float freq = pos * (1.0 / pow(rope_theta, (float)2 * offset_in_head / head_dim));
+
+  if (llama3_rope) {
+    float pi = CUDART_PI_F;
+    float wavelen = 2 * pi / freq;
+    float low_freq_wavelen =
+        original_max_position_embeddings / low_freq_factor;
+    float high_freq_wavelen =
+        original_max_position_embeddings / high_freq_factor;
+    if (wavelen < high_freq_wavelen) {
+    } else if (wavelen > low_freq_wavelen) {
+      freq = freq / factor;
+    } else {
+      assert(low_freq_wavelen != high_freq_wavelen);
+      float smooth =
+          (original_max_position_embeddings / wavelen - low_freq_factor) /
+          (high_freq_factor - low_freq_factor);
+      freq = ((1 - smooth) * freq / factor + smooth * freq);
+    }
+  }
+
   cuFloatComplex complex_pos = {cos(freq), sin(freq)};
   cii = cuCmulf(cii, complex_pos);
   kv_cache[real_part_idx] = cii.x;
@@ -411,6 +436,10 @@ void apply_pos_encoding_to_streaming_proj(
     BatchConfig const *bc,
     cudaStream_t stream) {
   assert(m->streaming_cache);
+  // apply rotary embedding if needed
+  if (!m->rotary_embedding_meta->apply_rotary_embedding) {
+    return;
+  }  
   int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
   int num_tokens = 0;
   for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch();
@@ -427,6 +456,7 @@ void apply_pos_encoding_to_streaming_proj(
   int const max_num_pages = round_up_pages(
       BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
       BatchConfig::max_spec_tree_token_num());
+  bool llama3_rope = (m->rotary_embedding_meta->rope_type == "llama3");
   apply_pos_encoding_to_streaming_proj_kernel<<<GET_BLOCKS(parallelism),
                                                 min(CUDA_NUM_THREADS,
                                                     parallelism),
@@ -438,6 +468,12 @@ void apply_pos_encoding_to_streaming_proj(
       max_num_pages,
       m->num_kv_heads,
       m->qk_dim,
+      m->rotary_embedding_meta->rope_theta,
+      llama3_rope,
+      m->rotary_embedding_meta->factor,
+      m->rotary_embedding_meta->low_freq_factor,
+      m->rotary_embedding_meta->high_freq_factor,
+      m->rotary_embedding_meta->original_max_position_embeddings,
       m->streaming_cache_infos,
       bc->max_requests_per_batch());
 }

From b798385bdb1148852361f5ad794f853e2425d0d5 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Fri, 15 Nov 2024 12:17:26 -0500
Subject: [PATCH 628/667] Specscheduler evaluation support code (#1541)

---
 .gitignore                                    |   3 +
 CMakeLists.txt                                |   1 +
 benchmarking/average_accepted_tokens.pdf      | Bin 0 -> 15738 bytes
 benchmarking/benchmark_incr_dec.sh            |  88 ++
 benchmarking/benchmark_specinfer.sh           | 109 +++
 benchmarking/get_sharegpt_trace.py            | 206 +++++
 benchmarking/get_wildchat_trace.py            |  64 ++
 benchmarking/plot_results.ipynb               | 776 ++++++++++++++++++
 .../queueing_time_vs_arrival_rate.pdf         | Bin 0 -> 18042 bytes
 benchmarking/throughput_vs_tpot.pdf           | Bin 0 -> 28243 bytes
 benchmarking/ttft_vs_arrival_rate.pdf         | Bin 0 -> 16898 bytes
 include/flexflow/batch_config.h               |   1 +
 include/flexflow/config.h                     |   1 +
 include/flexflow/inference.h                  |   1 +
 include/flexflow/model.h                      |   4 +-
 .../ops/spec_inc_multihead_self_attention.h   |   2 +
 ...spec_inc_multihead_self_attention_params.h |   3 +-
 include/flexflow/optimizer.h                  |   8 +-
 .../parallel_ops/kernels/allreduce_kernels.h  |   8 +-
 include/flexflow/request_manager.h            |  88 +-
 include/flexflow/utils/communication_buffer.h |   5 +-
 include/flexflow/utils/file_loader.h          |  52 +-
 include/flexflow/utils/memory_allocator.h     |   5 +-
 inference/incr_decoding/incr_decoding.cc      |  23 +-
 inference/simplified_infer/CMakeLists.txt     |  74 ++
 inference/simplified_infer/incr_dec.cc        | 473 +++++++++++
 inference/simplified_infer/specinfer.cc       | 692 ++++++++++++++++
 inference/spec_infer/spec_infer.cc            |  48 +-
 inference/trace_generator/trace_generator.cc  |  48 +-
 inference/utils/mem_analysis.py               | 115 +++
 python/flexflow/core/__init__.py              |   1 +
 python/flexflow/serve/models/llama.py         |   4 +
 python/flexflow/serve/serve.py                |   4 +-
 src/c/flexflow_c.cc                           |   3 +-
 src/mapper/mapper.cc                          |  37 +-
 src/ops/add_bias_residual_layer_norm.cpp      |   3 +-
 src/ops/add_bias_residual_layer_norm.cu       |   3 +-
 src/ops/arg_topk.cu                           |   4 +-
 src/ops/argmax.cpp                            |   3 +-
 src/ops/argmax.cu                             |   6 +-
 src/ops/fused.cc                              |   4 +-
 src/ops/fused.cpp                             |   2 +-
 src/ops/gumbel_topk.cu                        |   2 +-
 src/ops/inc_multihead_self_attention.cpp      |   6 +-
 src/ops/inc_multihead_self_attention.cu       |   6 +-
 src/ops/kernels/linear_kernels.cu             |   2 +-
 src/ops/kernels/residual_rms_norm_kernels.cpp |   3 +-
 src/ops/kernels/residual_rms_norm_kernels.cu  |   3 +-
 src/ops/kernels/rms_norm_kernels.cpp          |   3 +-
 src/ops/kernels/rms_norm_kernels.cu           |   3 +-
 src/ops/layer_norm.cu                         |   3 +-
 src/ops/residual_layer_norm.cpp               |   3 +-
 src/ops/residual_layer_norm.cu                |   3 +-
 src/ops/sampling.cpp                          |   6 +-
 src/ops/sampling.cu                           |   6 +-
 src/ops/spec_inc_multihead_self_attention.cc  |  23 +-
 src/ops/spec_inc_multihead_self_attention.cpp |   6 +-
 src/ops/tree_inc_multihead_self_attention.cpp |   6 +-
 src/parallel_ops/allreduce.cc                 |   6 +-
 .../kernels/allreduce_kernels.cpp             |  12 +-
 src/parallel_ops/kernels/allreduce_kernels.cu | 129 +--
 src/runtime/batch_config.cc                   |  42 +
 src/runtime/file_loader.cc                    | 217 ++---
 src/runtime/graph.cc                          |  36 +-
 src/runtime/inference_manager.cc              |  72 +-
 src/runtime/memory_allocator.cc               |  31 +-
 src/runtime/model.cc                          |  49 +-
 src/runtime/optimizer_kernel.cpp              |  12 +-
 src/runtime/optimizer_kernel.cu               |   8 +-
 src/runtime/parallel_tensor.cc                |  60 +-
 src/runtime/request_manager.cc                | 326 ++++++--
 src/utils/communication_buffer.cu             |  22 +-
 tests/inference/huggingface_inference.py      |   2 +-
 73 files changed, 3585 insertions(+), 495 deletions(-)
 create mode 100644 benchmarking/average_accepted_tokens.pdf
 create mode 100755 benchmarking/benchmark_incr_dec.sh
 create mode 100755 benchmarking/benchmark_specinfer.sh
 create mode 100644 benchmarking/get_sharegpt_trace.py
 create mode 100644 benchmarking/get_wildchat_trace.py
 create mode 100644 benchmarking/plot_results.ipynb
 create mode 100644 benchmarking/queueing_time_vs_arrival_rate.pdf
 create mode 100644 benchmarking/throughput_vs_tpot.pdf
 create mode 100644 benchmarking/ttft_vs_arrival_rate.pdf
 create mode 100644 inference/simplified_infer/CMakeLists.txt
 create mode 100644 inference/simplified_infer/incr_dec.cc
 create mode 100644 inference/simplified_infer/specinfer.cc
 create mode 100644 inference/utils/mem_analysis.py

diff --git a/.gitignore b/.gitignore
index f21d30b2a..d7917b34d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -192,3 +192,6 @@ inference_tensors
 tests/inference/python_test_configs/*.json
 
 core.*
+*.out
+sharegpt.json
+wildchat.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 577a2215d..978d84de4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -580,6 +580,7 @@ if(NOT BUILD_LEGION_ONLY)
 
   if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
     add_subdirectory(inference/spec_infer)
+    add_subdirectory(inference/simplified_infer)
     add_subdirectory(inference/incr_decoding)
     add_subdirectory(inference/trace_generator)
   endif()
diff --git a/benchmarking/average_accepted_tokens.pdf b/benchmarking/average_accepted_tokens.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..896519d5d47abe1cf317cf20a46c0599432e6e0f
GIT binary patch
literal 15738
zcmb_@2{={V7q^h%nh7bxxrR!HJ6|r5IWoj0Q<Ui%ifg(gO(-HHbA!s1h(t+c&MXQk
zl4e3tq$DN2eXe@n(*1w0=X*Y${hWQ**~8jvt+Uo|@3W4axt@U%Mj4NgD|!kQRv<7C
z8uE47jo7paLRszeqd+KKXPUFOuLp!OclMwJLRip%3~6d2C{#DNq3UlP41B3H2uE*#
zvfbvgo8n4?R2F~ghtRfI(wu1&2){@$cc#%O0aS<p|3siHJ)PaWs2&jU=iLBbS4#>F
z+5x83GXShmLTC`m$Oo{X^Yg0v^J)Y+{>BdG?*+ho!`uf@0C$V*qsWv%-=F|jupXFy
zGK4avxOqA2_=bQP(coW|h=&L&M92|=(glkGJp&=qVoPl*)ffE2z|DRa1bq9K<V+}3
z51J>0`8k_`mp51|gfj330??zl`npkImIG-46lWhqNJi7C0qP);KWbujRTVAl`0!*$
zb?knQ3PXgqYo=E0&fF8P-S&*URISvyP_tQHKfAoVSNrV4Fh1P6V+{#7adOYZyKO45
z-$s(f7QSSd?)UoG8hyBNf1BJw^o)nu9P5N!^2uIB)gu$z=gXhJO82h4Dpu*z<BC71
z7b|3d^}7@qJo=_uv^drJirYmYq?QSfI&#Opmj^vYq}Qy?iQIm`^3$%bO6d`+>_juB
zL?Zm(x9d(mh{`P6p6L7X^BpD5hH2iM+vsx*@p@09SNoJ`lFpswv+AGbo2&Id?~{f)
zmB(8X_^i>qgeohMiXM`#vVGIc(zGyoOGe&GYTfY)g6)kYPV1qsyUzr_4maDDupMF8
z^g<?DpsVDo*ahXS$IZSu@fS3FwOOTpXw3Ruee$^x%#K1qFS4zEw$}9ltyRc<_uN~r
zbdImE)2)&&v)Rh&a}c8JdlM1;WTp9zqJHoDd7Jr%TSjb1--px3RWxU(zrCd%46MvI
zvk@JO-D+dseqCl>{X(tBi_ZRIHsLOD8?WUg?qsE03a>VgE3MbmMUoA#<!>n{8hfvQ
z<F1~)vd(sV;i0O7PpRic2Kv~Jj(%`YKUB+cf3w=e%0YhQ{ELXJjpJ;e>K?4#zUJNr
z6uQ%`Qkrr!UM>Fi;f(8*?V;08QfB&6bkXV6MlNOtCp@~I&E*K(H#{cqiYfYjcj(Oy
zlZ?2|{ltFim&kX1tSst0$Fj}|tUkI=RqkTqP|g~pj<2?!mtgZ)raDwT>?e6Km93@j
zE(cGftiR;;hr2UY7<lI2%*^6Eo0s<dbH7DC?+B)(he_;-7&1HNXrHRHVPUltGW*3*
zL%k1d6XMf)p^vztQ-bZKS=3$iq<Iwul<l{<YIW>j!mM~3`}*B?etD~7EMkwn!nohZ
zF<~Sh-3vicqN(0or40`rKC0s{L0+ofkew3w<9on0{{u${Vp`t>OcgwLP~0=$)AKDM
zv|*xHbzYWFDBnN1DZA!HZ~6gV$Kuih0W%{4EwhqxCr#fCYzb*NUv2c5vc>&y{2B)z
zrkh1}W2g!3uZPQOgqwq>Q~NKuH*L7ucg=xG>C<*OQEZ{i@%?S4?_Y|LyJx=FkpxoH
zwJ+Jk@Tuf%s5nfE?MB?)lcqJb-<I^EcSf%Jf#q128lO+9P)TWv#n*kIp<hm1I4}7T
z(c<|)L|^d*Ze5FqAKCff=TEWQrn+0&ubw~n;LQg7yuEm7{Z6u<Q*f!H!>3PYYu~l(
zeR`ll*DElz$Wk)srk9>^V7j}Wylb2}wq;#0=LV7*k@LZr|MbJHA(5uCDWMmG`qe^`
zFMre;J3pXoThLOT%97V*XQX3cU=^2EmL-2ezp_zU{RQF{htO0_`15y}@;>WIC5S<f
zF8UT)cbjIFSewjn*~ijm&jxj_;jQ)_z9hC!R3WtF<1yyP*GwC$akr|r<{IyNbhjqm
zYs1Y>J+?}D;r(6$G&gVO(1qe#dwSHAXw&Zl_G{$Jjr&GDYr6{j>YvW$r`M)?CfHkH
zFj(v_S59|nFmVU)d%qgO{(FgABVm9m`wy2!h^yqOLv!fu&$}h?RW)oTCVSPFd7o${
z(We@3V?KF1xt%*EQ#L4aEzi%=glv|-mwWd`6$$lGxqCnGZK29*KJ)oc3zwf|NwSbl
zJf@f4p6c;t*8xp#UvU=(Yxz9{{q($8;J^QqM-B4rL>3Hp`%Oez-7d05&24|DnG2G2
z8tNE~u>h0y4Ap|PcC(LMd&1mfiigoq+DxX;R7LhXwt}-zV4HuNuuIz1EkRikCCa*9
zkIP4<{CRc@*GvlGgnE!hje-H>1nHuLt4I2##h>m{c{{g8Op4}h9%`Iam}%8e6Hq3o
zp7zf1$oP@Gvk%_kdWy%xE`E$EdYEf=SYo<XMzKEkxuHHgQlz%0UE-QzuK2@&+lN?Q
zIlhmu-F9WuIKS6(>ri9ij^Peg6EPq4qOcowuJu-&lOp$!5ACBnMAfcD&kjB9zJuSs
zZ;f;VIr7MRjU0l*=o&%ehFPO#*5TU8Z$+&>hZmUqJUuEH2$Vr6BP>JbxNG?BX!DXX
zCc&t*VZ^XO8bUtGK}$0_wSA>f`-<z_VR&0FUK4#@8#bFYOto=5)_+fpg`?^cb0aQi
zY68J#aq7H0gFmLGczW@=zX`JwY1U#h6w7dC%Dr<Y&?z+F7WUQyrgmZe4hBs8jv;;;
z6bZcydpgk$+q~|C$sLYYmdxL+w)3^k_eC8Dy~DSo^%kcHw^YuP$W#{Jh|={zPL4ZS
zgmpy3+=d3!MPs+Ri`1T3S0MC2*8jO)mb&TIb9`kz11}>-Sf<}fj5FX61{O=iq@wcQ
z91`O1z#OtH01j0n&A${})_m}zR!Q%`IF4gH^qOC;iO?5;1L5--pC722m$jORc|N&b
z^H}!wk#w1q^=BRwx=fGg^F5PlJHMKn0aGyYTf!6$zYOIO$m$QyYIBU$T8=WGw|m~c
zFQYB@Yv2NrSzTDMM2GnSGyCa~802M(;;(Ys&^y;p?ycSwf1#!Fr2V8-uGB4}&)^T+
z>+29{28xmWEj3bdLm|88)-r2|dez6781Txn_}BK1bcQC&AA1po_c4=_Gvdo$S-GR(
z1d0o57)owPIyYBRywX}Ui!;#GR95Dab>8soW1K^?kgII|nU%6{g=PvO1iYGEM~dY5
zu;SaMUpHO7`O$tDGkxu{Rx|OAxc7S8t5y1}uSHX2Rs2OwC7QZBYy&I1%*jJFmB*5k
zch00I=hrD9Gn(w{F}mCAhTSe{b6Q==V8Rvk)u7FK3^@1huBcY{GX1GTHf}&X*m`T;
z9TM*cN-`<J?W+1Z@A^UhPtP?X5>Khr#wtZ9q?7vGzLiSpuYG&u^|y>d`Rl@RbGoUU
zN?)&8xbWip?5DYT_P7m)WEf}%#@$P_LnQvU$>4B!E3|nRfmzT!YTX}C7X6{_FzE<W
zbhWgUO<Qr2h$S59;TOwX;m4+T7=Fgddsfvk-~@*JOE^(c{clc8JMQp=nRBposav9%
zcV$V<V49D7IDEs&rK$`iyGm^LJ7K?`+ujv?oMqG%*J(HS&3JN$?bmA_KnHiy?l~O`
z&)M)}^8UGiO;<OIK4ZWYyz^MX6%M_OXG$Z<m<pnZ?u=%nGJoB-{_~%5H&mQ@et2NL
z+}q>g{UZ9i(<)5tEG19klaC&LJLy+?Zc5#|)7|~0eRg`xo0O3*VV4it2l_h<9PQRO
zy*uizc5~E%B+5=Sy?mii@!~V(5Z=CZ=XZP&*j&|>Dfy6~%&9`N$)U*OT`zw>{9ylF
z9QNgI6Gv}y;eM%2JN34Yj<Rowk#<cN9U0W-e7m*Y(>^+NuRe?RhpDo=`#C=hJm1}V
zjlJO1eRX;Fk}(=tnJRh=DcL%eKlvu-@gAMduO$j{pA8>J;R4ow@N6D_*!ue9{PTEL
z<Ah>Q<foTcslCchRhDY{E^cN4MP9Mn&1TG3_1tc#=H{RF*4LE}OnmfiSHpd-J!#$|
zAzQz#l+1ay&Y0J_e<0s%;>xD}t{Ex&KAIE**)sqcSo|^~xPGE(z)B>?(V6qvsL9Ji
zHX5C|WIK}&wLJH#WEJcqhgo`<g|<uz>W;^x3Q1uPphmF=ZerCZ)%s8SZM#KvXrMIU
zoOirb-`K9cQ+IDUdGoof%M)uz&y~pp8pZ8GXx167j+4>F0(LZo^dPOCln>-rYrJ<v
zteW0hwEkQsepKM@<ldA$2ZDsQT`e85x`(vCcSUH;@e=P4?T2l$0ylVK*QW&{EoLvT
zEmTu>N|t;5IqYd>s{4UZvXKD#$bq6Z&kkl{S8}C?X!99kN&OQx_){h7Muf?*nb+kt
zGerM{PoiHV_r6$A;62#Kta+Hpc;1nXf$$i?9~}0-pC^Ka5w#J`UHM&ju8yhqL)H4G
z%wt~``p9=zacIev-;#52m0?C<zLKJ!eoZ3etUKfpU^Az<Epki!m6t}V!qZ>v4321D
z+f3Qai<7EKx4xg-oXp*-Pt@2XfPE~-uZFH<3wrGSB&G)W>bea1o<UkqpWx>Q8(HsK
zmcMu#A?Q>1E%190m(TYnLT17H-x<9TiXKTTs-p}FCpX!$X;|Dg#vDBP&O2*;R@H_J
zN4zz6N9(0;@OT+>=gx*M-tpV-Jzj4|dOAM4<l~xeu(hS9mdDgIiDOoe)1w^!L1g}L
zXLU{AcRn+ul>h^QF@QHJxc`<Hjz|O;1C3`*G%#R8<Y`$SM~WKOb+zgt)460hTmZ&k
z(^`|Q%3i$gm;y@&n^$B$dnxoTUMW6nqkv+xH|!8EwTdzT<2Xj3XBiF=M=Uc345`?3
z5%2(cVX=TYtc4+F2s}INxWL;^5hSOym=!0TIb`pw3Ap8CA6TabT{FKzau9RyV)D-x
z^!QsiI0o1U3yA*-rU~0fNvufj_I;6>=<TZqv}!M1DknakJzSA=+U(>cvaLr*;&jMq
z8JQCImH~S=$o-|INmK2YGCrZpGcP5jQ)0QCqZ0jkiw3FZPlzrAr*_)1PY}Nzx7*48
zxZb4KM#MvTV5O^9MkA?>8s{if8Xt7!lBMkybQbqfT6seX6lj(Ex$OjYcvLlU{rT95
zF9V9fq(i)8T*P>!@W|JVp%KI*+2i{rnAZeoFE~E?Dyd&NDhI9Q6zR2a`H=DE=&tHu
zn&gcQf`K|`w3UUB_ug*I<3D3wbUi1NY~fd+d1<uh!r?c&9F!QyivbeD;z|E4FAa0`
z6M9JQF;XOIQg%wlPtu@J$$jEQG0PcwbCW~mIN2_N@Gab!l&gbsS2BykYh!gjQeTf9
zsfP-6$<X~!zG&Kb9x+i!O(Fxg*;&Q)M4di{E#oaMPlJ6O>3comXy97PxV7Q>n9I^9
z-r-KjpWnFuLWqds`&QzP^+_5-&?_=IrtRW|l)IaK?xrBr+`Cy71#iD}6Ean8uQVdZ
ziKSrQN#+GhA1vC`nxi)AW_rI!=5&A0wt>@M#r<yXdg5l5+Cr1;_?l66{|HIAVrDRg
z$Zi_{WORH+$ZI2tTq$9H?og0m>a*1&s|C|N4vW2TTbN~^KV~n@K!S{5BZ;sK36hPe
zO<EiSBdZ)TA55O@+(+L2tm?b(9$j+5#;O=hGy34&lega(I|}+ow%y!Wykk?|0p%U`
zr}sry#>UHOmOoAZfGl^UDVM5BY?<Q1p4+IGfA4a4)>oahNcNLvlUZE#2TAjTsIWVI
zB|NuYX<)TWWQXN-nyD6IX9TqcoJSIc^K;EF*ww$V_%JquUAfyg?jo0~_-D?tl{!9#
zjm+<~nDoX|tgf6q;UOzjEBwsEMciqm``U_PF)O{a{o^gs1^4sgs`MH)x0!z1kY=J}
z-AuI<+clRG_wWF1<;DX1RLWG>_Um0-wlVtzzmVn(zmD{JBcs%)y;Cu7q;O4G!TV0e
zB&V$O!7VQb?1BV}C2rrkPYahy?e#M$8#~lGy=|U0V1F;DDu5$f>y4@QWF>F0L1|+n
zR`XoEd}y{}$(l1&2Dm%U`~tl*4XrF|egstU<1g)zF0++Bd&5O9{<wFbMm1}%$wU*W
zw?U}hd*WPg)-@b%BQoMiMbz#wt>Ei5pZv-%Y{rdd`!LWc1F(s~Edy<uCTR1rU^rMN
z_C;RMg0w<9t+(?HEbKCV9Nfbt*%sz)!4@ueEaceQSKR}AFZ7AR9)lUX$nq}8Q``J5
zpe@DY<@ZjxCWQH?>{h(w8M4#(zG;J~(q18@i&xT<#e8plNT1n~>XXQ$->Jl<)90Nu
ziw$Q#u=>=Nph2&TEziDqU;Q%2bl)3q#9%=TFzd1&b5jeW<18S^jtGC#Lhekix}<wr
zcVwaOE^j1>mGf#+a@r|dE<VgrDMgL1iO4{eA8QX~_gC75Cbji1oEnj~wD?NSS9Z~H
zj9%NBB`~RxwMWpdwE81SG3Rt(4ZBq2EmmCC3fF^1*xUKTsYoF+?jUhq``r3ldt4e`
z+jnn@e@gvs;7(b0lqbPW=jrOOXlRW2+G@oVB2frycc0}wPaBWDQ(wVai`WQ-lwM9+
zwsqs>j+fVmdxKSUm8M?iAN=^u(>mQLvfFstHe^?%=Ba|y!5@-zLSLDxOLs^ja<S@4
z3ODvQzN3bJ+uTl`{Lwb^RpKh&y*Y;r1`=d|uSv^+c=2}x>L3o*1uJb%1melIb;$sv
z#PeehIF0arIJ%One4l~T%G7l`g>Bilx;`~e(5ikFpw#wsC7<8=!%w=bGSYhXdL<fd
zmw%coxNm{iW8KO8eci6-X1H9x%WPu6B?Bb7tUatga7KC<j)7WBzi`7>=_fyxU4}OS
zJI<Osy7!XP$vdRoGSBU9$!$3q*}jx*2c>n3^5*aXveJ29jW|~ZrzW)QF)oT9Pt0F+
z;mOCr($5BY{lfOsAG$Bz%eJ_8x0Te&pLJfi;@bX)y#14*4u%yIDff`#?F;7Nij-#o
z2s!N?U3<p`Uhqj2M<_%%m+VnK@}7cyxbW?mh{MFe8J8C2Fp;8>nmfEq`Im&sR<?>g
z`9PHSWZGU4YgC^z6-o`x_RiV<c2=AT$BPcAi<a9VQPcgLDea)+sDAwM9|~F)c3N|-
zkAm+!$Fv#xi43!ysP@~`blT5YwV7G#c=hYmp_6ithfy}g(Pt((gDTumRrTke43Fp}
zbL_ps#Jkh2;Ywjne%kxDn<Bn==m!|YJ&aXVLnhfjAVs`>kSS?D=%7;i+DE4Jw#|zV
zQm-;l4qwJZjmM8RGSChKsEb+7NuU177XOuCk2Z0cSq>eBm@aDh#5Xp7HR^MZcIavy
zZkKKKB@Q<J&a{Svn@TvI!&}5YG=IaoRz0$X){nAqC&m<Z=Y9)r61)6)=#fitPPPL|
zO<J%cX|PaB=78F@mgMqs+z;ALsd40l(89)zBUI-V%9lTA&61@)K1}Ghcy}f%VWYwx
z$=y$Ejyei#9ne(1gf)9s&0_zmI<-*a$;4*KV6`a*i)8>)3Cnn3a6}x~u%k<@&{(1c
znuuq15sF-;tDMR4>l{w&a=@+3yMa#N0-u4cT+K^A0sixV7|#Hcsw`tufy3Sen6w1H
zLNH1M92dB=(a!*D3Sw^SD2p0$H{ONM0n6kvBBDLQPC-8WO^}FQ>FF3TJ&BAJiR~}9
zvM|XFh_wAJ4RDV1*Y;d>8K7OGgCrP+=2+Qv-f~XnXsh&$zj$oOf;GCEO|mdF6RCQn
z;?9*<PIoJ4LKBvQkD4<CIrBrdnx<87g+45+d06B5vT|(L_)*N}d!3@^_$npD7@Qp7
z89+e%G7b(HiYkrYT3OUYk-dYtJIxOvef~}=8`EtMp7jhQ!T>j63Cq|v=qDNnutFSq
zGv^~@!sdoHC2yr*ZZ8md?z<6#)AN5E-71qOs5r7}ANpY1)IGQIw|hy}>T6f7(%*2Z
zZTn~`bD_Fr<LNyyyT@E9L$`X=*_zhgcO(_K`RBw1Sp{wMGwDp7ws<xhY?pFy3oixn
zxF}HE`{iAqhSlp0&R+SzI}-zKTW`6g={?jx$-X}RU}~X)-#PCu-NtBM)*m`Tr-ov+
z#<;_q?^%8x5*!~K3?2!0^4%w3*w5dZh+HiZ5q!>J;dSq9^@4wtdgl$XV)2RBdr$QS
zx9oCB5)NDu`&2%EWRJ3apP1V7WL(Qo-uI~cx1G`}kC;r%Ztm^;HqRm{@BWQ}C>g;O
z;xepD|D?Jg3zA!Rzgd>dR~Z(~PIs(4amLs6u{XJ{#}+zLDHQSWNkq(O#IbU@XDcVJ
zuzI-8y(Ll!f<vr9Vb-~Wo+;#mx0}3DX&*g_feKt#ge2T~LsN3ZGWpenpTEx@3ds9#
ze^%FTde?NV%QMp1nDDl{5*+v5Sw5X5s%KYJw9V(XJT2-wq%gkS*~`wjo%80oN_Q>J
zr<SBAg&Lcs5Ncl-td$Xx!7RgUG|X?uvx2uJj*)9@>q6fbc!<Or=Pi6Toc$u*Euv4}
zNI0D&j|xhWv$K4zb|9^hcVE;SsvA*mqdqFRL~f=F@$q2H@T!?-kJ7H|;?q4zhq!PX
zFRzv8F(#hz+bC3#`LqG|jmL+^Vtq`qbO+Rx>(o~c1+?(3=Du@UC{~(#t0~(;aX`X_
zov~{*8$Rl;N=)J%CM?+gm<r^%YMHP;k?LFXdS^xfFN=V9(wQn5N83_mK>;s<wrOXb
zy;4}^&7JdW%Va9JwZ6nO`kx`6$g17rcd--oX1(QO1|nhrRhHvx^#6J1<-~eu-4{)Z
zl=|njJp6LhTkD@`I7o1wmZ#zxblJ{xD@Hl`hks5(b2)4=KXGn6$N5psJJ$H=Jo2Xr
zzLfLhMX#gARxs<Vac9691AKzTEn~ZBYTo7pwD-LdK|t4yidRaq$8Ngz<FM{s&ZKlt
z<JVbJDLc~cQf6dGzHJq6bdQ;(dQDNg?p-aN&*5ecUNE=fzam$<wZq{7wG?lJ>Tghu
zPv3Ln#7M#ZqjotVlY@_Y<ip=;VDGqJuO^`jHSY7CG`rP&^<9s8sw_)P#mP@P7ZGEU
z59P8G6b4R_&zUK`S)G1ibpL@9Od3tk<DY_8ucC1VH<a?bcc-fIp@l5FQ~e7Q!&8LV
zbUn7%;JGfg-VN0u)W?&uPFFYhIkB`~Kbm{-3b~4pvSO6k@rl|Xerv=Drur8f;-<f`
zU7p*i!a!~eki)WGX)PYH;L+?!22omRi7`6xp$}7I$D0Vw;olE@jwtXjy(;55#b9a%
zaU4Tj#_J^glO#c~7h(;pj#zK#5HVBKe&f!!g#z~8E!l6G@!}OCrguYEd1}5bHOQ*y
z+B2%SHA(M|NRov4CD&;64TdI$u1SXFl1F6D*j{%$b=qp<b@Hn+ucrFie)~tQdhveo
zb1`q(tmd}8b4ZyqJ#sY(^KG?NmHLr5qpv&lpW0*I&+DN}#lCR*>aSaQG^VtmzoyeE
zs$&Mxr`qy~$0R~!?C*sV7(fQ__Taxa@=9dmfF_80AQ0i0iTlbvwCOFaBBXkuJSkb<
zfY<3W=1o-0)&q9X&79uGoADi5k*}^6c0_g>4z#L8x1Wr?m82B8Ur%ue)3~WIVxsV)
zq~vT-N85GwW>o)PK7}t2k5Z6>O=3@+g<G73hiprNkeBy|Smrhq$UYv7UC?X_RAN;+
zV9*t|Yv{t&6EA0zM7(BB`ScWrmYAqt&Mgu5-WIrX-7b5N+iH@X>}_9)9}AExnO8;L
z#@!EmwV}t_piCr~Xt>+K+xNr|+pS%ZR0o6OGAAz-NN>u_ZR=Qvc%6_T8L*MX`Q4df
zTEN=v{+G^o5YEQl#2LAd)fkqZuHWaVdVQYT(*4H1^_>CP4+}UZN<ECbx=6+6tJCZ}
zX=jD1a*uX4n{6|zvB8ZYNy3~3F7`HeyjGod;T#a$Wocn~CRA+x`_zYz>WAi6C<Yz~
zWS|xX$Vp`x=7+-(Edi24GYjVBMKO5squ3FyD8qF!O<Y-*%N6b8;+^Z{?!x9~2VtNJ
z7+0P;BgDdh8wOx$InjrH)*05YO1Bu#Br(iFW-u}XJcPk5Cx|B)^RglV9?Hl*QP(0h
z;*WhB8MyF1>>O3dRU^_W{B}f^hVmYkbHY+cvGR{M3(`*L+i6;blCAmLULGrIZbv@N
zRGf@3+8Bj2k3F>}QHesm88$%Vxppnp^5Th-*c^1imojf*$z9nyq=S++A8Fi(dOBDk
z(R#7B@_BY_&Eeb?7)}POVE{>R0F?jtIV_n(1&)5~2jU<UMje>whYS<vYIzF!SF!lh
z$SLafR>AGZBCT$W(`L(3p9rc9?WIIEa<&x;z9&Z+=2uxGA5EObUcE{1U~MH|tRKo9
z@^6dS+;H&{SIca9>D4!_5($#F!ZKNUb=aoH(KTuJKa{xdIF#m9mk@F^L!nox#dalm
z?K9Q)$7izkDP3cWu0EZ193f#HBXUUlW9F9lgI`u#*+Sj##f|*pV$2i6g2SH}a+CHe
z23tsd#%;g|jXqhc|5Q6%F6k7hse<<T#n-OV%=yQ0Dz$1G-|n!tbA6z>_I*mTBKB>l
z?PuDj-j*X=_=)Vfkw@HucuIUyGy9b%q^RuE7S<s*b(4ASVOo%%c+WS8ac=##X;p24
zoRCCH>kD3<ti!t`bK)|H&zXm_H~R>)^S-{`u=94f@K(`1Ni!=7Oh&`sE?oV1qo%J<
zO5%N3GXv!?!cfZzCrTDZ0W8`am7NNokGfxa;hL^+Y-pj6<1RZJPplTr{&i;D1xV4R
z>P=#yXTu8|CgR1)V}}&_d~AobtRN#-b3#_|$I#D`kNr<^b|m+faJv`r&Be(-9c_Ar
z5%aj&M|sFU?s!$>w02vOL!+FKsL3N|84upgr?JXqff_2%h=Fao^GT}p8l2z>e54^V
z^77f-`>fi6am5Dl#DG=bFQO7?QQPD5G2?+sKD#<iRrhp_+ZIg?rARvZE3X$t6^VbI
zCbEcHcoj`)8P@%v@=XU`OMVezJaS*8XK??$wyAn6r>3)gR4wA8*DtD~2fxQ%9T}OO
zW7*}jBbtE>8GtN|$}*yb=}9AAU4Xe~GU`S1kK*b^Hq~CP@SLc<qRSo>J~ssUZQ9i3
zFRM&*8;v`ma8`V<?mFc9x?6Rj?UrUEM*CT|l5Ki$OMi!8U_BPYTyD_TzbU(puf>5w
z&@El6rVf&JOqjL)WS9K8^QQW<go-lvT^iYSG2<LfB`a*gepIxC8`sb7>={bmDCa74
zY%2VnG?}QL#ec(UJmB*2>(9<A_z8cW>(>47e&r3BO)nTMn*o4RS;k>E?gpL=cJ)Li
z!8~^ZMBcjLe{yRmo^A;DH-2JvXXI^dnZ(w)51S)TMTslBS_WpW(9&<tb6RtG^DzY&
zproD@oM6Bb18f9RU;oq7XvC@WhN0<-*y4FemR%=0{Q2RF@@26<KFba?bvxN7GT*Fg
z8`3Q33FwqnKfr$$Ib^fe{zmcL2WPlr&&1X8ym)5X$Call${<I8zyLI1G0Tt>!GgpK
zc4Xr-!t+|Paj1}O*&FW7*WQckwwAo(5LK3fJUvBD&PTqn)C@hXoKf+#uVXiUB&|#-
zFzi-&73xY-U5tD5kvm?eZml@8X@zcIcfN(LsASi}i<TQop0a8lze(kL=6WMMy&O3b
z1G#JvQ`Ej!pf4oi)44vEPz&}Z&(q&OYqh-CoVGfgZH2X}c;9H>WU9utb=@>!?88Dr
z`)W=hed$fJC-NQ%joITLU<DsG3l!~UsZ#%eHe+HE@^5E$yRbFo`cR{9>F4WgHs9t~
zT-aij_P5S}DE>VTS<KU>Ctss<oC7I8Yo-<!de#bhl-<tOL6*+cKv<&zfizuD=K#<d
zW#as^jwKKfD7cRvCD1j%%a7(80Ab+pLxxb6K`u0U7C1Zyh^1!EJ|LG`XX((3{dE@e
zr<E{hECy1-gAFuB1sw5WupkOMg4w|orp~kgFM5)-G8zp76Y&2}4NkTO%$bASZ;)RN
z`v;I1u1^K&(r{+@zZvPjNC=dkm%BR!<aNV|+&dunrFWp8vn$9k2Y$rW*T=^hLV51<
z^Q2HAlpiI)%hwIUgPm?DCBPR#`BK43XnR48<{3Z%b$5^vjzGD41ydjbu(Uu*Fx-#g
z<>5(#;BysMUvFP3L<G0Afe0Y@m9Y^7KPV<3eH@4eg7=7)AXgoNlf~>2Bya`<7;phS
zA(R^!2}}W@Kq9>-gz^IU<KXrl2;~i-d>|B<2r%LYVSwOK0bp{#6@&_cP{BZaSOh8*
zWMqPjag>dh8_g342gW+TC!f>x^(%YjUwimXqjd5A5A^}vcsqLp0s$^dk=lW-a4J0s
z3-+Dx4Db&nEc|*$*V)f-QDA5U%Jx?k$O(Kx;N_}K_3)+u#!;3uijOs1F?9~1uLvv%
zoTB~vQ2~8@@N#xQL^S&UM?w6*j)Mc}6c0=l54LQoI8}&<BO*v(tD~ZdhOoeM<AAc^
zLI8Zi_u=+9kg`a>;#A=F|F?qpbsq=X!UZ1VS0R8G?-(>J098CFU?7qzqyoS@5tupw
zOs)bK0v5uO2nbMsaT0;;f<eHurvgk}MHMiB26gxf-^YOhK9NuX{oyf5sw8k-q`_DE
zVsI)5_zE)tFA6S1pb)q(0W5rR4A2(LORs}19v0+p!G)kgqF+fMkP`u&h{1vdF3yVw
z^Adn#z>(m7;PVP_RRzL=uk?b2+Y#wPfUofUbfv)p1UvwNBH%d*pbuRrznFusbo+s4
z!oxq%u!vx`;ibV>Q0V%Ar&}xt!cU$3Gt^=QFt|7b7OWU_Bfwk%&f)H`8T?Y$uPaa)
zI8j-Y5`6@`3TOn<t6}zGJNRdOxD`yK8xt4_{{7D=FjHtWf`0Gk03;#_`c+^<ShNAy
z4*o23Sp&O(ZQ{2=w+GlJeigc{{Im(!9)2${U-aK#4A?e)6?o-DGy&)k@Cw@igF;dT
zI)Uv1#H8QLFPr#vg@pq2j-X%R`fr7<>&5$Un|}&y7qC|0>%S@KzavL&5M$`R5O`!D
zM0(INf-o?A0O897q9+uCnkfV{OQ)GZSa=T#XqFIAGN>Vd%f|i;c=ixbIQ_0Ogr$cd
z*n?w#&fyASf6lQO41fou_g{=P^gJFgt{bqH#W5&wOrnpr=oLXAr2le<z#<oGaJT_0
zqz?l}AYe3$cRV2A4d`t=A)ubc+F~#R2Y&RQUci_ZYkMI23lgvtZ;1Xr0<hr&;eSp?
zg##kJhcATxxjZ=h;eSr$2Z8q$;6CEFC`oS~0D)8%dMyyx=g%5FE`r_8BAp&@!DC7P
z1p?(yi7v)e@V;u14#F&msq}gf1e_GT77YBwVl5O7uyh(dHq+7H@8Jy|^jCB<0MFFo
zQ2)jYc$#1LwK2a#g?~OfL+NURMkrTja6SP?Qf=)2YzDHO{%wZ)pUu$9Feq>a07eyz
z_*?XXx%<h@zbwLQ{)-4N|1Tme%U_FG4CRYo-l2@W++dgko=I3#08#1s27%B^U(e6a
zH9(O1x3ezzybK0&8vyF^^Y*2Ad$~ZtIAxMDRuS@~(fk6{P$-{&ZYlc)c*rAA0DZXy
zxq`3Ne(&Yy<_@_yyY2zA{2W0Z%nlFl<xACrzv|ndr-nsifse-!z~Lr_pkgl%1D2m`
zg#=LC5wMFufN1vf4+JX$Fno9D9~un#;6I4^lZL@#;V17;8jb`!_YxW&B)|PdgWn7<
zZHEC7VJQuZUOF#m0bt&r?a*i({CvZI|L_Oa2^{JY8XAlKi-sZmy$&oMeyaZ%4?sQG
zbNoresDJ>zgoeX|Z*cyk0bYnp$3tUO{u&Rk_jfxi=I`;aIM^rt!3$6@03l0hu!CMg
z!=Yin@n>HUL6_<hh!pm5OWLWx9{o=m9u51;B{UrD2A9zAAUH0i5&vEXo&@{xKl_46
z|CfFVDsW6$(vI|(F416$zjz_yVK?<>U!p1;G5(~fpg|h`QW^#Xyrnd7Ao~}MfgPw2
z;qSPX%t3;~!=E%&4ESPXDGmR(ZKx9AD7mCB>2IBr(6A5xvmMwX{53DYG=H~Kg#+1=
zzNEjNHw+q1BKxx)22J=&_87D(*qtqD2flP$ItL&Tm&yR_!2Z%3@I$}bE-k>>%bOBF
z|3=!<E0hBK8iXSI`ogZ59>k2O?!Evw(SHCZ7)WytpwV3*1`i&3gq)oIHUq@}0c7U|
A761SM

literal 0
HcmV?d00001

diff --git a/benchmarking/benchmark_incr_dec.sh b/benchmarking/benchmark_incr_dec.sh
new file mode 100755
index 000000000..3ddcb2271
--- /dev/null
+++ b/benchmarking/benchmark_incr_dec.sh
@@ -0,0 +1,88 @@
+#! /usr/bin/env bash
+set -x
+set -e
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}/../build"
+
+# export BUILD_TYPE=Debug
+# ../config/config.linux
+make -j install
+
+model_name=meta-llama/Llama-3.1-70B-Instruct
+NGPUS=8
+NCPUS=16
+FSIZE=36000
+ZSIZE=200000
+CSIZE=100000
+
+# comment these lines in for debugging
+# model_name=meta-llama/Llama-3.1-8B-Instruct
+# NGPUS=8
+# FSIZE=36000
+# ZSIZE=30000
+# CSIZE=100000
+
+
+
+MAX_SEQ_LEN=7000
+tokens_per_batch=1024
+
+batch_sizes=(
+    8
+    4
+)
+
+request_per_second_values=(
+    -1
+    1
+    2
+    4
+    8
+)
+
+dataset_name="sharegpt"
+dataset_fp="../wildchat/${dataset_name}.json"
+partition_name="all"
+
+export LEGION_BACKTRACE=1
+
+# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)"
+# python ../inference/utils/download_hf_model.py --half-precision-only $model_name --refresh-cache
+
+for k in "${!request_per_second_values[@]}"; do
+for j in "${!batch_sizes[@]}"; do
+    batch_size=${batch_sizes[$j]}
+    request_per_second=${request_per_second_values[$k]}
+    
+    echo "Running dataset ${dataset_fp} with model ${model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}"
+    # create model name version where "/" is replaced with "-"
+    model_name_=$(echo $model_name | tr / -)
+    if [ $request_per_second -gt 0 ]; then
+        rate=$request_per_second
+    else
+        rate="offline"
+    fi
+    log_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log"
+    output_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json"
+    metrics_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv"
+    rm $metrics_fp $output_fp $log_fp || true
+
+    time ./inference/suffix_decoding/incr_dec \
+        -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \
+        -tensor-parallelism-degree $NGPUS \
+        -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \
+        --fusion \
+        --max-sequence-length $MAX_SEQ_LEN \
+        --max-requests-per-batch $batch_size \
+        --max-tokens-per-batch $tokens_per_batch \
+        --max-output-length 1024 \
+        --request-per-second ${request_per_second} \
+        -llm-model $model_name \
+        -trace ${dataset_fp} \
+        -trace-output-path ${output_fp} \
+        -csv-output-path $metrics_fp \
+        -target-partition ${partition_name} \
+        2>&1 | tee ${log_fp}
+done
+done
\ No newline at end of file
diff --git a/benchmarking/benchmark_specinfer.sh b/benchmarking/benchmark_specinfer.sh
new file mode 100755
index 000000000..5fe881f08
--- /dev/null
+++ b/benchmarking/benchmark_specinfer.sh
@@ -0,0 +1,109 @@
+#! /usr/bin/env bash
+set -x
+set -e
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}/../build"
+
+# export BUILD_TYPE=Debug
+# ../config/config.linux
+make -j
+source ./set_python_envs.sh
+# reset
+
+model_name=meta-llama/Llama-3.1-70B-Instruct
+NGPUS=8
+NCPUS=16
+FSIZE=36000
+ZSIZE=200000
+CSIZE=100000
+
+# comment these lines in for debugging
+# model_name=meta-llama/Llama-3.1-8B-Instruct
+# NGPUS=8
+# FSIZE=36000
+# ZSIZE=30000
+# CSIZE=100000
+######################################
+
+small_model_names=(
+    Zhuominc/Llama-3-330M
+    meta-llama/Llama-3.2-1B-Instruct
+    meta-llama/Llama-3.2-3B-Instruct
+    meta-llama/Llama-3.1-8B-Instruct
+)
+
+MAX_SEQ_LEN=7000
+tokens_per_batch=1024
+max_tree_depth=8
+expansion_degree=3
+
+batch_sizes=(
+    8
+    4
+)
+
+request_per_second_values=(
+    -1
+    1
+    2
+    4
+    8
+)
+
+dataset_name="sharegpt"
+dataset_fp="../wildchat/${dataset_name}.json"
+partition_name="all"
+
+export LEGION_BACKTRACE=1
+
+# python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='meta-llama/Llama-3.1-70B-Instruct', allow_patterns='*.safetensors', max_workers=30)"
+python ../inference/utils/download_hf_model.py --half-precision-only $model_name
+for small_model_name in "${small_model_names[@]}"; do
+    python ../inference/utils/download_hf_model.py --half-precision-only $small_model_name
+done
+
+for k in "${!request_per_second_values[@]}"; do
+for j in "${!batch_sizes[@]}"; do
+for i in "${!small_model_names[@]}"; do
+    small_model_name=${small_model_names[$i]}
+    batch_size=${batch_sizes[$j]}
+    request_per_second=${request_per_second_values[$k]}
+    
+    echo "Running dataset ${dataset_fp} with model ${model_name}, draft model ${small_model_name}, batch size ${batch_size}, tokens per batch ${tokens_per_batch}, and request per second ${request_per_second}"
+    # create model name version where "/" is replaced with "-"
+    model_name_=$(echo $model_name | tr / -)
+    small_model_name_=$(echo $small_model_name | tr / -)
+    if [ $request_per_second -gt 0 ]; then
+        rate=$request_per_second
+    else
+        rate="offline"
+    fi
+    log_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.log"
+    output_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.json"
+    metrics_fp="/usr/FlexFlow/inference/output/specinfer_llm_${model_name_}_ssm_${small_model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv"
+    rm $metrics_fp $output_fp $log_fp || true
+
+    time ./inference/suffix_decoding/specinfer \
+        -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \
+        -tensor-parallelism-degree $NGPUS \
+        -ssm-tp-degree $NGPUS \
+        -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \
+        --fusion \
+        --max-sequence-length $MAX_SEQ_LEN \
+        --max-requests-per-batch $batch_size \
+        --max-tokens-per-batch $tokens_per_batch \
+        --max-output-length 1024 \
+        --max-tree-depth ${max_tree_depth} \
+        --expansion-degree ${expansion_degree} \
+        --request-per-second ${request_per_second} \
+        -llm-model $model_name \
+        -ssm-model $small_model_name \
+        -trace ${dataset_fp} \
+        -trace-output-path ${output_fp} \
+        -csv-output-path $metrics_fp \
+        -target-partition ${partition_name} \
+        2>&1 | tee ${log_fp}
+done
+done
+done
\ No newline at end of file
diff --git a/benchmarking/get_sharegpt_trace.py b/benchmarking/get_sharegpt_trace.py
new file mode 100644
index 000000000..dbe8f4d3b
--- /dev/null
+++ b/benchmarking/get_sharegpt_trace.py
@@ -0,0 +1,206 @@
+from dataclasses import asdict, dataclass, field
+import json
+import os
+import random
+import requests
+from tqdm.asyncio import tqdm
+from typing import List, Optional
+from collections import OrderedDict
+from transformers import AutoTokenizer
+
+SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+
+@dataclass
+class TraceEntry:
+    prompt: str
+    response: str
+    prompt_length: int
+    response_length: int
+
+@dataclass
+class TracePartition:
+    partition_name: str
+    model_name: str
+    num_warmup_requests: int
+    training_entries: List[TraceEntry]
+    eval_entries: List[TraceEntry]
+
+@dataclass
+class TraceMetadata:
+    avg_entries_per_partition: float
+    max_prompt_length: int
+    min_prompt_length: int
+    avg_prompt_length: float
+    max_response_length: int
+    min_response_length: int
+    avg_response_length: float
+    max_total_length: int
+
+@dataclass
+class Trace:
+    partitions: List[TracePartition]
+    metadata: TraceMetadata = field(default_factory=lambda: TraceMetadata(0, 0, 0, 0, 0, 0, 0,0))
+
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Read and cache a file from a url."""
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
+
+    # Check if the cache file already exists
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Use tqdm to display the progress bar
+    with open(filename, "wb") as f, tqdm(
+        desc=filename,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            f.write(chunk)
+            bar.update(len(chunk))
+
+    return filename
+
+def get_warmup_entries(model_name: str, num_warmup_requests: int) -> List[TraceEntry]:
+    """
+    Get a list of warmup entries for a model.
+    
+    Args:
+    model_name (str): The name of the model.
+    num_warmup_requests (int): The number of warmup requests to generate.
+    
+    Returns:
+    List[TraceEntry]: A list of warmup entries.
+    """
+    warmup_entries = []
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    for i in range(num_warmup_requests):
+        prompt = "Hello, how are you?"
+        prompt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        response = "I'm doing well, thank you for asking."
+        prompt_length = len(tokenizer(prompt)["input_ids"])
+        response_length = len(tokenizer(response)["input_ids"])
+        warmup_entries.append(TraceEntry(prompt, response, prompt_length, response_length))
+    return warmup_entries
+
+def build_trace(model_name: str, num_entries: int, num_warmup_requests: int, seed: int):
+    # Download sharegpt if necessary
+    dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f, object_pairs_hook=OrderedDict)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+        if data["conversations"][0]["from"] == "human" and data["conversations"][1]["from"] == "gpt"
+    ]
+
+    # Shuffle the dataset.
+    random.seed(seed)
+    random.shuffle(dataset)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    trace = Trace(partitions=[])
+    partition = TracePartition(
+        partition_name="all",
+        model_name=model_name,
+        num_warmup_requests=num_warmup_requests,
+        training_entries=[],
+        eval_entries=[],
+    )
+    trace_metadata = TraceMetadata(
+        avg_entries_per_partition=0,
+        max_prompt_length=0,
+        min_prompt_length=float("inf"),
+        avg_prompt_length=0,
+        max_response_length=0,
+        min_response_length=float("inf"),
+        avg_response_length=0,
+        max_total_length=0,
+    )
+
+    partition.eval_entries += get_warmup_entries(model_name, num_warmup_requests)
+    
+    for i in tqdm(range(len(dataset))):
+        if len(partition.eval_entries) == num_entries:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        response = dataset[i][1]
+        prompt_length = len(tokenizer(prompt)["input_ids"])
+        response_length = len(tokenizer(response)["input_ids"])
+        new_entry = TraceEntry(prompt, response, prompt_length, response_length)
+        partition.eval_entries.append(new_entry)
+        trace_metadata.max_prompt_length = max(trace_metadata.max_prompt_length, prompt_length)
+        trace_metadata.min_prompt_length = min(trace_metadata.min_prompt_length, prompt_length)
+        trace_metadata.avg_prompt_length += prompt_length
+        trace_metadata.max_response_length = max(trace_metadata.max_response_length, response_length)
+        trace_metadata.min_response_length = min(trace_metadata.min_response_length, response_length)
+        trace_metadata.avg_response_length += response_length
+        trace_metadata.max_total_length = max(trace_metadata.max_total_length, prompt_length + response_length)
+    trace_metadata.avg_prompt_length /= len(partition.eval_entries)
+    trace_metadata.avg_response_length /= len(partition.eval_entries)
+    trace_metadata.avg_entries_per_partition = len(partition.eval_entries)
+
+    trace.partitions.append(partition)
+    trace.metadata = trace_metadata
+
+    return trace
+
+def save_trace(trace: Trace, output_path: str):
+    """
+    Save a Trace instance to a JSON file.
+    
+    Args:
+    trace (Trace): The trace to save.
+    output_path (str): The path where the JSON file will be saved.
+    """
+    # Convert the Trace instance to a dictionary
+    trace_dict = asdict(trace)
+    
+    # Save the dictionary as a JSON file
+    with open(output_path, 'w') as f:
+        json.dump(trace_dict, f, indent=2)
+    
+    print(f"Trace saved to {output_path}")
+
+if __name__ == "__main__":
+    # Change directory to that holding this script
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+    num_entries=125
+    num_warmup_requests=8
+    seed=42
+
+    trace = build_trace("meta-llama/Llama-3.1-70B-Instruct", num_entries, num_warmup_requests, seed)
+    print(trace.metadata)
+    # Save prompts list to a json file
+    save_trace(trace, "sharegpt.json")
\ No newline at end of file
diff --git a/benchmarking/get_wildchat_trace.py b/benchmarking/get_wildchat_trace.py
new file mode 100644
index 000000000..53ee46efb
--- /dev/null
+++ b/benchmarking/get_wildchat_trace.py
@@ -0,0 +1,64 @@
+import datasets
+from transformers import AutoTokenizer
+from tqdm import tqdm
+import json, os
+
+def build_trace(dataset: datasets.Dataset, model_name: str, num_entries: int, seed: int):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    
+    dataset = dataset["train"].filter(
+        lambda x: x["model"] == "gpt-4" and x["turn"] == 1 and x["language"] == "English"
+    ).shuffle(seed=seed).select(range(num_entries))
+    pairs = []
+    for row in dataset:
+        assert len(row["conversation"]) == 2
+        assert row["conversation"][0]["role"] == "user"
+        assert row["conversation"][1]["role"] == "assistant"
+        pairs.append((
+            row["conversation"][0]["content"],
+            row["conversation"][1]["content"],
+        ))
+
+    prompts = []
+    avg_prompt_length = 0
+    min_prompt_length = float("inf")
+    max_prompt_length = 0
+    avg_response_length = 0
+    min_response_length = float("inf")
+    max_response_length = 0
+    max_total_length = 0
+    for prompt, response in tqdm(pairs, desc="Processing HF trace"):
+        prompt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        prompt_length = len(tokenizer(prompt)["input_ids"])
+        response_length = len(tokenizer(response)["input_ids"])
+        prompts.append(prompt)
+        avg_prompt_length += prompt_length
+        avg_response_length += response_length
+        min_prompt_length = min(min_prompt_length, prompt_length)
+        min_response_length = min(min_response_length, response_length)
+        max_prompt_length = max(max_prompt_length, prompt_length)
+        max_response_length = max(max_response_length, response_length)
+        max_total_length = max(max_total_length, prompt_length + response_length)
+    avg_prompt_length /= len(prompts)
+    avg_response_length /= len(prompts)
+
+    return prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length
+
+if __name__ == "__main__":
+    # Change directory to that holding this script
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+    dataset = datasets.load_dataset("allenai/WildChat")
+    prompts, max_prompt_length, max_response_length, avg_prompt_length, avg_response_length, min_prompt_length, min_response_length, max_total_length = build_trace(dataset, "meta-llama/Llama-3.1-70B-Instruct", 250, 42)
+    print(f"Number of prompts: {len(prompts)}")
+    print(f"Prompt lengths: [{min_prompt_length} -> {max_prompt_length}] (avg: {avg_prompt_length})")
+    print(f"Response lengths: [{min_response_length} -> {max_response_length}] (avg: {avg_response_length})")
+    print(f"Max total length: {max_total_length}")
+    # Save prompts list to a json file
+
+    with open("wildchat.json", "w") as f:
+        json.dump(prompts, f, indent=2)
\ No newline at end of file
diff --git a/benchmarking/plot_results.ipynb b/benchmarking/plot_results.ipynb
new file mode 100644
index 000000000..39047b86c
--- /dev/null
+++ b/benchmarking/plot_results.ipynb
@@ -0,0 +1,776 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/usr/FlexFlow/inference/output\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import os\n",
+    "os.chdir(\"/usr/FlexFlow/inference/output\")\n",
+    "print(os.getcwd())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "small_model_names = [\n",
+    "    \"Zhuominc/Llama-3-330M\",\n",
+    "    \"meta-llama/Llama-3.2-1B-Instruct\",\n",
+    "    # \"meta-llama/Llama-3.2-3B-Instruct\",\n",
+    "    \"meta-llama/Llama-3.1-8B-Instruct\",\n",
+    "]\n",
+    "batch_sizes=[4,8]\n",
+    "arrival_rates=[\"offline\", \"1\", \"2\", \"4\", \"8\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_speculation_len(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n",
+    "    return df[\"num_speculated_tokens\"].mean()\n",
+    "\n",
+    "def get_accepted_len(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n",
+    "    return df[\"num_accepted_tokens\"].mean()\n",
+    "\n",
+    "def get_acceptance_rates(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n",
+    "    # group = df.groupby(\"request_guid\", as_index=False)\n",
+    "    num_speculated_tokens = df[\"num_speculated_tokens\"].sum()\n",
+    "    num_accepted_tokens = df[\"num_accepted_tokens\"].sum()\n",
+    "    return num_accepted_tokens/num_speculated_tokens\n",
+    "\n",
+    "def get_tpot(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n",
+    "    group = df.groupby(\"request_guid\", as_index=False)\n",
+    "    min_time = group[\"timestamp\"].min()[\"timestamp\"]\n",
+    "    max_time = group[\"timestamp\"].max()[\"timestamp\"]\n",
+    "    num_tokens = group[\"num_generated_tokens\"].sum()[\"num_generated_tokens\"]\n",
+    "    tpots = (max_time - min_time) / num_tokens / 1000\n",
+    "    return tpots.mean()\n",
+    "\n",
+    "def get_throughput(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1 or request_step_idx is < 0\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0) & (df[\"request_step_idx\"] >= 0)]\n",
+    "    num_tokens = df[\"num_generated_tokens\"].sum()\n",
+    "    total_time = df[\"timestamp\"].max() - df[\"timestamp\"].min() # in microseconds\n",
+    "    total_time = total_time / 1000000 # convert to seconds\n",
+    "    throughput = num_tokens / total_time # (tokens/sec)\n",
+    "    return throughput\n",
+    "\n",
+    "def get_ttft(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0)]\n",
+    "    group = df.groupby(\"request_guid\", as_index=False)\n",
+    "    ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+    "    # convert to milliseconds from microseconds\n",
+    "    return ttft.mean()[1] / 1000\n",
+    "\n",
+    "def get_queueing_time(filepath):\n",
+    "    df = pd.read_csv(filepath)\n",
+    "    # remove entries where is_warmup_request is 1\n",
+    "    df = df[(df[\"is_warmup_request\"] == 0)]\n",
+    "    group = df.groupby(\"request_guid\", as_index=False)\n",
+    "    # in each group, find the difference between the timestampt at request_step_idx=-1 and the timestamp at request_step_idx=-2.\n",
+    "    queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+    "    # convert to seconds from microseconds\n",
+    "    return queueing_time.mean()[1] / 1000000\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA9wAAAPECAYAAABc1TPrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACsUklEQVR4nOzdeXxM1//H8fdkMUlI7EEIiVC72rWU2PdYqlVULW1RS7X17V6K0qKtLqpFUZRYaqmuqrSxdaHW6rf2SilaSxGEiMz5/eGX+ZpOkIm5YvT1fDzyYM49c+9n7owb7zn3nmszxhgBAAAAAACv8svuAgAAAAAAuBURuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQC3hAYNGqhixYrZXUamzZo1S2XLllVgYKDy5MmT3eX4tAYNGqhBgwY3ZFvDhw+XzWbTsWPHbsj2AAC+jcANAP/w3nvvyWazqXbt2tldyk0nKipKNptNjz76qNuylStXymazaeHChdlQmW/ZsWOHevbsqZiYGE2ZMkXvv/9+pp739NNPy2az6b777rO4Qu/7/vvvNXz4cJ08efKGbzs9JF/r50aF9n+jzz77TLGxsQoPD1dISIhKliypTp066auvvnL2OXTokIYPH64tW7ZkX6EA4GUB2V0AANxs4uPjFRUVpfXr12vPnj0qVapUdpd005kyZYqee+45RUREZHcpPmnlypVyOBx6++23M/35MsZo7ty5ioqK0meffabTp08rNDTU4kq95/vvv9eIESPUs2fPGz6if/fdd7vs5zNnzqhfv37q0KGD7r77bmd7oUKFbmhd/xavv/66nnrqKcXGxuq5555TSEiI9uzZoxUrVmjevHlq0aKFpEuBe8SIEYqKilKVKlWyt2gA8BICNwBcZt++ffr++++1ePFi9e3bV/Hx8Ro2bNgNrcHhcOjChQsKCgq6odvNrAoVKmjnzp0aM2aMxo8fn93l3FDeem+OHDkiSR4Fz5UrV+qPP/7Qt99+q+bNm2vx4sXq0aPHddXxb1G5cmVVrlzZ+fjYsWPq16+fKleurG7dumVjZbcGY4zOnz+v4OBgt2UXL17UyJEj1bRpU3399dduy9P/LQDArYpTygHgMvHx8cqbN69at26te+65R/Hx8c5lqampypcvn3r16uX2vKSkJAUFBenJJ590tqWkpGjYsGEqVaqU7Ha7IiMj9fTTTyslJcXluTabTQMHDlR8fLwqVKggu93uPM3y9ddfV506dZQ/f34FBwerevXqGZ6yfe7cOQ0aNEgFChRQaGio2rZtq4MHD8pms2n48OEufQ8ePKgHH3xQhQoVkt1uV4UKFfTBBx9keh9FRUWpe/fumjJlig4dOnTVvj179lRUVJRbe/opvhnthwULFqh8+fIKDg7WnXfeqW3btkmSJk+erFKlSikoKEgNGjRQYmJihtvcuHGj6tSpo+DgYEVHR2vSpElufbzx3lzJe++95+wbERGhAQMGuJxGHRUV5fwSp2DBghm+RxmJj49X+fLl1bBhQzVp0sTls3m5gwcP6qGHHlJERITsdruio6PVr18/Xbhwwdnn5MmTeuKJJxQVFSW73a5ixYqpe/fuLtclZ2UflSlTRkFBQapevbpWr17t7DN8+HA99dRTkqTo6GjnKdyXv4ezZ89W9erVFRwcrHz58qlz5846cOCA2+t7//33FRMTo+DgYNWqVUtr1qy55r7LrG+//Vb16tVTzpw5lSdPHrVr107bt2+/5vN+//13lSpVShUrVtRff/0l6dI+fvzxxxUZGSm73a5SpUpp7NixcjgczuclJibKZrPp9ddfd74uu92umjVr6qeffnLZxp9//qlevXqpWLFistvtKlKkiNq1a3fFfwfpevbsqVy5cum3335T8+bNlTNnTkVEROill16SMcalr8Ph0FtvvaUKFSooKChIhQoVUt++fXXixAmXflFRUWrTpo2WLVumGjVqKDg4WJMnT85w+8eOHVNSUpLq1q2b4fLw8HBJl75QqlmzpiSpV69ezs/IjBkznH3XrVunFi1aKHfu3AoJCVFsbKy+++47l/WlH1t27NihTp06KSwsTPnz59djjz2m8+fPX3VfAYAlDADAqWzZsuahhx4yxhizevVqI8msX7/eufzBBx80efLkMSkpKS7PmzlzppFkfvrpJ2OMMWlpaaZZs2YmJCTEPP7442by5Mlm4MCBJiAgwLRr187luZJMuXLlTMGCBc2IESPMu+++azZv3myMMaZYsWKmf//+ZsKECeaNN94wtWrVMpLM559/7rKOTp06GUnmgQceMO+++67p1KmTuf32240kM2zYMGe/P//80xQrVsxERkaal156yUycONG0bdvWSDJvvvnmNfdPiRIlTOvWrc3evXtNQECAefTRR53LEhISjCSzYMECZ1uPHj1MiRIl3NYzbNgw889fQZJM5cqVTWRkpBkzZowZM2aMyZ07tylevLiZMGGCKV++vBk3bpwZMmSIyZEjh2nYsKHL82NjY01ERIQJDw83AwcONOPHjzd33XWXkWSmTZvm7Oet9yYj6a+rSZMm5p133jEDBw40/v7+pmbNmubChQvGGGM+/vhj06FDByPJTJw40cyaNcts3br1qvv9/PnzJk+ePGbkyJHGGGM+/PBD4+/vbw4fPuzS7+DBgyYiIsL52iZNmmSGDh1qypUrZ06cOGGMMeb06dOmYsWKxt/f3/Tu3dtMnDjRjBw50tSsWdP52jzdRxUrVjQFChQwL730khk7dqwpUaKECQ4ONtu2bTPGGLN161bTpUsX5+ds1qxZZtasWebMmTPGGGNGjRplbDabue+++8x7771nRowYYQoUKGCioqKcdRtjzNSpU40kU6dOHTN+/Hjz+OOPmzx58piSJUua2NjYq+7Dyx09etTt38by5ctNQECAue2228yrr77qrCFv3rxm3759zn7p7/HRo0eNMcbs2bPHFC9e3FSpUsXZdvbsWVO5cmWTP39+8/zzz5tJkyaZ7t27G5vNZh577DHnuvbt22ckmapVq5pSpUqZsWPHmldffdUUKFDAFCtWzPmZMcaYOnXqmNy5c5shQ4aYqVOnmldeecU0bNjQrFq16qqvtUePHiYoKMiULl3aPPDAA2bChAmmTZs2RpIZOnSoS9+HH37YBAQEmN69e5tJkyaZZ555xuTMmdPl82vMpeNAqVKlTN68ec2zzz5rJk2aZBISEjLcflpamgkODjbVq1c3x48fv2Kdf/75p3nppZeMJNOnTx/nZ2Tv3r3GGGO++eYbkyNHDnPnnXeacePGmTfffNNUrlzZ5MiRw6xbt87t/alUqZKJi4szEyZMMN26dXMeHwHgRiNwA8D/27Bhg5Fkli9fbowxxuFwmGLFirn8B3nZsmVGkvnss89cntuqVStTsmRJ5+NZs2YZPz8/s2bNGpd+kyZNMpLMd99952yTZPz8/Mx///tft5qSk5NdHl+4cMFUrFjRNGrUyNm2ceNGI8k8/vjjLn179uzpFioeeughU6RIEXPs2DGXvp07dza5c+d2294/pQduY4zp1auXCQoKMocOHTLGeCdw2+12l3AzefJkI8kULlzYJCUlOdufe+45I8mlb2xsrJFkxo0b52xLSUkxVapUMeHh4c7A4K335p+OHDlicuTIYZo1a2bS0tKc7RMmTDCSzAcffOD2+tMD2rUsXLjQSDK7d+82xhiTlJRkgoKC3L4k6d69u/Hz83N+8XM5h8NhjDHmxRdfNJLM4sWLr9jH030kyWzYsMHZ9vvvv5ugoCDToUMHZ9trr73m9p4ZY0xiYqLx9/c3L7/8skv7tm3bTEBAgLP9woULJjw83FSpUsXlC6/333/fSLruwJ3+Obk8FG7dutX4+fmZ7t27O9suf++2b99uIiIiTM2aNc3ff//t7DNy5EiTM2dOs2vXLpftPvvss8bf39/s37/fGPO/wJ0/f36X53/yyScux5kTJ04YSea1117L9GtM16NHDyPJ5csxh8NhWrdubXLkyOH8DK5Zs8ZIMvHx8S7P/+qrr9zaS5QoYSSZr776KlM1pH/mcubMaVq2bGlefvlls3HjRrd+P/30k5Fkpk+f7tLucDhM6dKlTfPmzZ2fUWMuHR+jo6NN06ZNnW3p70/btm1d1tG/f38j6ZpfbgGAt3FKOQD8v/j4eBUqVEgNGzaUJOds0PPmzVNaWpokqVGjRipQoIDmz5/vfN6JEye0fPlyl5mjFyxYoHLlyqls2bI6duyY86dRo0aSpISEBJdtx8bGqnz58m41XX5N5IkTJ3Tq1CnVq1dPmzZtcrann+Lcv39/l+f+cyZxY4wWLVqkuLg4GWNc6mrevLlOnTrlst5rGTJkiC5evKgxY8Zk+jnX0rhxY5dT0NNniu/YsaPLBGHp7b/99pvL8wMCAtS3b1/n4xw5cqhv3746cuSINm7cKMl7780/rVixQhcuXNDjjz8uP7///Xrt3bu3wsLC9MUXX2RmF2QoPj5eNWrUcE78FRoaqtatW7ucVu5wOLRkyRLFxcWpRo0abutIP4V/0aJFuv3229WhQ4cr9vF0H915552qXr2683Hx4sXVrl07LVu2zPlv50oWL14sh8OhTp06uWyrcOHCKl26tHNbGzZs0JEjR/TII48oR44czuf37NlTuXPnvuo2ruXw4cPasmWLevbsqXz58jnbK1eurKZNm+rLL790e84vv/yi2NhYRUVFacWKFcqbN69z2YIFC1SvXj3lzZvX5TU1adJEaWlpLqfbS9J9993n8vx69epJ+t/nOzg4WDly5NDKlSvdTu/OrIEDBzr/nn4ZwIULF7RixQpnzblz51bTpk1daq5evbpy5crl9p5HR0erefPmmdr2iBEjNGfOHFWtWlXLli3TCy+8oOrVq6tatWqZOmV/y5Yt2r17t7p27arjx487azt79qwaN26s1atXu5yqL0kDBgxweZx+PMzovQQAKzFpGgBISktL07x589SwYUPt27fP2V67dm2NGzdO33zzjZo1a6aAgAB17NhRc+bMUUpKiux2uxYvXqzU1FSXwL17925t375dBQsWzHB7/5woKDo6OsN+n3/+uUaNGqUtW7a4XDt7+fXPv//+u/z8/NzW8c/Zr48ePaqTJ0/q/fffv+JtqDyZwKhkyZJ64IEH9P777+vZZ5/N9POupnjx4i6P04NUZGRkhu3/DB8RERHKmTOnS9ttt90m6dL1snfccYfX3pt/+v333yVJZcqUcWnPkSOHSpYs6VzuqZMnT+rLL7/UwIEDtWfPHmd73bp1tWjRIu3atUu33Xabjh49qqSkpGvei3zv3r3q2LHjVft4uo9Kly7t1ue2225TcnKyjh49qsKFC191W8aYDNchSYGBgZL+t3//2S8wMFAlS5a88ovJhCu9d5JUrlw5LVu2TGfPnnX5bMXFxalQoUJatmyZcuXK5fKc3bt36+eff870/vvn5z49fKd/vu12u8aOHav//Oc/KlSokO644w61adNG3bt3v+q+Tefn5+e2jy7/d5Fe86lTp5zXVF+r5sz+u0jXpUsXdenSRUlJSVq3bp1mzJihOXPmKC4uTr/88stVJyLcvXu3JF11ksBTp065fGnxz89JTEyM/Pz8rnnNOwB4G4EbAHRpsqTDhw9r3rx5mjdvntvy+Ph4NWvWTJLUuXNnTZ48WUuXLlX79u310UcfqWzZsrr99tud/R0OhypVqqQ33ngjw+39M0BmNLvvmjVr1LZtW9WvX1/vvfeeihQposDAQE2fPl1z5szx+DWmjwB169btiv9xvXwm58x44YUXNGvWLI0dO1bt27d3W/7PidHSXWnU09/f36N2849JnzLDG+/NjbRgwQKlpKRo3LhxGjdunNvy+Ph4jRgxwqvb9HQfXe+2bDabli5dmuH7/M8we7Po2LGjZs6cqfj4eJezKqRLr6lp06Z6+umnM3xuethNl5nP9+OPP664uDgtWbJEy5Yt09ChQzV69Gh9++23qlq16nW+mks1h4eHX3Eyvn9+eZDVfxdhYWFq2rSpmjZtqsDAQM2cOVPr1q1TbGzsVWuTpNdee+2Ktwu71ufkSsciALAagRsAdCm0hIeH691333VbtnjxYn388ceaNGmSgoODVb9+fRUpUkTz58/XXXfdpW+//VYvvPCCy3NiYmK0detWNW7cOMv/0Vu0aJGCgoK0bNky2e12Z/v06dNd+pUoUUIOh0P79u1zGdW5fDRUuvQf5tDQUKWlpalJkyZZqumfYmJi1K1bN02ePNl5mvfl8ubN6zJDd7qsjvZey6FDh9xGInft2iVJzlPVvfHeZKREiRKSpJ07d7qMJl64cEH79u3L8j6Pj49XxYoVM7w93eTJkzVnzhyNGDFCBQsWVFhYmH755Zerri8mJiZTfTzZR+kjkJfbtWuXQkJCnEHtSuuJiYmRMUbR0dFuQfRy6ft39+7dzlPbpUt3D9i3b5/LF16euvy9+6cdO3aoQIECbmdOvPbaawoICFD//v0VGhqqrl27urymM2fOeO3f2eXr/c9//qP//Oc/2r17t6pUqaJx48Zp9uzZV32ew+HQb7/95rJ/M/p3sWLFCtWtW/eGfclUo0YNzZw5U4cPH5Z09c+IdCmsZ3af7t6922UUfs+ePXI4HBneNQEArMQ13AD+9c6dO6fFixerTZs2uueee9x+Bg4cqNOnT+vTTz+VdOn0zHvuuUefffaZZs2apYsXL7qcTi5JnTp10sGDBzVlypQMt3f27Nlr1uXv7y+bzeYyGpyYmKglS5a49Eu/jvK9995zaX/nnXfc1texY0ctWrQow8B19OjRa9aUkSFDhig1NVWvvvqq27KYmBidOnVKP//8s7Pt8OHD+vjjj7O0rWu5ePGiy+2JLly4oMmTJ6tgwYLOa4y98d5kpEmTJsqRI4fGjx/vMjI5bdo0nTp1Sq1bt/Z4nQcOHNDq1avVqVOnDD+bvXr10p49e7Ru3Tr5+fmpffv2+uyzz7Rhwwa3daXX1LFjR23dujXD9yC9j6f76IcffnC5/v/AgQP65JNP1KxZM+fobXpg/ecXMHfffbf8/f01YsQItzMWjDE6fvy4pEvhrGDBgpo0aZLLLc5mzJiR4Zc6nihSpIiqVKmimTNnuqzrl19+0ddff61WrVq5Pcdms+n999/XPffcox49ejiPD9Kl/ffDDz9o2bJlbs87efKkLl686FF9ycnJbre0iomJUWhoqNtt2q5kwoQJzr8bYzRhwgQFBgaqcePGzprT0tI0cuRIt+devHgxy/s4OTlZP/zwQ4bLli5dKul/p/Jf6TNSvXp1xcTE6PXXX9eZM2fc1pPRseufX56mHw9btmzp2QsAgOvECDeAf71PP/1Up0+fVtu2bTNcfscdd6hgwYKKj493Buv77rtP77zzjoYNG6ZKlSqpXLlyLs954IEH9NFHH+mRRx5RQkKC6tatq7S0NO3YsUMfffSR8/61V9O6dWu98cYbatGihbp27aojR47o3XffValSpVwCbPXq1dWxY0e99dZbOn78uO644w6tWrXKOYJ1+ajRmDFjlJCQoNq1a6t3794qX768/v77b23atEkrVqzQ33//7fH+Sx/lnjlzptuyzp0765lnnlGHDh00aNAgJScna+LEibrttts8mqAtsyIiIjR27FglJibqtttu0/z587Vlyxa9//77zmuBvfHeZKRgwYJ67rnnNGLECLVo0UJt27bVzp079d5776lmzZrq1q2bx+ucM2eOjDFX/Gy2atVKAQEBio+PV+3atfXKK6/o66+/VmxsrPr06aNy5crp8OHDWrBggdauXas8efLoqaee0sKFC3XvvffqwQcfVPXq1fX333/r008/1aRJk3T77bd7vI8qVqyo5s2ba9CgQbLb7c4vfy4/1T39C48XXnhBnTt3VmBgoOLi4hQTE6NRo0bpueeeU2Jiotq3b6/Q0FDt27dPH3/8sfr06aMnn3xSgYGBGjVqlPr27atGjRrpvvvu0759+zR9+vTrvoZbujRi3bJlS91555166KGHdO7cOb3zzjvKnTv3Fe+T7ufnp9mzZ6t9+/bq1KmTvvzySzVq1EhPPfWUPv30U7Vp00Y9e/ZU9erVdfbsWW3btk0LFy5UYmKiChQokOnadu3apcaNG6tTp04qX768AgIC9PHHH+uvv/5S586dr/n8oKAgffXVV+rRo4dq166tpUuX6osvvtDzzz/vPAMhNjZWffv21ejRo7VlyxY1a9ZMgYGB2r17txYsWKC3335b99xzT6ZrTpecnKw6derojjvuUIsWLRQZGamTJ09qyZIlWrNmjdq3b+88JT4mJkZ58uTRpEmTFBoaqpw5c6p27dqKjo7W1KlT1bJlS1WoUEG9evVS0aJFdfDgQSUkJCgsLEyfffaZy3b37duntm3bqkWLFvrhhx80e/Zsde3a9brOhACALMmWudEB4CYSFxdngoKCzNmzZ6/Yp2fPniYwMNB5Oy2Hw2EiIyONJDNq1KgMn3PhwgUzduxYU6FCBWO3203evHlN9erVzYgRI8ypU6ec/SSZAQMGZLiOadOmmdKlSxu73W7Kli1rpk+fnuEttc6ePWsGDBhg8uXLZ3LlymXat29vdu7caSSZMWPGuPT966+/zIABA0xkZKQJDAw0hQsXNo0bNzbvv//+NffV5bcFu9zu3buNv7+/223BjDHm66+/NhUrVjQ5cuQwZcqUMbNnz77ibcH+uR/Sb5v0z9shZXQLstjYWFOhQgWzYcMGc+edd5qgoCBTokQJM2HCBLd6vfHeXMmECRNM2bJlTWBgoClUqJDp16+fy72kjcn8bcEqVapkihcvftU+DRo0MOHh4SY1NdUYc+mWXN27dzcFCxY0drvdlCxZ0gwYMMDlVlrHjx83AwcONEWLFjU5cuQwxYoVMz169HC5XZyn+2j27NnOz2rVqlUzvC/zyJEjTdGiRY2fn5/bLcIWLVpk7rrrLpMzZ06TM2dOU7ZsWTNgwACzc+dOl3W89957Jjo62tjtdlOjRg2zevVqExsbe923BTPGmBUrVpi6deua4OBgExYWZuLi4syvv/7q0iej9y45OdnExsaaXLlymR9//NEYc+l+588995wpVaqUyZEjhylQoICpU6eOef311523qLvS5zt9v6bXd+zYMTNgwABTtmxZkzNnTpM7d25Tu3Zt89FHH13ztfbo0cPkzJnT7N2713lv9UKFCplhw4a53L4u3fvvv2+qV69ugoODTWhoqKlUqZJ5+umnnbf/M+bKx4GMpKammilTppj27dubEiVKGLvdbkJCQkzVqlXNa6+95vK5NObSLdHKly9vAgIC3G4RtnnzZnP33Xeb/PnzG7vdbkqUKGE6depkvvnmG2ef9Pfn119/Nffcc48JDQ01efPmNQMHDjTnzp3LVM0A4E02Y7Iw4wwA4Ka3ZcsWVa1aVbNnz9b999+f3eXgFmWz2TRgwACXU5Zx8+jZs6cWLlyY4anYt6Lhw4drxIgROnr0qEdnEQCAVbiGGwBuAefOnXNre+utt+Tn56f69etnQ0UAAADgGm4AuAW8+uqr2rhxoxo2bKiAgAAtXbpUS5cuVZ8+fbx6CycAAABkHoEbAG4BderU0fLlyzVy5EidOXNGxYsX1/Dhw91uVwYAAIAbh2u4AQAAAACwANdwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcA4KpmzJghm82mDRs2XLFPYmKibDabXn/99auuKyoqSjabTU2aNMlw+ZQpU2Sz2a65vasZPny4bDabjh07dsU+K1eulM1m08KFCzO93k6dOslms+mZZ5656jptNptmz56dYZ+6devKZrOpYsWKGS5PS0tTRESEbDabli5dmunaJOmJJ55QtWrVlC9fPoWEhKhcuXIaPnx4pu+/PHHiRN17770qXry4bDabevbs6dH209/bjH5Kly7t1n/atGkqV66cgoKCVLp0ab3zzjtufXr27OmynoCAAEVGRqpz58769ddfM1VXZj6/1+PXX3/V8OHDlZiYaMn6faUGAEDGmKUcAHBDBQUFKSEhQX/++acKFy7ssiw+Pl5BQUE6f/58NlWXsaSkJH322WeKiorS3LlzNWbMGNlstgz7BgUFac6cOerWrZtLe2Jior7//nsFBQVdcTvffvutDh8+rKioKMXHx6tly5aZrvGnn35SvXr11KtXLwUFBWnz5s0aM2aMVqxYodWrV8vP7+rfsY8dO1anT59WrVq1dPjw4UxvN91bb73lFu5///13DRkyRM2aNXNpnzx5sh555BF17NhRgwcP1po1azRo0CAlJye7faFht9s1depUSdLFixe1d+9eTZo0SV999ZV+/fVXRUREeFyrN/36668aMWKEGjRooKioqH9tDQCAjBG4AQA3VN26dfXTTz9p/vz5euyxx5ztf/zxh9asWaMOHTpo0aJF2Vihu0WLFiktLU0ffPCBGjVqpNWrVys2NjbDvq1atdKnn36qY8eOqUCBAs72OXPmqFChQipdurROnDiR4XNnz56tatWqqUePHnr++ed19uxZ5cyZM1M1rl271q0tJiZGTz75pNavX6877rjjqs9ftWqVc3Q7V65cmdrm5dq3b+/WNmrUKEnS/fff72w7d+6cXnjhBbVu3dp5hkHv3r3lcDg0cuRI9enTR3nz5nX2DwgIcPvy4o477lCbNm30xRdfqHfv3h7Xml2MMTp//ryCg4OzuxQAwA3CKeUAgBsqKChId999t+bMmePSPnfuXOXNm1fNmzd3e05qaqp27NiRpZFXb4iPj1fTpk3VsGFDlStXTvHx8Vfs265dO9ntdi1YsMClfc6cOerUqZP8/f0zfN65c+f08ccfq3PnzurUqZPOnTunTz755LrqTh/tPHny5DX7lihR4oqj9lk1Z84cRUdHq06dOs62hIQEHT9+XP3793fpO2DAAJ09e1ZffPHFNdebfmZEQEDWxg169uypXLly6eDBg2rfvr1y5cqlggUL6sknn1RaWppL33nz5ql69eoKDQ1VWFiYKlWqpLffflvSpdPV7733XklSw4YNnae+r1y5UtKl/d+mTRstW7ZMNWrUUHBwsCZPnuy8BGPGjBlutdlsNg0fPtyl7eDBg3rooYcUEREhu92u6Oho9evXTxcuXLhmDQCA7EXgBgDccF27dtX69eu1d+9eZ9ucOXN0zz33KDAw0K3/wYMHVa5cOT333HM3skxJ0qFDh5SQkKAuXbpIkrp06aKFCxfqwoULGfYPCQlRu3btNHfuXGfb1q1b9d///lddu3a94nY+/fRTnTlzRp07d1bhwoXVoEGDqwb7jFy8eFHHjh3ToUOH9PXXX2vIkCEKDQ1VrVq1PFqPN2zevFnbt293e82bN2+WJNWoUcOlvXr16vLz83Muv9yxY8d07Ngx/fXXX/rhhx/0xBNPKH/+/GrTpk2W60tLS1Pz5s2VP39+vf7664qNjdW4ceP0/vvvO/ssX75cXbp0Ud68eTV27FiNGTNGDRo00HfffSdJql+/vgYNGiRJev755zVr1izNmjVL5cqVc65j586d6tKli5o2baq3335bVapU8ajOQ4cOqVatWpo3b57uu+8+jR8/Xg888IBWrVql5OTkTNUAAMg+nFIOALjhGjVqpMKFC2vu3LkaMmSItm/fri1btujtt9/Wb7/9lt3luZg7d67sdrvatWsnSercubNefPFFffnllxmeRi1d+kIhLi5OBw4cUGRkpOLj41WyZMmrntY9e/Zs1alTR5GRkc7t9O/fX0ePHlXBggUzVeuGDRt05513Oh+XKVNGn376qfLly5fJV+s96V8WXH46uSQdPnxY/v7+Cg8Pd2nPkSOH8ufPr0OHDrm0nz171u31Fy1aVF9//XWm90tGzp8/r/vuu09Dhw6VJD3yyCOqVq2apk2bpn79+kmSvvjiC4WFhWnZsmUZnplQsmRJ1atXT+PHj1fTpk3VoEEDtz579uzRV1995XLmhieTmz333HP6888/tW7dOpcvKV566SUZY5QnT55r1gAAyD6McAMAbjh/f3916tTJOQocHx+vyMhI1atXL8P+UVFRMsZkeAqu1eLj49W6dWuFhoZKkkqXLq3q1atfdfS5WbNmypcvn+bNmydjjObNm+ccIc/I8ePHtWzZMpc+HTt2lM1m00cffZTpWsuXL6/ly5dryZIlevrpp5UzZ85Mz1LuTQ6HQ/PmzVPVqlXdRlrPnTunHDlyZPi8oKAgnTt3zq1t+fLlWr58uZYtW6bJkycrV65catWqlXbt2nVddT7yyCMuj+vVq+fyhU+ePHl09uxZLV++PMvbiI6OzvAyicxwOBxasmSJ4uLi3M4IkOT1SwAAAN7HCDcAIFt07dpV48eP19atWzVnzhx17tz5pgsQ27dv1+bNm9W9e3ft2bPH2d6gQQO9++67SkpKUlhYmNvzAgMDde+992rOnDmqVauWDhw4cNXTyefPn6/U1FRVrVrVZTu1a9dWfHy8BgwYIEn6+++/XU5lDw4OVu7cuZ2Pw8LCnLdca9eunebMmaN27dpp06ZNuv3227O+I/7fuXPndOrUKZe2f840L12agO3gwYN64okn3JYFBwdf8XT8jCYU8/f3d7uNXKtWrVS6dGk999xzzgntjh496tInX758Vwz20qUg/88R8rx587pMaNe/f3999NFHatmypYoWLapmzZqpU6dOatGixRXX+0/R0dGZ7vtPR48eVVJS0hVvIwcAuPkxwg0AyBa1a9dWTEyMHn/8ce3bt++qgTS7pN9P+4knnlDp0qWdP+PGjdP58+evOpt6165dtWXLFg0fPly33367ypcvf8W+6aPldevWddnO2rVr9cMPPzhHXe+++24VKVLE+XP5LO8ZufvuuyVdmvjLG+bPn++y/SJFilzx9fj5+WU4ql+kSBGlpaXpyJEjLu0XLlzQ8ePHM3Wbr2LFiqlMmTJavXq1JOnAgQNudX3//fdXXceVJq+7XHh4uLZs2aJPP/1Ubdu2VUJCglq2bKkePXpc87npMpqR/EpfLP1zwjYAgO9jhBsAkG26dOmiUaNGqVy5ch5PJmU1Y4zmzJmjhg0bus2oLUkjR45UfHy8evXqleHz77rrLhUvXlwrV67U2LFjr7idffv26fvvv9fAgQPdbjXmcDj0wAMPaM6cORoyZIjGjRvnMgJ7rXCakpIih8PhNiqdVc2bN7/m6dUpKSlatGiRGjRokGF96e/zhg0b1KpVK2f7hg0b5HA4Mv05uHjxovN0+cKFC7vV5Y0RfenSteVxcXGKi4uTw+FQ//79NXnyZA0dOlSlSpXK0lkZ6bc9++fs8b///rvL44IFCyosLEy//PLLVdd3s50ZAgD4HwI3ACDbPPzww/L391ft2rWv2i81NVV79+5V7ty5rziq6m3fffedEhMT9dJLL+mee+5xW75r1y4NHTpUhw4dyjBY2mw2jR8/Xps3b9YDDzxwxe2kj24//fTTzgnTLjd16lTFx8dryJAhql69eobrOHnypHLmzOk2w/vUqVMluc4InpycrP3796tAgQIu9wnPjKuNaqf78ssvdfLkSbfJ0tI1atRI+fLl08SJE10C98SJExUSEqLWrVtfs45du3Zp586dzv0RFBTkdtq5Nxw/flz58+d3Pvbz81PlypUlXfpiQZLzPumZufVaurCwMBUoUECrV6/W448/7mx/7733XPr5+fmpffv2mj17tjZs2OB2HbcxRjabLUs1AABuDAI3ACBTPvjgA3311Vdu7Zef1vzNN9/o/Pnzbn3at2+f4XWoJUqUcLvncEbSbwvWo0ePTE+c9sYbbygkJMSlzc/PT88//7zz8aJFi7Rjxw635/bo0UPx8fHy9/e/YgBs27atXnjhBc2bN0+DBw/OsE+7du2cs5tfSXx8vKpUqZJh2E7fzqOPPqpNmzapWrVqGfZZuXKlBg0apHvuuUelS5fWhQsXtGbNGi1evFg1atRQt27dnH3Xr1+vhg0batiwYS77/rPPPtPWrVslXfqC4+eff9aoUaOcNaQHzWuJj4+X3W5Xx44dM1weHByskSNHasCAAbr33nvVvHlzrVmzRrNnz9bLL7/sNqP6xYsXnaf2OxwOJSYmatKkSXI4HBo2bFimasqqhx9+WH///bcaNWqkYsWK6ffff9c777yjKlWqOCeDq1Klivz9/TV27FidOnVKdrtdjRo1cpuFPaN1jxkzRg8//LBq1Kih1atXZzgJ3CuvvKKvv/5asbGx6tOnj8qVK6fDhw9rwYIFWrt2rfLkyZPlGgAA1iNwAwAyZeLEiRm29+zZ0/n3r776KsNQHhUVdcMnfho9erRbm7+/v0vgvtK1zbGxsVqwYIHq1KlzxVtqVaxYUdHR0Zo9e/YVA/e1bNq0STt27HDemiojcXFxevTRRzV79uwrBu5KlSqpYcOG+uSTT3T48GEZYxQTE6MXX3xRTz311FUnD0u3aNEizZw50/l48+bNzntiFytWLFOBOykpSV988YVat27tMpnbP/Xv31+BgYEaN26cPv30U0VGRurNN9/M8Jr0lJQUlzMEwsLCVLNmTc2aNUuNGze+Zk3Xo1u3bnr//ff13nvv6eTJkypcuLDuu+8+DR8+XH5+l6bBKVy4sCZNmqTRo0froYceUlpamhISEq4Zdl988UUdPXpUCxcudE7MtnTpUrfnFS1aVOvWrdPQoUMVHx+vpKQkFS1aVC1btnR+oZTVGgAA1rMZY0x2FwEAAAAAwK2GWcoBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAMhGM2bMkM1m04YNGyzfls1m0/Dhwy3fDgAAuITADQD410gPt5f/hIeHq2HDhlq6dGmW1/vKK69oyZIl3ivUQ2vXrlXLli1VtGhRBQUFqXjx4oqLi9OcOXOyrSZva9q0qWw2mwYOHHhd69m4caPatGmjwoULK1euXKpcubLGjx+vtLQ0L1UKAMD/BGR3AQAA3GgvvfSSoqOjZYzRX3/9pRkzZqhVq1b67LPP1KZNG4/X98orr+iee+5R+/btvV/sNSxYsED33XefqlSposcee0x58+bVvn37tHr1ak2ZMkVdu3Z19j137pwCAnzvV//ixYv1ww8/XPd6Nm7cqDp16qh06dJ65plnFBISoqVLl+qxxx7T3r179fbbb3uhWgAA/sf3fusCAHCdWrZsqRo1ajgfP/TQQypUqJDmzp2bpcCdnYYPH67y5cvrxx9/VI4cOVyWHTlyxOVxUFDQjSzNK86fP6///Oc/euaZZ/Tiiy9e17omT54sSVq9erXy5csnSerbt69iY2M1Y8YMAjcAwOs4pRwA8K+XJ08eBQcHu43+vv7666pTp47y58+v4OBgVa9eXQsXLnTpY7PZdPbsWc2cOdN5mnrPnj2dyw8ePKiHHnpIERERstvtio6OVr9+/XThwgWX9aSkpGjw4MEqWLCgcubMqQ4dOujo0aPXrH3v3r2qWbOmW9iWpPDwcLda06/hTkxMdDu9/vKfy61bt04tWrRQ7ty5FRISotjYWH333XcufU6fPq3HH39cUVFRstvtCg8PV9OmTbVp0yZnn+TkZO3YsUPHjh275utK9+qrr8rhcOjJJ5/M9HOuJCkpSUFBQcqTJ49Le5EiRRQcHHzd6wcA4J8Y4QYA/OucOnVKx44dkzFGR44c0TvvvKMzZ86oW7duLv3efvtttW3bVvfff78uXLigefPm6d5779Xnn3+u1q1bS5JmzZqlhx9+WLVq1VKfPn0kSTExMZKkQ4cOqVatWjp58qT69OmjsmXL6uDBg1q4cKGSk5NdQvKjjz6qvHnzatiwYUpMTNRbb72lgQMHav78+Vd9LSVKlNA333yjP/74Q8WKFcv0PihYsKBmzZrl0paamqonnnjCpa5vv/1WLVu2VPXq1TVs2DD5+flp+vTpatSokdasWaNatWpJkh555BEtXLhQAwcOVPny5XX8+HGtXbtW27dvV7Vq1SRJ69evV8OGDTVs2LBMTd62f/9+jRkzRh988IFXAnGDBg00f/589e3bV4MHD3aeUr548WK99tpr171+AADcGAAA/iWmT59uJLn92O12M2PGDLf+ycnJLo8vXLhgKlasaBo1auTSnjNnTtOjRw+353fv3t34+fmZn376yW2Zw+FwqalJkybONmOMeeKJJ4y/v785efLkVV/TtGnTjCSTI0cO07BhQzN06FCzZs0ak5aW5tZXkhk2bNgV19W/f3/j7+9vvv32W2eNpUuXNs2bN3epLTk52URHR5umTZs623Lnzm0GDBhw1VoTEhKuWcPl7rnnHlOnTh2X+q+1jau5ePGiGThwoAkMDHS+9/7+/mbixIlZXicAAFfDCDcA4F/n3Xff1W233SZJ+uuvvzR79mw9/PDDCg0N1d133+3sd/mo6okTJ5SWlqZ69epp7ty519yGw+HQkiVLFBcX53K9eLp/nrbdp08fl7Z69erpzTff1O+//67KlStfcTsPPvigihYtqjfeeEMJCQlKSEjQyJEjVbJkSc2aNUt16tS5Zq2S9OGHH+q9997TuHHj1LBhQ0nSli1btHv3bg0ZMkTHjx936d+4cWPNmjVLDodDfn5+ypMnj9atW6dDhw4pIiIiw200aNBAxphM1ZOQkKBFixZp3bp1meqfGf7+/oqJiVHz5s117733KigoSHPnztWjjz6qwoULZ8ukdwCAWxuBGwDwr1OrVi2XENylSxdVrVpVAwcOVJs2bZynVH/++ecaNWqUtmzZopSUFGf/f4bljBw9elRJSUmqWLFipmoqXry4y+O8efNKuhT0r6V58+Zq3ry5kpOTtXHjRs2fP1+TJk1SmzZttGPHDrdruf9py5YteuSRR9SlSxcNHjzY2b57925JUo8ePa743FOnTilv3rx69dVX1aNHD0VGRqp69epq1aqVunfvrpIlS16z/n+6ePGiBg0apAceeEA1a9b0+PlXMmbMGL399tvavXu3cuXKJUnq1KmTGjZsqAEDBqhNmzY+OYs7AODmxaRpAIB/PT8/PzVs2FCHDx92hsw1a9aobdu2CgoK0nvvvacvv/xSy5cvV9euXTM9SusJf3//DNs92VZISIjq1aunCRMmaMiQITpx4sQ17y9+4sQJdezYUbfddpumTp3qsszhcEiSXnvtNS1fvjzDn8uD62+//aZ33nlHEREReu2111ShQoUs3d/8ww8/1M6dO9W3b18lJiY6f6RLk7MlJiYqOTnZ4/W+9957atSokbPmdG3bttWhQ4ec2wAAwFv4GhcAAF0aVZWkM2fOSJIWLVqkoKAgLVu2THa73dlv+vTpbs/NaMS7YMGCCgsL0y+//GJRxVeXPoJ/+PDhK/ZxOBy6//77dfLkSa1YsUIhISEuy9MnfwsLC1OTJk2uuc0iRYqof//+6t+/v44cOaJq1arp5ZdfVsuWLT2qff/+/UpNTVXdunXdln344Yf68MMP9fHHH3t8Cvhff/2ltLQ0t/bU1FRJ//sMAADgLYxwAwD+9VJTU/X1118rR44cKleunKRLI842m80loCUmJmrJkiVuz8+ZM6dOnjzp0ubn56f27dvrs88+04YNG9ye461R8m+++SbD9i+//FKSVKZMmSs+d8SIEVq2bJnmzp2r6Ohot+XVq1dXTEyMXn/9decXEZdLv21ZWlqaTp065bIsPDxcERERLqfiZ/a2YJ07d9bHH3/s9iNJrVq10scff6zatWtfdR0Zue2227R8+XKX69HT0tL00UcfKTQ01PkFAwAA3sIINwDgX2fp0qXasWOHJOnIkSOaM2eOdu/erWeffVZhYWGSpNatW+uNN95QixYt1LVrVx05ckTvvvuuSpUqpZ9//tllfdWrV9eKFSv0xhtvKCIiQtHR0apdu7ZeeeUVff3114qNjVWfPn1Urlw5HT58WAsWLNDatWvd7gedFe3atVN0dLTi4uIUExOjs2fPasWKFfrss89Us2ZNxcXFZfi8bdu2aeTIkapfv76OHDmi2bNnuyzv1q2b/Pz8NHXqVLVs2VIVKlRQr169VLRoUR08eFAJCQkKCwvTZ599ptOnT6tYsWK65557dPvttytXrlxasWKFfvrpJ40bN865zszeFqxs2bIqW7Zshsuio6PdRrYbNGigVatWXfNLjGeffVbdunVT7dq11adPHwUHB2vu3LnauHGjRo0apcDAwKs+HwAATxG4AQD/Oi+++KLz70FBQSpbtqwmTpyovn37OtsbNWqkadOmacyYMXr88ccVHR2tsWPHKjEx0S1wv/HGG+rTp4+GDBmic+fOqUePHqpdu7aKFi2qdevWaejQoYqPj1dSUpKKFi2qli1bup2+nVVTp07VJ598oo8++kiHDh2SMUYlS5bUCy+8oGeeeeaKk4AdP35cxhitWrVKq1atcluefk/yBg0a6IcfftDIkSM1YcIEnTlzRoULF1bt2rWd+yskJET9+/fX119/rcWLF8vhcKhUqVJ677331K9fP6+8zqtJr+la7r//fhUoUECjR4/Wa6+9pqSkJJUpU0aTJk1yee8BAPAWm7Fi5hcAAIAb4PTp08qXL5/eeustDRgwILvLAQDABddwAwAAn7V69WoVLVpUvXv3zu5SAABwwwg3AAAAAAAWYIQbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsIBP34fb4XDo0KFDCg0Nlc1my+5yAAAAAAC3OGOMTp8+rYiICPn5XX0M26cD96FDhxQZGZndZQAAAAAA/mUOHDigYsWKXbWPTwfu0NBQSZdeaFhYWDZXg1tZamqqvv76azVr1kyBgYHZXQ4AXDeOawBuNRzXcKMkJSUpMjLSmUevxqcDd/pp5GFhYQRuWCo1NVUhISEKCwvjAA7glsBxDcCthuMabrTMXNbMpGkAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWMCnr+HOrLS0NKWmpmZ3GfBhqampCggI0Pnz55WWlpbl9QQGBsrf39+LlQEAAAC4Wd3SgdsYoz///FMnT57M7lLg44wxKly4sA4cOHDd93zPkyePChcuzL3jAQAAgFvcLR2408N2eHi4QkJCCDjIMofDoTNnzihXrlzXvLn9lRhjlJycrCNHjkiSihQp4s0SAQAAANxkbtnAnZaW5gzb+fPnz+5y4OMcDocuXLigoKCgLAduSQoODpYkHTlyROHh4ZxeDgAAANzCbtlJ09Kv2Q4JCcnmSgBX6Z9J5hUAAAAAbm23bOBOx2nkuNnwmQQAAAD+HW75wA3fEBUVpbfeeiu7y8iS4cOHq0qVKtldBgAAAICbDIH7JvbDDz/I399frVu3zu5SMnQjQ/LNEmr9/f21ZMmS7C4DAAAAgA+4ZSdNu5qoZ7+4YdtKHJP1sDxt2jQ9+uijmjZtmg4dOqSIiAgvVgYAAAAAsBIj3DepM2fOaP78+erXr59at26tGTNmuPX57LPPVLNmTQUFBalAgQLq0KGDc1lKSoqeeeYZRUZGym63q1SpUpo2bZpz+S+//KKWLVsqV65cKlSokB544AEdO3bMubxBgwYaOHCgBg4cqNy5c6tAgQIaOnSojDHO5b///rueeOIJ2Ww2l+uS165dq3r16ik4OFiRkZEaNGiQzp4961x+5MgRxcXFKTg4WNHR0YqPj7/u/XXgwAF16tRJefLkUb58+dSuXTslJiY6l/fs2VPt27fX66+/riJFiih//vwaMGCAy8Rlhw8fVuvWrZ11zZkzx2UUv3LlypKkDh06yGazKSoqyqWGWbNmKSoqSrlz51bnzp11+vTp635dAAAAAHwXgfsm9dFHH6ls2bIqU6aMunXrpg8++MAZdiXpiy++UIcOHdSqVStt3rxZ33zzjWrVquVc3r17d82dO1fjx4/X9u3bNXnyZOXKlUuSdPLkSTVq1EhVq1bVhg0b9NVXX+mvv/5Sp06dXGqYOXOmAgICtH79er399tt64403NHXqVEnS4sWLVaxYMb300ks6fPiwDh8+LEnau3evWrRooY4dO+rnn3/W/PnztXbtWg0cONC53p49e+rAgQNKSEjQwoUL9d577znvTZ0Vqampat68uUJDQ7VmzRp99913ypUrl1q0aKELFy44+yUkJGjv3r1KSEjQzJkzNWPGDJcvMrp3765Dhw5p5cqVWrRokd5//32Xur799ltJ0vTp03X48GH99NNPzmV79+7VkiVL9Pnnn+vzzz/XqlWrNGbMmCy/JgAAAAC+7195SrkvmDZtmrp16yZJatGihU6dOqVVq1apQYMGkqSXX35ZnTt31ogRI5zPuf322yVJu3bt0kcffaTly5erSZMmkqSSJUs6+02YMEFVq1bVK6+84mz74IMPFBkZqV27dum2226TJEVGRurNN9+UzWZTmTJltG3bNr355pvq3bu38uXLJ39/f4WGhqpw4cLO9YwePVr333+/Hn/8cUlS6dKlNX78eMXGxmrixInav3+/li5dqvXr16tmzZrO11quXLks76v58+fL4XBo6tSpzpH26dOnK0+ePFq5cqWaNWsmScqbN68mTJggf39/lS1bVq1bt9Y333yj3r17a8eOHVqxYoV++ukn1ahRQ5I0depUlS5d2rmdAgUKSJLy5Mnj8pqlS/fpnjFjhkJDQyVJDzzwgL755hu9/PLLWX5dAAAAAHwbI9w3oZ07d2r9+vXq0qWLJCkgIED33XefyynhW7ZsUePGjTN8/pYtW+Tv76/Y2NgMl2/dulUJCQnKlSuX86ds2bKSLo3UprvjjjtcThW/8847tXv3bqWlpV2x9q1bt2rGjBku627evLkcDof27dun7du3KyAgQNWrV3c+p2zZssqTJ8+1d8xVtrlnzx6FhoY6t5kvXz6dP3/e5fVUqFBB/v7+zsdFihRxjmDv3LlTAQEBqlatmnN5qVKllDdv3kzVEBUV5Qzb/1w3AAAAgH8nRrhvQtOmTdPFixddJkkzxshut2vChAnKnTu3goODr/j8qy2TLl0fHhcXp7Fjx7otK1KkSNYL//919+3bV4MGDXJbVrx4ce3ateu61n+lbVavXj3Da8ELFizo/HtgYKDLMpvNJofD4ZUarFw3AAAAAN9E4L7JXLx4UR9++KHGjRvnPBU6Xfv27TV37lw98sgjqly5sr755hv16tXLbR2VKlWSw+HQqlWrnKeUX65atWpatGiRoqKiFBBw5Y/AunXrXB7/+OOPKl26tHOUOEeOHG6j3dWqVdOvv/6qUqVKZbjOsmXL6uLFi9q4caPzlPKdO3fq5MmTV6zjWqpVq6b58+crPDxcYWFhWVpHmTJldPHiRW3evNk5+r5nzx6dOHHCpV9gYOBVR/gBAAAAIB2nlN9kPv/8c504cUIPPfSQKlas6PLTsWNH52nlw4YN09y5czVs2DBt375d27Ztc45YR0VFqUePHnrwwQe1ZMkS7du3TytXrtRHH30kSRowYID+/vtvdenSRT/99JP27t2rZcuWqVevXi5hcv/+/Ro8eLB27typuXPn6p133tFjjz3mXB4VFaXVq1fr4MGDzhnOn3nmGX3//fcaOHCgtmzZot27d+uTTz5xTppWpkwZtWjRQn379tW6deu0ceNGPfzww9cclZekc+fOacuWLS4/e/fu1f33368CBQqoXbt2WrNmjfP1Dho0SH/88Uem9nvZsmXVpEkT9enTR+vXr9fmzZvVp08fBQcHu5xWHxUVpW+++UZ//vmnWxgHAAAAgMsRuG8y06ZNU5MmTZQ7d263ZR07dtSGDRv0888/q0GDBlqwYIE+/fRTValSRY0aNdL69eudfSdOnKh77rlH/fv3V9myZdW7d2/nrbkiIiL03XffKS0tTc2aNVOlSpX0+OOPK0+ePPLz+99Honv37jp37pxq1aqlAQMG6LHHHlOfPn2cy1966SUlJiYqJibGeep25cqVtWrVKu3atUv16tVT1apV9eKLL7qcHj99+nRFREQoNjZWd999t/r06aPw8PBr7ptdu3apatWqLj99+/ZVSEiIVq9ereLFi+vuu+9WuXLl9NBDD+n8+fMejXh/+OGHKlSokOrXr68OHTqod+/eCg0NVVBQkLPPa6+9puXLlysyMlJVq1bN9LoBAAAA/PvYzOX3mvIxSUlJyp07t06dOuUWrM6fP699+/YpOjraJTAhcxo0aKAqVao470H9b/THH38oMjJSK1asUMOGDZWUlKSwsDCXLyWygs8mgJtBamqqvvzyS7Vq1cptHgoA8EUc13CjXC2H/hPXcAP/79tvv9WZM2dUqVIlHT58WE8//bSioqJUv3797C4NAAAAgA8icAP/LzU1Vc8//7x+++03hYaGqk6dOoqPj1dgYCAzjgMAAADwGIEbGVq5cmV2l3DDNW/eXM2bN8/uMgAAAADcIpg0DQAAAAAAC2Rr4B4+fLhsNpvLT9myZbOzJAAAAAAAvCLbTymvUKGCVqxY4XwcEODdknx4EnbcovhMAgCAzIh69ovsLsGn2P2NXq0lVRy+TClptuwux6ckjmmd3SXcsrI9cAcEBKhw4cJeX2/6rQCSk5MVHBzs9fUDWZWcnCxJ3K4CAAAAuMVle+DevXu3IiIiFBQUpDvvvFOjR49W8eLFM+ybkpKilJQU5+OkpCRJl2aXTk1NdesfGhqqv/76Sw6HQyEhIbLZ+KYLWWOM0YULF3Tu3Lksf46MMUpOTtbRo0cVFhYmh8PB7OcAsk36782Mfn8CuDnY/TkrzhN2P+PyJzKP3wWe8WR/2Uw2nt+6dOlSnTlzRmXKlNHhw4c1YsQIHTx4UL/88otCQ0Pd+g8fPlwjRoxwa58zZ45CQkIy3EZoaKhCQ0Pl58f8cMh+DodDp0+f1unTp7O7FAAAAABZkJycrK5du+rUqVMKCwu7at9sDdz/dPLkSZUoUUJvvPGGHnroIbflGY1wR0ZG6tixY1d9oWlpabp48SLXziLLLl68qO+//1516tTJ8jwDNptNAQEB8vf393J1AOC51NRULV++XE2bNuUSF+AmVXH4suwuwafY/YxG1nBo6AY/pTg4s9UTvwzn1rieSEpKUoECBTIVuLP9lPLL5cmTR7fddpv27NmT4XK73S673e7WHhgYeNX/LPAfCVyv1NRUXbx4Ubly5eLzBOCWcq3foQCyDxN/ZU2Kw8a+8xC/Bzzjyf66qc6zPnPmjPbu3asiRYpkdykAAAAAAFyXbA3cTz75pFatWqXExER9//336tChg/z9/dWlS5fsLAsAAAAAgOuWraeU//HHH+rSpYuOHz+uggUL6q677tKPP/6oggULZmdZAAAAAABct2wN3PPmzcvOzQMAAAAAYJmb6hpuAAAAAABuFQRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxA4AYAAAAAwAIEbgAAAAAALEDgBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACN03gHjNmjGw2mx5//PHsLgUAAAAAgOt2UwTun376SZMnT1blypWzuxQAAAAAALwi2wP3mTNndP/992vKlCnKmzdvdpcDAAAAAIBXZHvgHjBggFq3bq0mTZpkdykAAAAAAHhNQHZufN68edq0aZN++umnTPVPSUlRSkqK83FSUpIkKTU1VampqZbUCEhyfr74nAG4VXBcA25+dn+T3SX4FLufcfkTmcfvAs94sr+yLXAfOHBAjz32mJYvX66goKBMPWf06NEaMWKEW/vXX3+tkJAQb5cIuFm+fHl2lwAAXsVxDbh5vVoruyvwTSNrOLK7BJ/z5ZdfZncJPiU5OTnTfW3GmGz5CmjJkiXq0KGD/P39nW1paWmy2Wzy8/NTSkqKyzIp4xHuyMhIHTt2TGFhYTesdvz7pKamavny5WratKkCAwOzuxwAuG4c14CbX8Xhy7K7BJ9i9zMaWcOhoRv8lOKwZXc5PuWX4c2zuwSfkpSUpAIFCujUqVPXzKHZNsLduHFjbdu2zaWtV69eKlu2rJ555hm3sC1JdrtddrvdrT0wMJD/LOCG4LMG4FbDcQ24eaWkERqzIsVhY995iN8DnvFkf2Vb4A4NDVXFihVd2nLmzKn8+fO7tQMAAAAA4GuyfZZyAAAAAABuRdk6S/k/rVy5MrtLAAAAAADAKxjhBgAAAADAAgRuAAAAAAAsQOAGAAAAAMACBG4AAAAAACxwU02aBgBAVkU9+0V2l+BT7P5Gr9aSKg5fxv1qPZQ4pnV2lwAA8BGMcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAW8Dhwb9q0Sdu2bXM+/uSTT9S+fXs9//zzunDhgleLAwAAAADAV3kcuPv27atdu3ZJkn777Td17txZISEhWrBggZ5++mmvFwgAAAAAgC/yOHDv2rVLVapUkSQtWLBA9evX15w5czRjxgwtWrTI2/UBAAAAAOCTPA7cxhg5HA5J0ooVK9SqVStJUmRkpI4dO+bd6gAAAAAA8FEeB+4aNWpo1KhRmjVrllatWqXWrVtLkvbt26dChQp5vUAAAAAAAHyRx4H7rbfe0qZNmzRw4EC98MILKlWqlCRp4cKFqlOnjtcLBAAAAADAFwV4+oTKlSu7zFKe7rXXXpO/v79XigIAAAAAwNd5HLjTXbhwQUeOHHFez52uePHi110UAAAAAAC+zuPAvWvXLj300EP6/vvvXdqNMbLZbEpLS/NacQAAAAAA+CqPA3evXr0UEBCgzz//XEWKFJHNZrOiLgAAAAAAfJrHgXvLli3auHGjypYta0U9AAAAAADcEjyepbx8+fLcbxsAAAAAgGvwOHCPHTtWTz/9tFauXKnjx48rKSnJ5QcAAAAAAGThlPImTZpIkho3buzSzqRpAAAAAAD8j8eBOyEhwYo6AAAAAAC4pXgcuGNjY62oAwAAAACAW4rH13BL0po1a9StWzfVqVNHBw8elCTNmjVLa9eu9WpxAAAAAAD4Ko8D96JFi9S8eXMFBwdr06ZNSklJkSSdOnVKr7zyitcLBAAAAADAF3kcuEeNGqVJkyZpypQpCgwMdLbXrVtXmzZt8mpxAAAAAAD4Ko8D986dO1W/fn239ty5c+vkyZPeqAkAAAAAAJ/nceAuXLiw9uzZ49a+du1alSxZ0itFAQAAAADg6zwO3L1799Zjjz2mdevWyWaz6dChQ4qPj9eTTz6pfv36WVEjAAAAAAA+x+Pbgj377LNyOBxq3LixkpOTVb9+fdntdj355JN69NFHragRAAAAAACf43Hgvnjxol544QU99dRT2rNnj86cOaPy5csrV65cOnbsmAoUKGBFnQAAAAAA+BSPTynv3LmzjDHKkSOHypcvr1q1ailXrlz666+/1KBBAwtKBAAAAADA93gcuPfv36+HH37Ype3w4cNq0KCBypYt67XCAAAAAADwZR4H7i+//FLff/+9Bg8eLEk6dOiQGjRooEqVKumjjz7yeoEAAAAAAPgij6/hLliwoL7++mvdddddkqTPP/9c1apVU3x8vPz8PM7vAAAAAADckjwO3JIUGRmp5cuXq169emratKlmzZolm83m7doAAAAAAPBZmQrcefPmzTBQJycn67PPPlP+/PmdbX///bf3qgMAAAAAwEdlKnC/9dZbFpcBAAAAAMCtJVOBu0ePHlbXAQAAAADALSVL13CnpaVpyZIl2r59uySpQoUKatu2rfz9/b1aHAAAAAAAvsrjwL1nzx61atVKBw8eVJkyZSRJo0ePVmRkpL744gvFxMR4vUgAAAAAAHyNx/fxGjRokGJiYnTgwAFt2rRJmzZt0v79+xUdHa1BgwZZUSMAAAAAAD7H4xHuVatW6ccff1S+fPmcbfnz59eYMWNUt25drxYHAAAAAICv8niE22636/Tp027tZ86cUY4cObxSFAAAAAAAvi7TgXv16tVKTU1VmzZt1KdPH61bt07GGBlj9OOPP+qRRx5R27ZtrawVAAAAAACfkenA3bBhQ504cULjx49XTEyM7rzzTgUFBSkoKEh169ZVqVKl9Pbbb1tZKwAAAAAAPiPT13AbYyRJefLk0SeffKLdu3drx44dkqRy5cqpVKlS1lQIAAAAAIAP8mjSNJvN5vx76dKlVbp0aa8XBAAAAADArcCjwN2zZ0/Z7far9lm8ePF1FQQAAAAAwK3Ao8AdGhqq4OBgq2oBAAAAAOCW4VHgHj9+vMLDw62qBQAAAACAW0amZym//PptAAAAAABwdZkO3OmzlAMAAAAAgGvLdOBOSEhQvnz5rKwFAAAAAIBbRqav4Y6NjbWyDgAAAAAAbimZHuEGAAAAAACZR+AGAAAAAMACBG4AAAAAACyQqWu4k5KSMr3CsLCwLBcDAAAAAMCtIlOBO0+ePJm+D3daWtp1FQQAAAAAwK0gU4E7ISHB+ffExEQ9++yz6tmzp+68805J0g8//KCZM2dq9OjR1lQJAAAAAICPyVTgvvyWYC+99JLeeOMNdenSxdnWtm1bVapUSe+//7569Ojh/SoBAAAAAPAxHk+a9sMPP6hGjRpu7TVq1ND69eu9UhQAAAAAAL7O48AdGRmpKVOmuLVPnTpVkZGRXikKAAAAAABfl6lTyi/35ptvqmPHjlq6dKlq164tSVq/fr12796tRYsWeb1AAAAAAAB8kccj3K1atdKuXbsUFxenv//+W3///bfi4uK0a9cutWrVyooaAQAAAADwOR6PcEuXTit/5ZVXvF0LAAAAAAC3DI9HuCVpzZo16tatm+rUqaODBw9KkmbNmqW1a9d6tTgAAAAAAHyVx4F70aJFat68uYKDg7Vp0yalpKRIkk6dOsWoNwAAAAAA/8/jwD1q1ChNmjRJU6ZMUWBgoLO9bt262rRpk1eLAwAAAADAV3kcuHfu3Kn69eu7tefOnVsnT570Rk0AAAAAAPg8jwN34cKFtWfPHrf2tWvXqmTJkl4pCgAAAAAAX+dx4O7du7cee+wxrVu3TjabTYcOHVJ8fLyefPJJ9evXz4oaAQAAAADwOR7fFuzZZ5+Vw+FQ48aNlZycrPr168tut+vJJ5/Uo48+akWNAAAAAAD4HI8Dt81m0wsvvKCnnnpKe/bs0ZkzZ1S+fHnlypXLivoAAAAAAPBJHp9S/uCDD+r06dPKkSOHypcvr1q1ailXrlw6e/asHnzwQStqBAAAAADA53gcuGfOnKlz5865tZ87d04ffvihV4oCAAAAAMDXZfqU8qSkJBljZIzR6dOnFRQU5FyWlpamL7/8UuHh4ZYUCQAAAACAr8l04M6TJ49sNptsNptuu+02t+U2m00jRozwanEAAAAAAPiqTAfuhIQEGWPUqFEjLVq0SPny5XMuy5Ejh0qUKKGIiAhLigQAAAAAwNdkOnDHxsZKkvbt26fIyEj5+Xl8+TcAAAAAAP8aHt8WrESJEjpx4oSmTZum7du3S5LKly+vXr16uYx6AwAAAADwb+bxMPXq1asVFRWl8ePH68SJEzpx4oTGjx+v6OhorV692ooaAQAAAADwOR6PcA8YMED33XefJk6cKH9/f0mXZinv37+/BgwYoG3btnm9SAAAAAAAfI3HI9x79uzRf/7zH2fYliR/f38NHjxYe/bs8WpxAAAAAAD4Ko8Dd7Vq1ZzXbl9u+/btuv32271SFAAAAAAAvs7jU8oHDRqkxx57THv27NEdd9whSfrxxx/17rvvasyYMfr555+dfStXrnzVdU2cOFETJ05UYmKiJKlChQp68cUX1bJlS0/LAgAAAADgpuJx4O7SpYsk6emnn85wmc1mkzFGNptNaWlpV11XsWLFNGbMGJUuXVrGGM2cOVPt2rXT5s2bVaFCBU9LAwAAAADgpuFx4N63b5/XNh4XF+fy+OWXX9bEiRP1448/ErgBAAAAAD4tS/fhtkJaWpoWLFigs2fP6s4777RkGwAAAAAA3CgeB25JmjVrliZNmqR9+/bphx9+UIkSJfTWW28pOjpa7dq182hd27Zt05133qnz588rV65c+vjjj1W+fPkM+6akpCglJcX5OCkpSZKUmpqq1NTUrLwUIFPSP198zoCbl93fZHcJPsXuZ1z+RObxuwA3Csc1z3BcyzqOa57xZH/ZjDEefSInTpyoF198UY8//rhefvll/fLLLypZsqRmzJihmTNnKiEhwaNiL1y4oP379+vUqVNauHChpk6dqlWrVmUYuocPH64RI0a4tc+ZM0chISEebRcAAAAAAE8lJyera9euOnXqlMLCwq7a1+PAXb58eb3yyitq3769QkNDtXXrVpUsWVK//PKLGjRooGPHjl1X8U2aNFFMTIwmT57stiyjEe7IyEgdO3bsmi8UuB6pqalavny5mjZtqsDAwOwuB0AGKg5flt0l+BS7n9HIGg4N3eCnFIctu8vxKb8Mb57dJeBfguOaZziuZR3HNc8kJSWpQIECmQrcWZo0rWrVqm7tdrtdZ8+e9XR1bhwOh0uo/uc27Ha7W3tgYCAhCDcEnzXg5pWSxn+usiLFYWPfeYjfA7hR+LeZNRzXPMdxzTOe7C+PA3d0dLS2bNniNnnaV199pXLlynm0rueee04tW7ZU8eLFdfr0ac2ZM0crV67UsmV8mwcAAAAA8G0eB+7BgwdrwIABOn/+vIwxWr9+vebOnavRo0dr6tSpHq3ryJEj6t69uw4fPqzcuXOrcuXKWrZsmZo2beppWQAAAAAA3FQ8DtwPP/ywgoODNWTIEOfF4hEREXr77bfVuXNnj9Y1bdo0TzcPAAAAAIBPyNJtwe6//37df//9Sk5O1pkzZxQeHu7tugAAAAAA8GlZmjTt4sWLKl26tEJCQpy349q9e7cCAwMVFRXl7RoBAAAAAPA5fp4+oWfPnvr+++/d2tetW6eePXt6oyYAAAAAAHyex4F78+bNqlu3rlv7HXfcoS1btnijJgAAAAAAfJ7Hgdtms+n06dNu7adOnVJaWppXigIAAAAAwNd5HLjr16+v0aNHu4TrtLQ0jR49WnfddZdXiwMAAAAAwFd5PGna2LFjVb9+fZUpU0b16tWTJK1Zs0ZJSUn69ttvvV4gAAAAAAC+yOMR7vLly+vnn39Wp06ddOTIEZ0+fVrdu3fXjh07VLFiRStqBAAAAADA52TpPtwRERF65ZVXvF0LAAAAAAC3DI9HuKdPn64FCxa4tS9YsEAzZ870SlEAAAAAAPg6jwP36NGjVaBAAbf28PBwRr0BAAAAAPh/Hgfu/fv3Kzo62q29RIkS2r9/v1eKAgAAAADA13kcuMPDw/Xzzz+7tW/dulX58+f3SlEAAAAAAPg6jwN3ly5dNGjQICUkJCgtLU1paWn69ttv9dhjj6lz585W1AgAAAAAgM/xeJbykSNHKjExUY0bN1ZAwKWnOxwOde/eXS+//LLXCwQAAAAAwBd5HLhz5Mih+fPna9SoUdqyZYuCg4NVqVIllShRwor6AAAAAADwSVm6D7cklS5dWqVLl5YkJSUlaeLEiZo2bZo2bNjgteIAAAAAAPBVWQ7ckpSQkKAPPvhAixcvVu7cudWhQwdv1QUAAAAAgE/zOHAfPHhQM2bM0PTp03Xy5EmdOHFCc+bMUadOnWSz2ayoEQAAAAAAn5PpWcoXLVqkVq1aqUyZMtqyZYvGjRunQ4cOyc/PT5UqVSJsAwAAAABwmUyPcN9333165plnNH/+fIWGhlpZEwAAAAAAPi/TI9wPPfSQ3n33XbVo0UKTJk3SiRMnrKwLAAAAAACflunAPXnyZB0+fFh9+vTR3LlzVaRIEbVr107GGDkcDitrBAAAAADA52Q6cEtScHCwevTooVWrVmnbtm2qUKGCChUqpLp166pr165avHixVXUCAAAAAOBTPArclytdurReeeUVHThwQLNnz1ZycrK6dOnizdoAAAAAAPBZ13Ufbkny8/NTXFyc4uLidOTIEW/UBAAAAACAz8vyCHdGwsPDvbk6AAAAAAB8llcDNwAAAAAAuITADQAAAACABTIVuMePH6/z589Lkvbv3y9jjKVFAQAAAADg6zIVuAcPHqykpCRJUnR0tI4ePWppUQAAAAAA+LpMzVIeERGhRYsWqVWrVjLG6I8//nCOeP9T8eLFvVogAAAAAAC+KFOBe8iQIXr00Uc1cOBA2Ww21axZ062PMUY2m01paWleLxIAAAAAAF+TqcDdp08fdenSRb///rsqV66sFStWKH/+/FbXBgAAAACAz8pU4Jak0NBQVaxYUdOnT1fdunVlt9utrAsAAAAAAJ+W6cCdrkePHpKkjRs3avv27ZKk8uXLq1q1at6tDAAAAAAAH+Zx4D5y5Ig6d+6slStXKk+ePJKkkydPqmHDhpo3b54KFizo7RoBAAAAAPA5mbot2OUeffRRnT59Wv/973/1999/6++//9Yvv/yipKQkDRo0yIoaAQAAAADwOR6PcH/11VdasWKFypUr52wrX7683n33XTVr1syrxQEAAAAA4Ks8HuF2OBwKDAx0aw8MDJTD4fBKUQAAAAAA+DqPA3ejRo302GOP6dChQ862gwcP6oknnlDjxo29WhwAAAAAAL7K48A9YcIEJSUlKSoqSjExMYqJiVF0dLSSkpL0zjvvWFEjAAAAAAA+x+NruCMjI7Vp0yatWLFCO3bskCSVK1dOTZo08XpxAAAAAAD4Ko8DtyTZbDY1bdpUTZs29XY9AAAAAADcEjw+pRwAAAAAAFwbgRsAAAAAAAsQuAEAAAAAsIBHgfvixYv68MMP9ddff1lVDwAAAAAAtwSPAndAQIAeeeQRnT9/3qp6AAAAAAC4JXh8SnmtWrW0ZcsWC0oBAAAAAODW4fFtwfr376/BgwfrwIEDql69unLmzOmyvHLlyl4rDgAAAAAAX+Vx4O7cubMkadCgQc42m80mY4xsNpvS0tK8Vx0AAAAAAD7K48C9b98+K+oAAAAAAOCW4nHgLlGihBV1AAAAAABwS8nSfbhnzZqlunXrKiIiQr///rsk6a233tInn3zi1eIAAAAAAPBVHgfuiRMnavDgwWrVqpVOnjzpvGY7T548euutt7xdHwAAAAAAPsnjwP3OO+9oypQpeuGFF+Tv7+9sr1GjhrZt2+bV4gAAAAAA8FUeB+59+/apatWqbu12u11nz571SlEAAAAAAPg6jwN3dHS0tmzZ4tb+1VdfqVy5ct6oCQAAAAAAn+fxLOWDBw/WgAEDdP78eRljtH79es2dO1ejR4/W1KlTragRAAAAAACf43HgfvjhhxUcHKwhQ4YoOTlZXbt2VUREhN5++2117tzZihoBAAAAAPA5HgduSbr//vt1//33Kzk5WWfOnFF4eLi36wIAAAAAwKdlKXBL0pEjR7Rz505Jks1mU8GCBb1WFAAAAAAAvs7jSdNOnz6tBx54QBEREYqNjVVsbKwiIiLUrVs3nTp1yooaAQAAAADwOR4H7ocffljr1q3TF198oZMnT+rkyZP6/PPPtWHDBvXt29eKGgEAAAAA8Dken1L++eefa9myZbrrrrucbc2bN9eUKVPUokULrxYHAAAAAICv8niEO3/+/MqdO7dbe+7cuZU3b16vFAUAAAAAgK/zOHAPGTJEgwcP1p9//uls+/PPP/XUU09p6NChXi0OAAAAAABflalTyqtWrSqbzeZ8vHv3bhUvXlzFixeXJO3fv192u11Hjx7lOm4AAAAAAJTJwN2+fXuLywAAAAAA4NaSqcA9bNgwq+sAAAAAAOCW4vEs5Zc7c+aMHA6HS1tYWNh1FQQAAAAAwK3A40nT9u3bp9atWytnzpzOmcnz5s2rPHnyMEs5AAAAAAD/z+MR7m7duskYow8++ECFChVymUwNAAAAAABc4nHg3rp1qzZu3KgyZcpYUQ8AAAAAALcEj08pr1mzpg4cOGBFLQAAAAAA3DI8HuGeOnWqHnnkER08eFAVK1ZUYGCgy/LKlSt7rTgAAAAAAHyVx4H76NGj2rt3r3r16uVss9lsMsbIZrMpLS3NqwUCAAAAAOCLPA7cDz74oKpWraq5c+cyaRoAAAAAAFfgceD+/fff9emnn6pUqVJW1AMAAAAAwC3B40nTGjVqpK1bt1pRCwAAAAAAtwyPR7jj4uL0xBNPaNu2bapUqZLbpGlt27b1WnEAAAAAAPgqjwP3I488Ikl66aWX3JYxaRoAAAAAAJd4HLgdDocVdQAAAAAAcEvx+BpuAAAAAABwbR6PcGd0KvnlXnzxxSwXAwAAAADArcLjwP3xxx+7PE5NTdW+ffsUEBCgmJgYAjcAAAAAAMpC4N68ebNbW1JSknr27KkOHTp4pSgAAAAAAHydV67hDgsL04gRIzR06FBvrA4AAAAAAJ/ntUnTTp06pVOnTnlrdQAAAAAA+DSPTykfP368y2NjjA4fPqxZs2apZcuWXisMAAAAAABf5nHgfvPNN10e+/n5qWDBgurRo4eee+45rxUGAAAAAIAv8zhw79u3z4o6AAAAAAC4pXjtGm4AAAAAAPA/mR7hfvDBB6/Zx2azadq0addVEAAAAAAAt4JMB+4TJ05ccVlaWppWrFihlJQUAjcAAAAAAPIgcH/88ccZtn/yySd6/vnnZbfb9eKLL3qtMAAAAAAAfFmWr+H+7rvvVK9ePXXt2lVt2rTRb7/9pmeffdabtQEAAAAA4LM8Dty//vqr4uLi1KBBA912223auXOnxo4dq7x581pRHwAAAAAAPinTgfvAgQPq1auXbr/9dgUEBOjnn3/WtGnTVKxYMSvrAwAAAADAJ2X6Gu4yZcrIZrNp8ODBqlu3rnbv3q3du3e79Wvbtq1XCwQAAAAAwBdlOnCfP39ekvTaa6/ptddey7CPzWZTWlqadyoDAAAAAMCHZTpwOxwOK+sAAAAAAOCWkuVZygEAAAAAwJURuAEAAAAAsACBGwAAAAAAC2Rr4B49erRq1qyp0NBQhYeHq3379tq5c2d2lgQAAAAAgFdka+BetWqVBgwYoB9//FHLly9XamqqmjVrprNnz2ZnWQAAAAAAXLdMz1J+uZMnT2rhwoXau3evnnrqKeXLl0+bNm1SoUKFVLRo0Uyv56uvvnJ5PGPGDIWHh2vjxo2qX79+VkoDAAAAAOCm4HHg/vnnn9WkSRPlzp1biYmJ6t27t/Lly6fFixdr//79+vDDD7NczKlTpyRJ+fLly3B5SkqKUlJSnI+TkpIkSampqUpNTc3ydoFrSf988TkDbl52f5PdJfgUu59x+ROZx+8C3Cgc1zzDcS3rOK55xpP9ZTPGePSJbNKkiapVq6ZXX31VoaGh2rp1q0qWLKnvv/9eXbt2VWJioqf1Srp0n++2bdvq5MmTWrt2bYZ9hg8frhEjRri1z5kzRyEhIVnaLgAAAAAAmZWcnKyuXbvq1KlTCgsLu2pfjwN37ty5tWnTJsXExLgE7t9//11lypTR+fPns1R0v379tHTpUq1du1bFihXLsE9GI9yRkZE6duzYNV8ocD1SU1O1fPlyNW3aVIGBgdldDoAMVBy+LLtL8Cl2P6ORNRwausFPKQ5bdpfjU34Z3jy7S8C/BMc1z3BcyzqOa55JSkpSgQIFMhW4PT6l3G63O0/lvtyuXbtUsGBBT1cnSRo4cKA+//xzrV69+ophO33bdrvdrT0wMJAQhBuCzxpw80pJ4z9XWZHisLHvPMTvAdwo/NvMGo5rnuO45hlP9pfHs5S3bdtWL730kvO8dZvNpv379+uZZ55Rx44dPVqXMUYDBw7Uxx9/rG+//VbR0dGelgMAAAAAwE3J48A9btw4nTlzRuHh4Tp37pxiY2NVqlQphYaG6uWXX/ZoXQMGDNDs2bM1Z84chYaG6s8//9Sff/6pc+fOeVoWAAAAAAA3FY9PKc+dO7eWL1+utWvX6ueff9aZM2dUrVo1NWnSxOONT5w4UZLUoEEDl/bp06erZ8+eHq8PAAAAAICbRZbuwy1Jd911l+66667r2riH87UBAAAAAOAzPA7c48ePz7DdZrMpKChIpUqVUv369eXv73/dxQEAAAAA4Ks8Dtxvvvmmjh49quTkZOXNm1eSdOLECYWEhChXrlw6cuSISpYsqYSEBEVGRnq9YAAAAAAAfIHHk6a98sorqlmzpnbv3q3jx4/r+PHj2rVrl2rXrq23335b+/fvV+HChfXEE09YUS8AAAAAAD7B4xHuIUOGaNGiRYqJiXG2lSpVSq+//ro6duyo3377Ta+++qrHtwgDAAAAAOBW4vEI9+HDh3Xx4kW39osXL+rPP/+UJEVEROj06dPXXx0AAAAAAD7K48DdsGFD9e3bV5s3b3a2bd68Wf369VOjRo0kSdu2bVN0dLT3qgQAAAAAwMd4HLinTZumfPnyqXr16rLb7bLb7apRo4by5cunadOmSZJy5cqlcePGeb1YAAAAAAB8hcfXcBcuXFjLly/Xjh07tGvXLklSmTJlVKZMGWefhg0beq9CAAAAAAB8kMeBO13ZsmVVtmxZb9YCAAAAAMAtI0uB+48//tCnn36q/fv368KFCy7L3njjDa8UBgAAAACAL/M4cH/zzTdq27atSpYsqR07dqhixYpKTEyUMUbVqlWzokYAAAAAAHyOx5OmPffcc3ryySe1bds2BQUFadGiRTpw4IBiY2N17733WlEjAAAAAAA+x+PAvX37dnXv3l2SFBAQoHPnzilXrlx66aWXNHbsWK8XCAAAAACAL/I4cOfMmdN53XaRIkW0d+9e57Jjx455rzIAAAAAAHyYx9dw33HHHVq7dq3KlSunVq1a6T//+Y+2bdumxYsX64477rCiRgAAAAAAfI7HgfuNN97QmTNnJEkjRozQmTNnNH/+fJUuXZoZygEAAAAA+H8eBe60tDT98ccfqly5sqRLp5dPmjTJksIAAAAAAPBlHl3D7e/vr2bNmunEiRNW1QMAAAAAwC3B40nTKlasqN9++82KWgAAAAAAuGV4HLhHjRqlJ598Up9//rkOHz6spKQklx8AAAAAAJCFSdNatWolSWrbtq1sNpuz3Rgjm82mtLQ071UHAAAAAICP8jhwJyQkWFEHAAAAAAC3FI8Dd2xsrBV1AAAAAABwS/H4Gm5JWrNmjbp166Y6dero4MGDkqRZs2Zp7dq1Xi0OAAAAAABf5XHgXrRokZo3b67g4GBt2rRJKSkpkqRTp07plVde8XqBAAAAAAD4oizNUj5p0iRNmTJFgYGBzva6detq06ZNXi0OAAAAAABf5XHg3rlzp+rXr+/Wnjt3bp08edIbNQEAAAAA4PM8DtyFCxfWnj173NrXrl2rkiVLeqUoAAAAAAB8nceBu3fv3nrssce0bt062Ww2HTp0SPHx8XryySfVr18/K2oEAAAAAMDneHxbsGeffVYOh0ONGzdWcnKy6tevL7vdrieffFKPPvqoFTUCAAAAAOBzPA7cNptNL7zwgp566int2bNHZ86cUfny5ZUrVy4r6gMAAAAAwCd5fEr57NmzlZycrBw5cqh8+fKqVasWYRsAAAAAgH/wOHA/8cQTCg8PV9euXfXll18qLS3NiroAAAAAAPBpHgfuw4cPa968ebLZbOrUqZOKFCmiAQMG6Pvvv7eiPgAAAAAAfJLHgTsgIEBt2rRRfHy8jhw5ojfffFOJiYlq2LChYmJirKgRAAAAAACf4/GkaZcLCQlR8+bNdeLECf3+++/avn27t+oCAAAAAMCneTzCLUnJycmKj49Xq1atVLRoUb311lvq0KGD/vvf/3q7PgAAAAAAfJLHI9ydO3fW559/rpCQEHXq1ElDhw7VnXfeaUVtAAAAAAD4LI8Dt7+/vz766CM1b95c/v7+Lst++eUXVaxY0WvFAQAAAADgqzwO3PHx8S6PT58+rblz52rq1KnauHEjtwkDAAAAAEBZvIZbklavXq0ePXqoSJEiev3119WoUSP9+OOP3qwNAAAAAACf5dEI959//qkZM2Zo2rRpSkpKUqdOnZSSkqIlS5aofPnyVtUIAAAAAIDPyfQId1xcnMqUKaOff/5Zb731lg4dOqR33nnHytoAAAAAAPBZmR7hXrp0qQYNGqR+/fqpdOnSVtYEAAAAAIDPy/QI99q1a3X69GlVr15dtWvX1oQJE3Ts2DErawMAAAAAwGdlOnDfcccdmjJlig4fPqy+fftq3rx5ioiIkMPh0PLly3X69Gkr6wQAAAAAwKd4PEt5zpw59eCDD2rt2rXatm2b/vOf/2jMmDEKDw9X27ZtragRAAAAAACfk+XbgklSmTJl9Oqrr+qPP/7Q3LlzvVUTAAAAAAA+77oCdzp/f3+1b99en376qTdWBwAAAACAz/NK4AYAAAAAAK4I3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABQjcAAAAAABYgMANAAAAAIAFCNwAAAAAAFiAwA0AAAAAgAUI3AAAAAAAWIDADQAAAACABbI1cK9evVpxcXGKiIiQzWbTkiVLsrMcAAAAAAC8JlsD99mzZ3X77bfr3Xffzc4yAAAAAADwuoDs3HjLli3VsmXL7CwBAAAAAABLZGvg9lRKSopSUlKcj5OSkiRJqampSk1Nza6y8C+Q/vnicwbcvOz+JrtL8Cl2P+PyJzKP3wW4UTiueYbjWtZxXPOMJ/vLZoy5KT6RNptNH3/8sdq3b3/FPsOHD9eIESPc2ufMmaOQkBALqwMAAAAAQEpOTlbXrl116tQphYWFXbWvTwXujEa4IyMjdezYsWu+UOB6pKamavny5WratKkCAwOzuxwAGag4fFl2l+BT7H5GI2s4NHSDn1Ictuwux6f8Mrx5dpeAfwmOa57huJZ1HNc8k5SUpAIFCmQqcPvUKeV2u112u92tPTAwkBCEG4LPGnDzSknjP1dZkeKwse88xO8B3Cj828wajmue47jmGU/2F/fhBgAAAADAAtk6wn3mzBnt2bPH+Xjfvn3asmWL8uXLp+LFi2djZQAAAAAAXJ9sDdwbNmxQw4YNnY8HDx4sSerRo4dmzJiRTVUBAAAAAHD9sjVwN2jQQDfJnG0AAAAAAHgV13ADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFiBwAwAAAABgAQI3AAAAAAAWIHADAAAAAGABAjcAAAAAABYgcAMAAAAAYAECNwAAAAAAFgjI7gKQPaKe/SK7S/Apdn+jV2tJFYcvU0qaLbvL8SmJY1pndwkAAABAtmCEGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALAAgRsAAAAAAAsQuAEAAAAAsACBGwAAAAAACxC4AQAAAACwAIEbAAAAAAALELgBAAAAALDATRG43333XUVFRSkoKEi1a9fW+vXrs7skAAAAAACuS7YH7vnz52vw4MEaNmyYNm3apNtvv13NmzfXkSNHsrs0AAAAAACyLNsD9xtvvKHevXurV69eKl++vCZNmqSQkBB98MEH2V0aAAAAAABZlq2B+8KFC9q4caOaNGnibPPz81OTJk30ww8/ZGNlAAAAAABcn4Ds3PixY8eUlpamQoUKubQXKlRIO3bscOufkpKilJQU5+NTp05Jkv7++2+lpqZaW+wtJuDi2ewuwacEOIySkx0KSPVTmsOW3eX4lOPHj2d3CfiX4LjmGY5rWcdxDTcKxzXPcFzLOo5rnjl9+rQkyRhzzb7ZGrg9NXr0aI0YMcKtPTo6Ohuqwb9N1+wuwEcVGJfdFQC4Eo5rWcNxDbh5cVzLGo5rWXP69Gnlzp37qn2yNXAXKFBA/v7++uuvv1za//rrLxUuXNit/3PPPafBgwc7HzscDv3999/Knz+/bDa+xYJ1kpKSFBkZqQMHDigsLCy7ywGA68ZxDcCthuMabhRjjE6fPq2IiIhr9s3WwP1/7d15eI13/v/x18lGksqiGss0spBaxpY0ZdRW1K7aUgylaumU4YvRodqZ2jpozdBWa2hpbaNaX6VViijBWKohBFVFg6RGY4k1CYnk/P7wa749iWOy+pyT83xcV6+678+pebmuuY683vd9f24vLy89/PDD2rx5s5566ilJt0v05s2bNWLEiHyfL1eunMqVK2dzLiAg4B4kBW7z8/PjCxxAmcL3GoCyhu813Av/7cr2L4zfUj5mzBgNGDBA0dHRaty4sd5++22lpaVp4MCBpqMBAAAAAFBkxgt37969df78eU2YMEE///yzGjVqpA0bNuTbSA0AAAAAAGdivHBL0ogRI+54CzngKMqVK6eJEyfme6QBAJwV32sAyhq+1+CILNaC7GUOAAAAAAAKxc10AAAAAAAAyiIKNwAAAAAApYDCDQAAAABAKaBwAwAAAABQCijcAAAAAACUAgo3AAAuYsmSJbp582a+85mZmVqyZImBRAAAlG28FgzIIzw8vECfS0xMLOUkAFCy3N3ddfbsWQUFBdmcv3jxooKCgpSdnW0oGQAUzZQpU/TnP/9ZPj4+NuczMjL097//XRMmTDCUDLiNwg3k4ebmppCQEPXt2zffD6W/NmrUqHuYCgCKz83NTSkpKXrggQdszickJKh169ZKTU01lAwAioZBIhydh+kAgKP59NNP9dFHH2nWrFnq1KmTBg0apM6dO8vNjScwADinyMhIWSwWWSwWtW3bVh4e//fXf3Z2tk6ePKmOHTsaTAgARWO1WmWxWPKdT0hIUMWKFQ0kAmxxhRuw48yZM1q0aJEWLVqk9PR09e/fX4MHD1ZERITpaABQKJMnT87990svvaT77rsvd83Ly0uhoaHq0aOHvLy8TEUEgEIJDAyUxWLRlStX5OfnZ1O6s7Ozdf36dQ0dOlRz5swxmBKgcAMFsm3bNk2aNEnbt2/XhQsXFBgYaDoSABTa4sWL9fvf/17lypUzHQUAimXx4sWyWq0aNGiQ3n77bfn7++eu/TJIbNq0qcGEwG0UbuAubty4oZUrV+qjjz7SN998o27dumnx4sX8sArAKcXFxSknJ0dNmjSxOb9nzx65u7srOjraUDIAKJpt27apWbNmNo/KAI6Eh1KBO9izZ4/+8Ic/qEqVKpo1a5a6d++uM2fO6JNPPqFsA3Baw4cPV3Jycr7zZ86c0fDhww0kAoDiSUtL0+bNm/Od37hxo9avX28gEWCLwg3k8dvf/lZdu3aVt7e3tm3bpvj4eI0YMYLbyAE4vSNHjigqKirf+cjISB05csRAIgAonvHjx99xJ3Kr1arx48cbSATY4t4LII/vv/9evr6+WrJkiZYuXWr3c7w+B4CzKVeunFJSUhQeHm5z/uzZs9yOCcApHT9+XHXr1s13vnbt2jpx4oSBRIAt/nYF8li4cKHpCABQKtq3b69XXnlFX3zxRe4GQ5cvX9arr76qdu3aGU4HAIXn7++vxMREhYaG2pw/ceKEfH19zYQCfoVN0wAAcBFnzpxRy5YtdfHiRUVGRkqSDhw4oMqVK2vTpk0KDg42nBAACufFF1/U7t27tXr1atWoUUPS7bLdo0cPPfLII1qwYIHhhHB1FG7Ajlu3bum7777Tzz//LEmqUqWK6tatK09PT8PJAKDo0tLStGzZMiUkJMjb21sNGjRQnz59+G4D4JSuXLmijh07au/evXrwwQclST/99JNatGihVatWKSAgwGxAuDwKN5BHTk6OJkyYoDlz5ujKlSs2a/7+/hoxYoQmT54sNzf2HAQAADDNarVq06ZNNoPEli1bmo4FSKJwA/mMGzdOixYt0uuvv64OHTqocuXKkqSUlBTFxMTotdde0/PPP68333zTcFIAKJwlS5bcdf255567R0kAAHANFG4gjypVqmjx4sXq0KHDHdc3btyo5557TikpKfc4GQAUT97XG2ZlZSk9PV1eXl7y8fHh7QsAnM6UKVPuuj5hwoR7lAS4M3YpB/K4du2aqlWrZne9atWqSktLu4eJAKBkXLp0Kd+548ePa9iwYRo7dqyBRABQPKtXr7Y5zsrK0smTJ+Xh4aEaNWpQuGEcV7iBPLp06aJbt25p2bJlqlSpks3ahQsX1L9/f7m7u2vt2rWGEgJAydq7d6/69euno0ePmo4CAMV29epVPf/883r66afVv39/03Hg4ijcQB7Jycnq3Lmzjh49qvr169s8w33o0CHVrVtXa9eu5fU5AMqMAwcOqGXLlrp69arpKABQIg4dOqQnnnhCp06dMh0FLo7CDdxBTk6ONm7cqG+++cbmtWBNmzZV+/bt2aEcgFNas2aNzbHVatXZs2f13nvvKTg4WOvXrzeUDABK1o4dO/TEE0/c8VEa4F6icAMA4CLyDgstFoseeOABtWnTRjNnzlTVqlUNJQOAopk9e7bN8S+DxKVLl6pVq1b6+OOPDSUDbqNwA3dgtVp16tQpBQcHy8PDQ5mZmVq9erVu3rypzp0753u2GwAAAPdeWFiYzbGbm1vuIPGVV15RhQoVDCUDbqNwA3n88MMP6tChg5KTkxUeHq6YmBj17NlTR48eldVqlY+Pj3bt2qWIiAjTUQGgwLKyslS7dm2tXbtWderUMR0HAACXwIOoQB4vv/yyGjZsqAMHDqhr167q0qWLHnzwQV26dEmpqalq2rTpf33nIwA4Gk9PT924ccN0DAAoMVlZWfLw8NDhw4dNRwHs4go3kEdQUJBiYmLUqFEjpaWlqUKFCtq+fbuaN28uSdq1a5f69Omj06dPG04KAIUzbdo0HTt2TAsWLJCHh4fpOABQbOHh4Vq9erUaNmxoOgpwR/xtC+Rx/fp1VaxYUZLk6+srX19fm42EgoODlZKSYioeABRZXFycNm/erJiYGNWvX1++vr4266tWrTKUDACK5i9/+YteffVVLV26NPfnN8CRULiBPKpVq6akpCRVr15dkjRjxgwFBQXlrp8/f16BgYGm4gFAkQUEBKhHjx6mYwBAiXnvvfd04sQJVatWTSEhIfkGifHx8YaSAbdRuIE8Hn/8cR09ejT3FvJhw4bZrMfExCgqKspENAAoloULF5qOAAAl6sknn5TFYjEdA7CLZ7iBQjp58qTKly/P+2oBOJ02bdpo1apVCggIsDl/9epVPfXUU9qyZYuZYAAAlFHsUg4UUlhYGGUbgFPaunWrMjMz852/ceOG/v3vfxtIBADFEx4erosXL+Y7f/nyZYWHhxtIBNjilnLgDr7//nt98803atq0qWrXrq2jR4/qnXfe0c2bN9WvXz+1adPGdEQAKLCDBw/m/vrIkSP6+eefc4+zs7O1YcMG/eY3vzERDQCK5dSpU8rOzs53/ubNm/rpp58MJAJsUbiBPDZs2KAnn3xS9913n9LT07V69Wo999xzatiwoXJyctS+fXvFxMRQugE4jUaNGslischisdzxu8vb21vvvvuugWQAUDRr1qzJ/fXGjRvl7++fe5ydna3NmzcrLCzMRDTABs9wA3k8+uijatOmjf72t7/pk08+0R//+EcNGzZMU6dOlSS98sor2rdvn2JiYgwnBYCCOX36tKxWq8LDw/Xtt9/qgQceyF3z8vJSUFCQ3N3dDSYEgMJxc7v9ZKzFYlHeOuPp6anQ0FDNnDlTXbt2NREPyEXhBvLw9/fXvn37VLNmTeXk5KhcuXL69ttvFRkZKUk6fPiwHn/8cZtbMgEAAHDvhYWFKS4uTpUqVTIdBbgjNk0D7uCX10u4ubmpfPnyNrcpVahQQVeuXDEVDQCKbPHixVq3bl3u8bhx4xQQEKBHH31Up0+fNpgMAIrm5MmT+cr25cuXzYQB7oDCDeQRGhqq48eP5x7v3r1b1atXzz1OSkpil3IATmnatGny9vaWdPu77b333tOMGTNUqVIl/elPfzKcDgAK780339Snn36ae9yzZ09VrFhRv/nNb5SQkGAwGXAbhRvIY9iwYTa7XdarV08eHv+3v+D69evZMA2AU0pOTlbNmjUlSZ9//rmeeeYZ/eEPf9D06dN5LRgApzRv3jwFBwdLkjZt2qSvv/5aGzZsUKdOnTR27FjD6QB2KQfyGTp06F3Xp02bdo+SAEDJuu+++3Tx4kVVr15dMTExGjNmjCSpfPnyysjIMJwOAArv559/zi3ca9euVa9evdS+fXuFhoaqSZMmhtMBXOEGCmT58uVKS0szHQMAiqVdu3YaMmSIhgwZomPHjqlz586SpO+++06hoaFmwwFAEQQGBio5OVnS7Ve7Pv7445Ikq9V6x/dzA/cahRsogBdffFEpKSmmYwBAscyZM0dNmzbV+fPn9dlnn+n++++XJO3bt099+vQxnA4ACq979+7q27ev2rVrp4sXL6pTp06SpP379+c+QgOYxGvBgAKoUKGCEhISFB4ebjoKAAAA/r+srCy98847Sk5O1vPPP5/7Gte33npLFSpU0JAhQwwnhKujcAMFQOEGUFZcvnxZ3377rc6dO6ecnJzc8xaLRf379zeYDACAsofCDRTAjh07FB0drfLly5uOAgBF9uWXX+rZZ5/V9evX5efnJ4vFkrtmsViUmppqMB0AFM3x48cVGxubb5AoSRMmTDCUCriNwg0UkNVqVU5Ojtzd3U1HAYAieeihh9S5c2dNmzZNPj4+puMAQLHNnz9fw4YNU6VKlVSlSpV8g8T4+HiD6QAKN5DPrVu3NGnSJP373//WY489psmTJ+vvf/+7Jk2apFu3bun3v/+95s+fLy8vL9NRAaBQfH19dejQIR6PAVBmhISE6I9//KNefvll01GAO2KXciCPyZMna8GCBYqOjtbKlSs1bNgwzZ49Wx988IHmz5+vzZs36+233zYdEwAKrUOHDtq7d6/pGABQYi5duqSePXuajgHYxRVuII8aNWronXfeUdeuXXXixAnVqlVLH3/8sXr37i1JWrFihV5//XUdOnTIcFIAKJwPP/xQU6ZM0cCBA1W/fn15enrarHfr1s1QMgAomsGDB+uRRx7R0KFDTUcB7ojCDeTh7e2tY8eOKTg4OPd4//79ql27tiTp5MmTatiwoa5evWoyJgAUmpub/RvbLBaLsrOz72EaACi+6dOna9asWerSpcsdB4kjR440lAy4zcN0AMDR+Pv76/Lly7mFOyoqShUqVMhdv3nzps2GHADgLPLu3gsAzu6DDz7Qfffdp23btmnbtm02axaLhcIN4yjcQB5169ZVfHy86tevL0nauXOnzfqhQ4cUERFhIhoAAAB+5eTJk6YjAHdF4QbymDdvXr7bkX4tKytL48aNu4eJAKB4Zs+eXaDPcSUIAICSxTPcAACUcWFhYf/1MxaLRYmJifcgDQAU35gxYwr0uVmzZpVyEuDuuMINFECXLl20YMECVa1a1XQUACg0brkEUNbs37//v36GPXfgCLjCDRRAhQoVlJCQoPDwcNNRAKBE/PTTT6pWrdpddy4HAADFw9+yAAC4oLp16+rUqVOmYwBAidm5c6du3rxpOgZgg8INFEBISMhdN1IDAGfDDW4AyppOnTrpzJkzpmMANniGGyiAw4cPm44AAACAu2CQCEdE4Qb+i/T0dCUlJSkzM9PmfIMGDQwlAoDie/XVV1WxYkXTMQAAKNPYNA2w4/z58xo4cKDWr19/x/Xs7Ox7nAgAAAD2fPzxx3ryySfl6+trOgqQi2e4ATtGjx6ty5cva8+ePfL29taGDRu0ePFiRUREaM2aNabjAUChHD9+XJ999lnuK8LWrVunli1b6pFHHtHUqVO5FROA0+vbty9lGw6Hwg3YsWXLFs2aNUvR0dFyc3NTSEiI+vXrpxkzZmj69Omm4wFAga1evVp169ZV3759VadOHS1ZskTPPPOMfH19VblyZU2aNEkzZswwHRMACiU2NlYzZ87Uzp07JUnvv/++qlevrgceeEAvvPCCMjIyDCcEKNyAXWlpaQoKCpIkBQYG6vz585Kk+vXrKz4+3mQ0ACiUqVOnaty4cbpx44bmzp2roUOHavr06Vq/fr3Wrl2rOXPmaNGiRaZjAkCBzZ8/X+3atdO8efPUtm1bTZ8+XS+99JK6dOmiXr16acWKFZo8ebLpmACFG7CnVq1a+uGHHyRJDRs21Pvvv68zZ85o3rx5qlq1quF0AFBwP/zwgwYNGiSLxaIBAwYoMzNTjz/+eO56+/btdfr0aYMJAaBw3nnnHb311ls6fvy4Pv/8c02YMEFz5szR3LlzNWfOHC1YsEArV640HRNgl3LAnlGjRuns2bOSpIkTJ6pjx45atmyZvLy8uBIEwKmkpaWpQoUKkiQ3Nzd5e3vLx8cnd93b21s3b940FQ8ACi0xMVHdunWTJHXs2FEWi0WNGzfOXW/SpImSk5NNxQNyUbgBO/r165f764cfflinT5/W0aNHVb16dVWqVMlgMgAoHIvFIovFYvcYAJzNjRs35O3tnXtcrlw5lStXzub41q1bJqIBNijcQAH5+PgoKirKdAwAKDSr1aqHHnoot2Rfv35dkZGRcnNzy10HAGdisVh07do1lS9fXlarVRaLRdevX9fVq1clKfffgGkUbsAOq9WqlStXKjY2VufOnVNOTo7N+qpVqwwlA4DCWbhwoekIAFCifhkk/vo4MjLS5pg7eeAIKNyAHaNHj9b777+v1q1bq3LlynxpA3BaAwYMMB0BAEpUbGys6QhAgVis3EcG3FHFihX1r3/9S507dzYdBQAAAIAT4rVggB3+/v4KDw83HQMASl1CQoLc3d1NxwCAQsnOzlZiYmLuY383b97UihUr9MknnyglJcVwOuA2Cjdgx6RJkzR58mRlZGSYjgIApY4b3gA4k4MHD+rBBx9URESEGjZsqOTkZEVHR2vQoEF64YUXVKdOHcXFxZmOCXBLOWBPRkaGnn76ae3cuVOhoaHy9PS0WY+PjzeUDAAKp3v37nddv3LlirZu3ars7Ox7lAgAiqdjx46qUKGCJk6cqAULFigmJkb16tXTsmXLZLFYNHDgQP3888/atGmT6ahwcRRuwI5evXopNjZWzzzzzB03TZs4caKhZABQOJ6enmrXrp0qV658x/XU1FStXbuWwg3AaVSsWFE7d+5UnTp1lJGRoQoVKmjXrl1q3LixJOm7775Tq1atdOHCBcNJ4erYpRywY926ddq4caOaN29uOgoAFEudOnXUo0cPDR48+I7rBw4c0Nq1a+9xKgAoOqvVKg+P21Um778lyd3dPd8rXQETeIYbsCM4OFh+fn6mYwBAsT388MN3fQymXLlyql69+j1MBADF8/DDD+vNN9/UmTNnNH36dIWFhem9997LXX/33XdVr149gwmB27ilHLBj3bp1evfddzVv3jyFhoaajgMARXbz5k1lZ2fLx8fHdBQAKBFxcXHq1KmTLl26pPvvv1+xsbEaPHiwTp8+LTc3N126dElffvml2rZtazoqXByFG7AjMDBQ6enpunXrlnx8fPJtmpaammooGQAAANLS0nT06FHVqlVL9913n27cuKFly5YpIyND7dq1U61atUxHBCjcgD2LFy++6/qAAQPuURIAKHldunTRggULVLVqVdNRAAAosyjcAAC4oAoVKighIUHh4eGmowBAiahfv76++uorBQcHm44C5GKXcqAAbty4oczMTJtzbKgGAADgOE6dOqWsrCzTMQAb7FIO2JGWlqYRI0YoKChIvr6+CgwMtPkHAJxZSEhIvr0pAABAyaJwA3aMGzdOW7Zs0dy5c1WuXDktWLBAkydPVrVq1bRkyRLT8QCgWA4fPsxtlwDKlBYtWsjb29t0DMAGz3ADdlSvXl1LlizRY489Jj8/P8XHx6tmzZpaunSpli9frq+++sp0RAAolJycHLm55Z+15+Tk6KeffuJd3AAAlDCucAN2pKam5m4m5Ofnl/sasObNm2v79u0mowFAoVy9elW9evWSr6+vKleurAkTJig7Ozt3/fz58woLCzOYEABK1qVLl7gjEQ6Bwg3YER4erpMnT0qSateurRUrVkiSvvzySwUEBBhMBgCF89prrykhIUFLly7V1KlTtWTJEj355JM2m0FywxuAsiQpKUkDBw40HQPglnLAnrfeekvu7u4aOXKkvv76az3xxBOyWq3KysrSrFmzNGrUKNMRAaBAQkJCtHjxYj322GOSpAsXLqhLly4KCAjQmjVrdPnyZVWrVs3mqjcAOLKrV6/edf3gwYNq1aoV32swjsINFNDp06e1b98+1axZUw0aNDAdBwAKzMfHR999953NbePXrl1Thw4d5O3trQULFqhmzZr8YArAabi5uclisdhdt1qtslgsfK/BOAo3AABlXO3atTVr1ix17tzZ5vz169fVvn17paen69ChQ/xgCsBp+Pv76y9/+YuaNGlyx/Xjx4/rxRdf5HsNxnmYDgA4ktmzZxf4syNHjizFJABQctq3b6+FCxfmK9z33XefNm7cqHbt2hlKBgBFExUVJUlq1arVHdcDAgLYmwIOgSvcwK8UdJdei8WixMTEUk4DACXj0qVL+s9//qPf/va3d1y/du2a4uPj7f7gCgCOZv78+crIyLB7ASQlJUXz5s3TxIkT73EywBaFGwAAAACAUsBrwYBCSkxMVPv27U3HAIASs3fvXm3fvt10DAAAyhyucAOFlJCQoKioKDbhAFBm1KlTR8eOHeN7DUCZsXfvXqWnp6tly5amo8DFsWkaAAAubvPmzcrKyjIdAwBKTP/+/RkkwiFQuAEAcHHVqlUzHQEAShSDRDgKCjcAAC4iOztb7u7uucd79uzRzZs31bRpU3l6ehpMBgAli0EiHAWFG8gjMjJSFovF7np6evo9TAMAxXf27Fn17NlT33zzjZo1a6bPP/9c/fv311dffSVJioiI0NatW1W1alXDSQGgcBgkwtFRuIE8nnrqKdMRAKBEvfzyy7JarVq9erWWLVumrl27yt3dXcnJycrOzlbfvn01depUvffee6ajAkCBMEiEs2CXcgAAyrhq1app1apV+t3vfqfU1FRVqlRJmzZtUtu2bSVJW7Zs0QsvvKAff/zRcFIAKJjnnntOP/74o8aPH69ly5YpOTlZ7u7uWr58ee4gsVGjRgwSYRxXuIECeOONNzR06FAFBASYjgIAhXbp0iX95je/kSRVrFhRPj4+CgkJyV2vWbOmzp49ayoeABTa119/nTtIbNasWe4g8ZfvuilTpuiFF14wnBKQ3EwHAJzBtGnTlJqaajoGABRJUFCQTaEeMWKEKlasmHt86dIl+fr6mogGAEXCIBHOgsINFABPXgBwZo0aNdLu3btzj9944w2bwr1jxw41aNDARDQAKBIGiXAW3FIOAEAZ98UXX9x1/ZFHHlGrVq3uURoAKL5fBomNGzeWdHuQ+GsMEuEo2DQNKIDk5GRVq1bN5rUTAAAAcEzffvutfHx8VK9ePdNR4OIo3IAdcXFxysnJUZMmTWzO79mzR+7u7oqOjjaUDACKz8/PTwcOHFB4eLjpKAAAlFk8ww3YMXz4cCUnJ+c7f+bMGQ0fPtxAIgAoOczbAZQ1fn5+SkxMNB0DsEHhBuw4cuSIoqKi8p2PjIzUkSNHDCQCAACAPQwS4Ygo3IAd5cqVU0pKSr7zZ8+elYcH+w0CcG79+vWTn5+f6RgAAJRpFG7Ajvbt2+uVV17RlStXcs9dvnxZr776qtq1a2cwGQAU39y5c1WpUiXTMQCgxDBIhCNi0zTAjjNnzqhly5a6ePGiIiMjJUkHDhxQ5cqVtWnTJgUHBxtOCACFl5aWpm3btikpKUmZmZk2ayNHjjSUCgCAsonCDdxFWlqali1bpoSEBHl7e6tBgwbq06ePPD09TUcDgELbv3+/OnfurPT0dKWlpalixYq6cOGCfHx8FBQUxGZDAJwSg0Q4Mgo3AAAu4rHHHtNDDz2kefPmyd/fXwkJCfL09FS/fv00atQode/e3XREACgUBolwdBRu4C6OHz+u2NhYnTt3Tjk5OTZrEyZMMJQKAIomICBAe/bsUa1atRQQEKDdu3erTp062rNnjwYMGKCjR4+ajggAhcIgEY6OrZYBO+bPn69hw4apUqVKqlKliiwWS+6axWKhcANwOp6ennJzu71falBQkJKSklSnTh35+/srOTnZcDoAKLwDBw7o/fffl5ubm9zd3XXz5k2Fh4drxowZGjBgAIUbxlG4ATv+9re/aerUqXr55ZdNRwGAEhEZGam4uDhFRESoVatWmjBhgi5cuKClS5eqXr16puMBQKExSISj47VggB2XLl1Sz549TccAgBIzbdo0Va1aVZI0depUBQYGatiwYTp//rw++OADw+kAoPB+GSRKyh0kLlu2TKNHj2aQCIfAM9yAHYMHD9YjjzyioUOHmo4CAACAO9i7d6+uXbum1q1b69y5c3ruuee0a9cuRURE6KOPPlLDhg1NR4SLo3ADdkyfPl2zZs1Sly5dVL9+/XyvAuM1EwAAAADuhsIN2BEWFmZ3zWKx8JoJAE7n4sWLmjBhgt23L6SmphpKBgBA2cSmaYAdJ0+eNB0BAEpU//79deLECQ0ePFiVK1e2efsCADgjBolwdBRuAABcxL///W/t2LGDZxoBlBkMEuHoKNzAr4wZM0avv/66fH19NWbMmLt+dtasWfcoFQCUjNq1aysjI8N0DAAoMQwS4ego3MCv7N+/X1lZWbm/tofpKQBn9M9//lPjx4/XhAkTVK9evXybQfr5+RlKBgBFwyARjo5N0wAAcBHHjx9X3759FR8fb3PearXKYrEoOzvbUDIAKJq4uDgGiXBoXOEGAMBFPPvss/L09NTHH3/Ms44AyoSAgABdvXpVbdq0sTnPIBGOgsIN2HHjxg29++67dne9zHuFCAAc3eHDh7V//37VqlXLdBQAKBEMEuHoKNyAHYMHD1ZMTIyeeeYZNW7cmC9wAE4vOjpaycnJFG4AZQaDRDg6Cjdgx9q1a/XVV1+pWbNmpqMAQIn4n//5H40aNUpjx45V/fr18z3r2KBBA0PJAKBoGCTC0bFpGmBH3bp19cknn/ADKIAyw83NLd85i8XCs44AnNb//u//atKkSQwS4bAo3IAd69ev1+zZszVv3jyFhISYjgMAxXb69Om7rvNdB8DZMEiEo+OWcsCO6Oho3bhxQ+Hh4fLx8ck3MU1NTTWUDACKhkINoKw5efKk6QjAXVG4ATv69OmjM2fOaNq0aex6CaBMOXLkiJKSkpSZmWlzvlu3boYSAUDRMEiEo+OWcsAOHx8f7d69Ww0bNjQdBQBKRGJiop5++mkdOnQo95ZLSbkDRW69BOCsGCTCUXGFG7Cjdu3aysjIMB0DAErMqFGjFBYWps2bNyssLEzffvutLl68qJdeekn/+Mc/TMcDgEJjkAhHl3+XAQCSpDfeeEMvvfSStm7dqosXL+rq1as2/wCAs9m9e7emTJmiSpUqyc3NTW5ubmrevLmmT5+ukSNHmo4HAIX2yyDx3Llz8vHx0Xfffaft27crOjpaW7duNR0P4Ao3YE/Hjh0lSW3btrU5z66XAJxVdna2KlSoIEmqVKmS/vOf/6hWrVoKCQnRDz/8YDgdABTe7t27tWXLFruDxP3795uOCBdH4QbsiI2NNR0BAEpUvXr1lJCQoLCwMDVp0kQzZsyQl5eXPvjgA4WHh5uOBwCFxiARjo7CDdjRqlUr0xEAoET99a9/VVpamiRpypQp6tq1q1q0aKH7779fn376qeF0AFB4DBLh6NilHLiLy5cv68MPP9T3338vSfrtb3+rQYMGyd/f33AyACgZqampCgwM5NWHAJzSxo0blZaWpu7du+vEiRPq2rWrjh07ljtIbNOmjemIcHEUbsCOvXv3qkOHDvL29lbjxo0lSXFxccrIyFBMTIyioqIMJwQAAEBeDBLhSCjcgB0tWrRQzZo1NX/+fHl43H764tatWxoyZIgSExO1fft2wwkB4L/r3r17gT+7atWqUkwCAIDr4RluwI69e/falG1J8vDw0Lhx4xQdHW0wGQAUHI/AAChrGCTCmVC4ATv8/PyUlJSk2rVr25xPTk7O3Q0TABzdwoULTUcAgBLFIBHOhMIN2NG7d28NHjxY//jHP/Too49Kknbu3KmxY8eqT58+htMBQMk5ePCgoqOjlZmZaToKAPxXDBLhTCjcgB3/+Mc/ZLFY9Nxzz+nWrVuSJE9PTw0bNkxvvPGG4XQAUHKsVmvu9xwAlAUMEuEo2DQN+C/S09P1448/SpJq1KghHx8fw4kAoGQlJCQoKipK2dnZpqMAQIlISEhQZGSkcnJyTEeBi+MKN/Bf+Pj4qH79+qZjAAAAoBB4LRgcAYUbsKN169Z3/aLesmXLPUwDAEV39erVu65fu3btHiUBAMC1ULgBOxo1amRznJWVpQMHDujw4cMaMGCAmVAAUAQBAQF3HSBarVauBAFwKgwS4Swo3IAdb7311h3PT5o0SdevX7/HaQCg6GJjY01HAIASxSARzoJN04BCOnHihBo3bqzU1FTTUQAAAFzStm3bCvS5Vq1alXIS4O64wg0U0u7du1W+fHnTMQCgWLp06aIFCxaoatWqpqMAQKFRpOEsKNyAHd27d7c5tlqtOnv2rPbu3avXXnvNUCoAKBnbt29XRkaG6RgAUGIYJMIRUbgBO/z9/W2O3dzcVKtWLU2ZMkXt27c3lAoAAAB3wiARjojCDdixcOFC0xEAoNSEhITI09PTdAwAAMo0CjfwX2RmZurcuXPKycmxOV+9enVDiQCg+A4fPmw6AgCUKAaJcETsUg7YcezYMQ0ePFi7du2yOf/Layays7MNJQOA4klPT1dSUpIyMzNtzjdo0MBQIgAAyiaucAN2DBw4UB4eHlq7dq2qVq3KuxwBOL3z589r4MCBWr9+/R3XGSQCcFYMEuGoKNyAHQcOHNC+fftUu3Zt01EAoESMHj1aly9f1p49e/TYY49p9erVSklJ0d/+9jfNnDnTdDwAKDQGiXB0bqYDAI6qbt26unDhgukYAFBitmzZolmzZik6Olpubm4KCQlRv379NGPGDE2fPt10PAAotF8PEr29vbVhwwYtXrxYERERWrNmjel4AFe4gV+7evVq7q/ffPNNjRs3TtOmTVP9+vXzbcLh5+d3r+MBQLGkpaUpKChIkhQYGKjz58/roYceUv369RUfH284HQAU3pYtW/TFF1/YDBLbtWsnPz8/TZ8+XV26dDEdES6Owg38SkBAgM2z2larVW3btrX5DJumAXBWtWrV0g8//KDQ0FA1bNhQ77//vkJDQzVv3jxVrVrVdDwAKDQGiXB0FG7gV2JjY01HAIBSM2rUKJ09e1aSNHHiRHXs2FHLli2Tl5eXFi1aZDYcABQBg0Q4Ol4LBuTRtm1bDR8+XN27d7/j+oULF9S4cWMlJibe42QAULLS09N19OhRVa9eXZUqVTIdBwAK7V//+pdu3bql559/Xvv27VPHjh2VmpqaO0js3bu36YhwcRRuIA83Nze5ubnpL3/5iyZPnpxvPSUlRdWqVeOWcgAAAAfDIBGOhlvKgTuYO3eu/vznP+vgwYP617/+JV9fX9ORAKDYrFarVq5cqdjYWJ07d045OTk266tWrTKUDABKho+Pj6KiokzHAHJRuIE7ePLJJ9W8eXM9+eST+t3vfqcvvvhC4eHhpmMBQLGMHj1a77//vlq3bq3KlSvbbBIJAM6IQSIcHYUbsKNOnTqKi4tTnz599Mgjj+jTTz/V448/bjoWABTZ0qVLtWrVKnXu3Nl0FAAoEQwS4ego3MBd+Pv7a926dXrllVfUuXNnvfnmm+rbt6/pWABQJP7+/tytA6BMYZAIR0fhBvLIOxm1WCx644031KhRIw0ZMkRbtmwxlAwAimfSpEmaPHmyPvroI3l7e5uOAwDFxiARjo5dyoE83Nzc9PPPPysoKCjf2oEDB/TUU08pOTmZXcoBOJ2MjAw9/fTT2rlzp0JDQ+Xp6WmzHh8fbygZABTN4sWLtWHDBgaJcFhc4QbyiI2NVcWKFe+41qhRI+3bt0/r1q27x6kAoPgGDBigffv2qV+/fjzrCKBM6NWrl5YvX66goCAGiXBIXOEGAMBF+Pr6auPGjWrevLnpKABQInr16qXY2Fg988wzdxwkTpw40VAy4DaucAMA4CKCg4Pl5+dnOgYAlJh169YxSIRDczMdAAAA3BszZ87UuHHjdOrUKdNRAKBEMEiEo+OWcgAAXERgYKDS09N169Yt+fj45HvWMTU11VAyACiadevW6d1339W8efMUGhpqOg6QD4UbAAAXsXjx4ruuDxgw4B4lAYCSwSARjo7CDQAAAMApMUiEo6NwAwDggm7cuKHMzEybczwHCQBAyWKXcgAAXERaWppefvllrVixQhcvXsy3np2dbSAVAJQMBolwROxSDgCAixg3bpy2bNmiuXPnqly5clqwYIEmT56satWqacmSJabjAUChpaWlacSIEQoKCpKvr68CAwNt/gFMo3ADAOAivvzyS/3zn/9Ujx495OHhoRYtWuivf/2rpk2bpmXLlpmOBwCFxiARjo7CDQCAi0hNTVV4eLik27dZ/rJ7b/PmzbV9+3aT0QCgSBgkwtFRuAEAcBHh4eE6efKkJKl27dpasWKFpNs/sAYEBBhMBgBFwyARjo7CDQCAixg4cKASEhIkSePHj9ecOXNUvnx5/elPf9LYsWMNpwOAwmOQCEfHa8EAAHBRp0+f1r59+1SzZk01aNDAdBwAKLS33npL7u7uGjlypL7++ms98cQTslqtysrK0qxZszRq1CjTEeHiKNwAAAAAygQGiXA0FG4AAMqw2bNnF/izI0eOLMUkAAC4Hgo3AABlWFhYWIE+Z7FYlJiYWMppAKD4GCTCmVC4AQAAADgNBolwJhRuAAAAAABKgYfpAAAAoPSMGTOmwJ+dNWtWKSYBAMD1ULgBACjD9u/fX6DPWSyWUk4CACWDQSKcCYUbAIAyLDY21nQEAChRDBLhTHiGGwAAF7R8+XJ169ZNvr6+pqMAAFBmuZkOAAAA7r0XX3xRKSkppmMAQIlZvny50tLSTMcAbFC4AQBwQdzgBqCsYZAIR0ThBgAAAOD0GCTCEVG4AQBwQevXr1e1atVMxwAAoExj0zQAAAAATm/Hjh2Kjo5W+fLlTUcBclG4AQBwIStXrtSKFSuUlJSkzMxMm7X4+HhDqQAAKJu4pRwAABcxe/ZsDRw4UJUrV9b+/fvVuHFj3X///UpMTFSnTp1MxwOAIlm5cqV69eql3/3ud4qKirL5BzCNwg0AgIv45z//qQ8++EDvvvuuvLy8NG7cOG3atEkjR47UlStXTMcDgEJjkAhHR+EGAMBFJCUl6dFHH5UkeXt769q1a5Kk/v37a/ny5SajAUCRMEiEo6NwAwDgIqpUqaLU1FRJUvXq1fXNN99Ikk6ePMnrdAA4JQaJcHQUbgAAXESbNm20Zs0aSdLAgQP1pz/9Se3atVPv3r319NNPG04HAIXHIBGOjl3KAQBwETk5OcrJyZGHh4ck6ZNPPtGuXbsUERGhF198UV5eXoYTAkDhDBkyRMHBwZo4caLmzJmjsWPHqlmzZtq7d6+6d++uDz/80HREuDgKNwAALiIpKUnBwcGyWCw2561Wq5KTk1W9enVDyQCgaBgkwtFRuAEAcBHu7u46e/asgoKCbM5fvHhRQUFBys7ONpQMAIqGQSIcHc9wAwDgIqxWa74fSiXp+vXrKl++vIFEAFA8YWFhOn/+fL7zqampCgsLM5AIsOVhOgAAAChdY8aMkSRZLBa99tpr8vHxyV3Lzs7Wnj171KhRI0PpAKDoGCTC0VG4AQAo4/bv3y/p9g+mhw4dsnmm0cvLSw0bNtSf//xnU/EAoNAYJMJZULgBACjjYmNjJd1+Fdg777wjPz8/w4kAoHgYJMJZsGkaAAAu5sSJE/rxxx/VsmVLeXt7270lEwAcHYNEODoKNwAALiI1NVU9e/ZUbGysLBaLjh8/rvDwcA0aNEiBgYGaOXOm6YgAUCQMEuGo2KUcAAAXMXr0aHl6eiopKcnmecfevXtrw4YNBpMBQNGkpqaqbdu2euihh9S5c2edPXtWkjR48GC99NJLhtMBFG4AAFxGTEyM3nzzTT344IM25yMiInT69GlDqQCg6BgkwtGxaRoAAC4iLS3N5gfSX6SmpqpcuXIGEgFA8cTExGjjxo0MEuGwuMINAICLaNGihZYsWZJ7bLFYlJOToxkzZqh169YGkwFA0TBIhKPjCjcAAC5ixowZatu2rfbu3avMzEyNGzdO3333nVJTU7Vz507T8QCg0H4ZJL7++uuSGCTC8bBLOQAALuTy5cuaM2eOEhISdP36dUVFRWn48OGqWrWq6WgAUGiHDx9W27ZtFRUVpS1btqhbt242g8QaNWqYjggXR+EGAMCF3LhxQwcPHtS5c+eUk5Njs9atWzdDqQCg6BgkwpFRuAEAcBEbNmxQ//79lZqaqrx//VssFmVnZxtKBgBFxyARjozCDQCAi4iIiFD79u01YcIEVa5c2XQcACg2BolwdBRuAABchJ+fn/bv388zjQDKDAaJcHS8FgwAABfxzDPPaOvWraZjAECJSUlJ0ZgxYyjbcFhc4QYAwEWkp6erZ8+eeuCBB1S/fn15enrarI8cOdJQMgAomkGDBqlZs2YaPHiw6SjAHVG4AQBwER9++KGGDh2q8uXL6/7775fFYslds1gsSkxMNJgOAAqPQSIcHYUbAAAXUaVKFY0cOVLjx4+XmxtPlQFwfgwS4ego3AAAuIiKFSsqLi6OTdMAlBkMEuHo+H8lAAAuYsCAAfr0009NxwCAEpOZmanevXtTtuGwPEwHAAAA90Z2drZmzJihjRs3qkGDBvmedZw1a5ahZABQNL8MEl999VXTUYA7onADAOAiDh06pMjISEnS4cOHbdZ+/dwjADgLBolwdDzDDQAAAMAptW7d2u6axWLRli1b7mEaID8KNwAAAAAApYDdBQAAAAAAKAUUbgAAAAAASgGFGwAAAACAUkDhBgAAubZu3SqLxaLLly8X+L8JDQ3V22+/XWqZAABwVhRuAACcyPPPPy+LxaKhQ4fmWxs+fLgsFouef/75ex8MAADkQ+EGAMDJBAcH65NPPlFGRkbuuRs3bujjjz9W9erVDSYDAAC/RuEGAMDJREVFKTg4WKtWrco9t2rVKlWvXl2RkZG5527evKmRI0cqKChI5cuXV/PmzRUXF2fze3311Vd66KGH5O3trdatW+vUqVP5/vd27NihFi1ayNvbW8HBwRo5cqTS0tJK7c8HAEBZQeEGAMAJDRo0SAsXLsw9/uijjzRw4ECbz4wbN06fffaZFi9erPj4eNWsWVMdOnRQamqqJCk5OVndu3fXE088oQMHDmjIkCEaP368ze/x448/qmPHjurRo4cOHjyoTz/9VDt27NCIESNK/w8JAICTo3ADAOCE+vXrpx07duj06dM6ffq0du7cqX79+uWup6Wlae7cufr73/+uTp06qW7dupo/f768vb314YcfSpLmzp2rGjVqaObMmapVq5aeffbZfM9/T58+Xc8++6xGjx6tiIgIPfroo5o9e7aWLFmiGzdu3Ms/MgAATsfDdAAAAFB4DzzwgLp06aJFixbJarWqS5cuqlSpUu76jz/+qKysLDVr1iz3nKenpxo3bqzvv/9ekvT999+rSZMmNr9v06ZNbY4TEhJ08OBBLVu2LPec1WpVTk6OTp48qTp16pTGHw8AgDKBwg0AgJMaNGhQ7q3dc+bMKZX/jevXr+vFF1/UyJEj862xQRsAAHdH4QYAwEl17NhRmZmZslgs6tChg81ajRo15OXlpZ07dyokJESSlJWVpbi4OI0ePVqSVKdOHa1Zs8bmv/vmm29sjqOionTkyBHVrFmz9P4gAACUUTzDDQCAk3J3d9f333+vI0eOyN3d3WbN19dXw4YN09ixY7VhwwYdOXJEL7zwgtLT0zV48GBJ0tChQ3X8+HGNHTtWP/zwgz7++GMtWrTI5vd5+eWXtWvXLo0YMUIHDhzQ8ePH9cUXX7BpGgAABUDhBgDAifn5+cnPz++Oa2+88YZ69Oih/v37KyoqSidOnNDGjRsVGBgo6fYt4Z999pk+//xzNWzYUPPmzdO0adNsfo8GDRpo27ZtOnbsmFq0aKHIyEhNmDBB1apVK/U/GwAAzs5itVqtpkMAAAAAAFDWcIUbAAAAAIBSQOEGAAAAAKAUULgBAAAAACgFFG4AAAAAAEoBhRsAAAAAgFJA4QYAAAAAoBRQuAEAAAAAKAUUbgAAAAAASgGFGwAAAACAUkDhBgAAAACgFFC4AQAAAAAoBRRuAAAAAABKwf8DaRzp0Mqrc5QAAAAASUVORK5CYII=",
+      "text/plain": [
+       "<Figure size 1200x800 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "accepted_lengths = []\n",
+    "\n",
+    "for ssm in small_model_names:\n",
+    "    for batch_size in batch_sizes:\n",
+    "        for arrival_rate in arrival_rates:\n",
+    "            model_name = ssm.replace(\"/\", \"-\")\n",
+    "            filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "            if os.path.exists(filepath):\n",
+    "                accepted_lengths.append({\n",
+    "                    'Model': model_name,\n",
+    "                    'Batch Size': batch_size,\n",
+    "                    'Arrival Rate': arrival_rate,\n",
+    "                    'Accepted Length': get_accepted_len(filepath)\n",
+    "                })\n",
+    "\n",
+    "accepted_df = pd.DataFrame(accepted_lengths)\n",
+    "\n",
+    "# # Create a bar plot\n",
+    "# fig, ax = plt.subplots(figsize=(12, 8))\n",
+    "# accepted_df.pivot_table(index=['Model', 'Batch Size'], columns='Arrival Rate', values='Accepted Length').plot(kind='bar', ax=ax)\n",
+    "# plt.title('Accepted Length by Model, Batch Size, and Arrival Rate')\n",
+    "# plt.ylabel('Accepted Length')\n",
+    "# plt.xlabel('Model and Batch Size')\n",
+    "# plt.legend(title='Arrival Rate')\n",
+    "# plt.show()\n",
+    "# Group by model and calculate the mean of accepted lengths\n",
+    "average_accepted_df = accepted_df.groupby('Model')['Accepted Length'].mean().reset_index()\n",
+    "\n",
+    "# Sort the dataframe by 'Accepted Length' in ascending order\n",
+    "average_accepted_df = average_accepted_df.sort_values(by='Accepted Length')\n",
+    "\n",
+    "# Create a bar plot\n",
+    "fig, ax = plt.subplots(figsize=(12, 8))\n",
+    "average_accepted_df.plot(x='Model', y='Accepted Length', kind='bar', ax=ax)\n",
+    "plt.title('Average Number of Accepted Tokens per Step\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n",
+    "plt.ylabel('Average Number of Accepted Tokens')\n",
+    "plt.xlabel('Model')\n",
+    "plt.grid(True)  # Turn the grid on\n",
+    "\n",
+    "# Save the plot as a PDF\n",
+    "plt.savefig('/usr/FlexFlow/wildchat/average_accepted_tokens.pdf')\n",
+    "\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAB8UAAAHvCAYAAADNQw6XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HupSwcVRBABkfiCXewNsCEidrELRmOPGjUajQXFGnvXaAKKEOy9koixJZbYYi8RNTZEpQhSd74/+HbisksVXBbv33VxwZ45M/PM7DqP8Mw5IxEEQQAREREREREREREREREREVEppKXuAIiIiIiIiIiIiIiIiIiIiIoLi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERERERERERERERUarEoTkREREREREREREREREREpRaL4kREREREREREREREREREVGqxKE5ERERERERERERERERERKUWi+JERERERERERERERERERFRqsShORERERERERERERERERESlFoviRERERERUYp04cQISiQQ7duxQdyiFEh0dDYlEgkWLFqk7FCoi7969w+DBg2FtbQ2JRIKxY8eqOyQlEokEgYGBCm0XLlxAkyZNYGRkBIlEgitXrgAAjhw5gtq1a0MqlUIikSAuLu6Tx6upAgMDIZFIinUfDg4OCAgIKNZ9EBERERERfQ5YFCciIiIiok9KIpHk6+vEiRPqDlXjHTp0SKk4qg7y4mFeXx4eHgCAgIAAhXZTU1PUqlULixcvRmpqqtL2z5w5gy5duqB8+fLQ19eHg4MDhg4disePH4t95Dco5OcrOjo6x2OZO3cuQkJCMHz4cISGhqJ///5FfboUODg4iHFpaWnB3NwcNWrUwJAhQ3Du3Ll8bSM9PR09evTAmzdvsHTpUoSGhsLe3h6vX7+Gn58fDAwMsHr1aoSGhsLIyKhYj6ewnj17hsDAQLGYXxBr1qyBRCJBw4YNiz6wEiL7Z9jU1BTu7u44ePBgobcZHh6OZcuWFV2QREREREREaqSj7gCIiIiIiOjzEhoaqvB68+bNiIyMVGp3cXHBrVu3PmVopc6hQ4ewevVqtRfGu3btiipVqoiv3717h+HDh6NLly7o2rWr2F6+fHnxZ319fWzcuBEAEBcXh507d2LChAm4cOECIiIixH4rV67EmDFjULlyZXz99deoUKECbt26hY0bN2Lr1q04dOgQmjRpAktLS6XP2OLFi/Hvv/9i6dKlCu2WlpY5Hsvx48fRqFEjzJgxo3AnoxBq166N8ePHAwASExNx69YtbN++HRs2bMA333yDJUuWKPR///49dHT++3X/wYMHePToETZs2IDBgweL7UeOHEFiYiKCgoLQunXrT3MwhfTs2TPMnDkTDg4OqF27doHWDQsLg4ODA86fP4/79+8rfBY/xtSpU/Hdd98VybaKQps2bTBgwAAIgoBHjx5h7dq18PX1xeHDh+Hl5VXg7YWHh+P69eslcjYEIiIiIiKigmJRnIiIiIiIPql+/fopvP7zzz8RGRmp1A7go4viycnJMDQ0/Kht0MerWbMmatasKb6OjY3F8OHDUbNmTZXvOwDo6OgoLBsxYgQaNmyIrVu3YsmSJbCxscGZM2cwduxYNGvWDEeOHFF4r4cPH46mTZuie/fuuHHjBiwsLJT2FRERgbdv3+YYgyoxMTFwdXXNd/+8ZGRkQCaTQU9PL8c+tra2SjEuWLAAffr0wdKlS+Hs7Izhw4eLy6RSqVLMAGBubp6v9o+RlJRUokabP3z4EGfPnsWuXbswdOhQhIWF5euGhtzeF/kx6ujoKNx8oG5ffPGFwuekW7ducHV1xfLlywtVFCciIiIiIipNOH06ERERERGVeDKZDHPmzEHFihUhlUrRqlUr3L9/X6GPh4cHqlevjr/++gstWrSAoaEhpkyZAiCr+Ddo0CCUL18eUqkUtWrVwqZNmxTWlz+/PPu07fJpt0NCQhTat2/fDldXV0ilUlSvXh27d+9GQEAAHBwcVB7Djz/+CCcnJ+jr66N+/fq4cOGCwvKAgAAYGxvjn3/+gZeXF4yMjGBjY4NZs2ZBEIQCxxkQEIDVq1cDUJxaOScdOnRA5cqVVS5r3Lgx6tWrJ76OjIxEs2bNYG5uDmNjY1StWlU818VFS0tLnF5dPr15UFAQJBIJNm3apHTzg5OTE3744Qc8f/4c69ev/+j9y8/7w4cPcfDgQaWp1vPzGfvwGfPLli0TPw83b94scDwGBgYIDQ1FmTJlMGfOHIXPyIfPFA8ICIC7uzsAoEePHuI09R4eHvD39wcA1K9fHxKJROHZ1efOnUO7du1gZmYGQ0NDuLu748yZMwoxyKfFv3nzJvr06QMLCws0a9ZMXL5lyxa4ubnBwMAAZcqUQa9evfDkyROFbcj/3d68eROenp4wNDSEra0tfvjhB4VzX79+fQDAwIEDxXOf/d+kKmFhYbCwsICPjw+6d++OsLAwpT65vS+5HWP2Z4pXr14dnp6eStuXyWSwtbVF9+7dxbZFixahSZMmKFu2LAwMDODm5oYdO3bkeTwF4eLignLlyuHBgwcK7Xv37oWPjw9sbGygr68PJycnBAUFITMzU+zj4eGBgwcP4tGjR+L5/vDalpqaihkzZqBKlSrQ19eHnZ0dJk6cqPR4A3VcK4iIiIiIiFQpObc0ExERERER5WD+/PnQ0tLChAkTEB8fjx9++AF9+/ZVeqby69ev4e3tjV69eqFfv34oX7483r9/Dw8PD9y/fx+jRo2Co6Mjtm/fjoCAAMTFxWHMmDEFjufgwYPo2bMnatSogXnz5uHt27cYNGgQbG1tVfYPDw9HYmIihg4dColEgh9++AFdu3bFP//8A11dXbFfZmYm2rVrh0aNGuGHH37AkSNHMGPGDGRkZGDWrFkFinHo0KF49uyZyqnpVenZsycGDBiACxcuiAVIAHj06BH+/PNPLFy4EABw48YNdOjQATVr1sSsWbOgr6+P+/fvKxVMi4O8uFe2bFkkJyfjt99+Q/PmzeHo6JjjMQ0ZMgQHDhz46GmuXVxcEBoaim+++QYVK1YUpzO3tLQs8GcsODgYKSkpGDJkCPT19VGmTJlCxWRsbIwuXbrgp59+ws2bN1GtWjWlPkOHDoWtrS3mzp2L0aNHo379+uI09VWrVsWPP/6IWbNmwdHREU5OTgCypoj39vaGm5sbZsyYAS0tLQQHB6Nly5Y4deoUGjRooLCPHj16wNnZGXPnzhWL83PmzMG0adPg5+eHwYMH49WrV1i5ciVatGiBy5cvK4xOf/v2Ldq1a4euXbvCz88PO3bswKRJk1CjRg14e3vDxcUFs2bNwvTp0zFkyBA0b94cANCkSZM8z1FYWBi6du0KPT099O7dG2vXrlX6jMvl9r6oOsbsevbsicDAQLx48QLW1tZi++nTp/Hs2TP06tVLbFu+fDk6duyIvn37Ii0tDREREejRowcOHDgAHx+fPI8rP+Lj4/H27VvxfZULCQmBsbExxo0bB2NjYxw/fhzTp09HQkKC+O/8+++/R3x8vMLjBYyNjQFkFfk7duyI06dPY8iQIXBxccHff/+NpUuX4u7du9izZw8A9V4riIiIiIiIlAhERERERERqNHLkSCGnX02ioqIEAIKLi4uQmpoqti9fvlwAIPz9999im7u7uwBAWLduncI2li1bJgAQtmzZIralpaUJjRs3FoyNjYWEhASFfUVFRSms//DhQwGAEBwcLLbVqFFDqFixopCYmCi2nThxQgAg2NvbK61btmxZ4c2bN2L73r17BQDC/v37xTZ/f38BgPD111+LbTKZTPDx8RH09PSEV69eFTjO3M5tdvHx8YK+vr4wfvx4hfYffvhBkEgkwqNHjwRBEISlS5cKAMR4CuPVq1cCAGHGjBkql/v7+wtGRkbCq1evhFevXgn3798X5s6dK0gkEqFmzZqCIAjClStXBADCmDFjct1XzZo1hTJlyqhc5uPjo/B+5Ye9vb3g4+Oj0Jbfz5j8PTI1NRViYmIKvb8Pyd+PvXv3im3Zz638M7N9+3aFdYODgwUAwoULF8Q2mUwmODs7C15eXoJMJhPbk5OTBUdHR6FNmzZi24wZMwQAQu/evRW2Gx0dLWhrawtz5sxRaP/7778FHR0dhXb5v9vNmzeLbampqYK1tbXQrVs3se3ChQtKn++8XLx4UQAgREZGisdWsWJFpc9Mbu9LTsf44TK5O3fuCACElStXKvQbMWKEYGxsLCQnJ4ttH/4sCFmfl+rVqwstW7ZUaLe3txf8/f3zPFYAwqBBg4RXr14JMTExwsWLF4V27doJAISFCxcq9M2+b0EQhKFDhwqGhoZCSkqK2JbTv4/Q0FBBS0tLOHXqlEL7unXrBADCmTNnBEEommsFERERERFRUeH06UREREREVOINHDhQ4dm+8pGi//zzj0I/fX19DBw4UKHt0KFDsLa2Ru/evcU2XV1djB49Gu/evcPvv/9eoFiePXuGv//+GwMGDBBHTgKAu7s7atSooXKdnj17wsLCIs/4AWDUqFHizxKJBKNGjUJaWhp+/fXXAsVZUKampvD29sa2bdsURsJu3boVjRo1QqVKlQD89/zpvXv3QiaTFVs8SUlJsLS0hKWlJapUqYIpU6agcePG2L17NwAgMTERAGBiYpLrdkxMTJCQkFBscQIF/4x169YNlpaWRbJv+WdQfj4+1pUrV3Dv3j306dMHr1+/RmxsLGJjY5GUlIRWrVrh5MmTSu/7sGHDFF7v2rULMpkMfn5+4vqxsbGwtraGs7MzoqKilI7hw2dh6+npoUGDBir/fRREWFgYypcvL05pLpFI0LNnT0RERChMFS6X2/uS/RhV+eKLL1C7dm1s3bpVbMvMzMSOHTvg6+sLAwMDsf3Dn9++fYv4+Hg0b94cly5dyvfxZffTTz/B0tISVlZWqFevHn777TdMnDgR48aNU+j34b4TExMRGxuL5s2bIzk5Gbdv385zP9u3b4eLiwv+97//Kby/LVu2BADx/f1U1woiIiIiIqL8YFGciIiIiIhKPHlBVk5eYH779q1Cu62trULxHMia/tvZ2RlaWoq//ri4uIjLC0Lev0qVKkrLVLUB+Y9fS0tL6bneX3zxBYD/nqNdnHr27IknT57gjz/+AJA1Xflff/2Fnj17KvRp2rQpBg8ejPLly6NXr17Ytm1bkRe9pFIpIiMjERkZiZMnT+LJkyc4c+aMeH7kxfC8isGJiYl5Fs4/VkE/YzlN914Y7969A5D3zQH5de/ePQCAv7+/eFOC/Gvjxo1ITU1FfHy8wjrZj+fevXsQBAHOzs5K27h16xZiYmIU+lesWFHpefcWFhZK/z4KIjMzExEREfD09MTDhw9x//593L9/Hw0bNsTLly/x22+/Ka2T2/uS3/esZ8+eOHPmDJ4+fQog63noMTExCv+GAODAgQNo1KgRpFIpypQpA0tLS6xdu1bp3BZEp06dEBkZiYMHD4rPO09OTlb6XN64cQNdunSBmZkZTE1NYWlpKd6UkJ/937t3Dzdu3FB6b+XXKvn7+6muFURERERERPnBZ4oTEREREVGJp62trbJdyPZs3w9HQBZU9qKcnKoRpQWV3/jzozjj9PX1haGhIbZt24YmTZpg27Zt0NLSQo8ePcQ+BgYGOHnyJKKionDw4EEcOXIEW7duRcuWLXHs2LEcj7WgtLW10bp16xyXV6lSBTo6Orh27VqOfVJTU3Hnzh3Uq1evSGIqKh/zOc3u+vXrAHK+IaOg5AXLhQsXonbt2ir7fDhDAqB8PDKZDBKJBIcPH1b5eci+flH++5A7fvw4nj9/joiICERERCgtDwsLQ9u2bRXacntf8vue9ezZE5MnT8b27dsxduxYbNu2DWZmZmjXrp3Y59SpU+jYsSNatGiBNWvWoEKFCtDV1UVwcDDCw8PzeYTKKlasKP6bad++PcqVK4dRo0bB09MTXbt2BQDExcXB3d0dpqammDVrFpycnCCVSnHp0iVMmjQpXwVrmUyGGjVqYMmSJSqX29nZAfh01woiIiIiIqL8YFGciIiIiIhKNXt7e1y7dg0ymUxhxKR8mmB7e3sA/43ejouLU1g/+yhfef/79+8r7UtVW0HIZDL8888/4ohLALh79y4AwMHBoUBxAjkX0HNiZGSEDh06YPv27ViyZAm2bt2K5s2bw8bGRqGflpYWWrVqhVatWmHJkiWYO3cuvv/+e0RFReVayC5KRkZG8PT0xPHjx/Ho0SPxffnQtm3bkJqaig4dOhRrLPn9jBW1d+/eYffu3bCzsxNHpX8sJycnAFnT6Rf2vXRycoIgCHB0dFT4LH+Mgn6Ww8LCYGVlhdWrVyst27VrF3bv3o1169YV6Q0KQNaI8gYNGmDr1q0YNWoUdu3ahc6dO0NfX1/ss3PnTkilUhw9elShPTg4uEhjGTp0KJYuXYqpU6eiS5cukEgkOHHiBF6/fo1du3ahRYsWYt+HDx8qrZ/TOXdycsLVq1fRqlWrPN+XknCtICIiIiIiAjh9OhERERERlXLt27fHixcvFJ7zm5GRgZUrV8LY2Bju7u4AsgqX2traOHnypML6a9asUXhtY2OD6tWrY/PmzeLU1QDw+++/4++///7oeFetWiX+LAgCVq1aBV1dXbRq1apAcQJZhWNAuYCem549e+LZs2fYuHEjrl69qjTt85s3b5TWkY8oTk1Nzfd+isLUqVMhCAICAgLw/v17hWUPHz7ExIkTUaFCBQwdOrRY48jvZ6wovX//Hv3798ebN2/w/fffF7honBM3Nzc4OTlh0aJFCp9vuVevXuW5ja5du0JbWxszZ85UGu0tCAJev35d4LgK8ll+//49du3ahQ4dOqB79+5KX6NGjUJiYiL27dtX4Djyo2fPnvjzzz/x888/IzY2VunfkLa2NiQSicLsDtHR0dizZ0+RxqGjo4Px48fj1q1b2Lt3r7hvQHEUflpaWo7XD1XTqfv5+eHp06fYsGGD0rL3798jKSkJQMm6VhAREREREXGkOBERERERlWpDhgzB+vXrERAQgL/++gsODg7YsWMHzpw5g2XLlonPYjYzM0OPHj2wcuVKSCQSODk54cCBA0rPPwaAuXPnolOnTmjatCkGDhyIt2/fYtWqVahevbrKQmJ+SaVSHDlyBP7+/mjYsCEOHz6MgwcPYsqUKbC0tCxwnG5ubgCA0aNHw8vLC9ra2ujVq1euMbRv3x4mJiaYMGECtLW10a1bN4Xls2bNwsmTJ+Hj4wN7e3vExMRgzZo1qFixIpo1a1boYy+MFi1aYNGiRRg3bhxq1qyJgIAAVKhQAbdv38aGDRsgk8lw6NAhcXR9ccnvZ6ywnj59ii1btgDIGh1+8+ZNbN++HS9evMD48eOLtOivpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/f3+u23BycsLs2bMxefJkREdHo3PnzjAxMcHDhw+xe/duDBkyBBMmTChQXE5OTjA3N8e6detgYmICIyMjNGzYUOWzvvft24fExER07NhR5bYaNWoES0tLhIWFKRWsi4Kfnx8mTJiACRMmoEyZMkojon18fLBkyRK0a9cOffr0QUxMDFavXo0qVark+jiAwggICMD06dOxYMECdO7cGU2aNIGFhQX8/f0xevRoSCQShIaGqpyq3s3NDVu3bsW4ceNQv359GBsbw9fXF/3798e2bdswbNgwREVFoWnTpsjMzMTt27exbds2HD16FPXq1StR1woiIiIiIiIWxYmIiIiIqFQzMDDAiRMn8N1332HTpk1ISEhA1apVERwcjICAAIW+K1euRHp6OtatWwd9fX34+flh4cKFqF69ukI/X19f/PLLLwgMDMR3330HZ2dnhISEYNOmTbhx40ahY9XW1saRI0cwfPhwfPvttzAxMcGMGTMwffr0QsXZtWtXfP3114iIiMCWLVsgCEKeRXGpVIqOHTsiLCwMrVu3hpWVlcLyjh07Ijo6WhwFW65cObi7u2PmzJkwMzMr9LEX1jfffIN69eph8eLFWLZsGeLj41GhQgX06NED33//fbFNXf6hgnzGCuPKlSvo378/JBIJTExMYGdnB19fXwwePBgNGjT4+APIxsPDA3/88QeCgoKwatUqvHv3DtbW1mjYsGG+C/DfffcdvvjiCyxduhQzZ84EkPWs6bZt2+ZYrM6Nrq4uNm3ahMmTJ2PYsGHIyMhAcHCwyqJ4WFgYpFIp2rRpo3JbWlpa8PHxQVhYWKFGreelYsWKaNKkCc6cOYPBgwdDV1dXYXnLli3x008/Yf78+Rg7diwcHR2xYMECREdHF3lR3MDAAKNGjUJgYCBOnDgBDw8PHDhwAOPHj8fUqVNhYWGBfv36oVWrVvDy8lJYd8SIEbhy5QqCg4OxdOlS2Nvbw9fXF1paWtizZw+WLl2KzZs3Y/fu3TA0NETlypUxZswYccr8knatICIiIiKiz5tEUHU7MBERERERERVY7dq1YWlpicjIyAKvGxAQgB07dnzUSHMiIiIiIiIiIlLGZ4oTEREREREVUHp6OjIyMhTaTpw4gatXr8LDw0M9QRERERERERERkUqcPp2IiIiIiKiAnj59itatW6Nfv36wsbHB7du3sW7dOlhbW2PYsGHqDo+IiIiIiIiIiD7AojgREREREVEBWVhYwM3NDRs3bsSrV69gZGQEHx8fzJ8/H2XLllV3eERERERERERE9AE+U5yIiIiIiIiIiIiIiIiIiEotPlOciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiIiIiIiIiIiIio1GJRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiEqokJAQSCQSXLx4Mcc+0dHRkEgkWLRoUa7bcnBwgEQiQevWrVUu37BhAyQSSZ77y01gYCAkEgliY2Nz7HPixAlIJBLs2LEj39v18/ODRCLBpEmTct2mRCLBli1bVPZp2rQpJBIJqlevrnJ5ZmYmbGxsIJFIcPjw4XzHBgDffPMN6tatizJlysDQ0BAuLi4IDAzEu3fv8rX+2rVr0aNHD1SqVAkSiQQBAQEF2r/8vVX15ezsrNT/p59+gouLC6RSKZydnbFy5UqlPgEBAQrb0dHRgZ2dHXr16oWbN2/mK678fH4/xs2bNxEYGIjo6Ohi2b6mxEBERERERER501F3AERERERERPRpSKVSREVF4cWLF7C2tlZYFhYWBqlUipSUFDVFp1pCQgL2798PBwcH/PLLL5g/fz4kEonKvlKpFOHh4ejXr59Ce3R0NM6ePQupVJrjfo4fP47nz5/DwcEBYWFh8Pb2zneMFy5cQPPmzTFw4EBIpVJcvnwZ8+fPx6+//oqTJ09CSyv3+9EXLFiAxMRENGjQAM+fP8/3fuWWLVumVIB/9OgRpk6dirZt2yq0r1+/HsOGDUO3bt0wbtw4nDp1CqNHj0ZycrLSTQf6+vrYuHEjACAjIwMPHjzAunXrcOTIEdy8eRM2NjYFjrUo3bx5EzNnzoSHhwccHBw+2xiIiIiIiIgobyyKExERERERfSaaNm2KCxcuYOvWrRgzZozY/u+//+LUqVPo0qULdu7cqcYIle3cuROZmZn4+eef0bJlS5w8eRLu7u4q+7Zv3x779u1DbGwsypUrJ7aHh4ejfPnycHZ2xtu3b1Wuu2XLFtStWxf+/v6YMmUKkpKSYGRklK8YT58+rdTm5OSECRMm4Pz582jUqFGu6//+++/iKHFjY+N87fNDnTt3VmqbPXs2AKBv375i2/v37/H999/Dx8dHHKn/1VdfQSaTISgoCEOGDIGFhYXYX0dHR+kGg0aNGqFDhw44ePAgvvrqqwLHqi6CICAlJQUGBgbqDoWIiIiIiIjUgNOnExERERERfSakUim6du2K8PBwhfZffvkFFhYW8PLyUlonPT0dt2/fLtQI5qIQFhaGNm3awNPTEy4uLggLC8uxb6dOnaCvr4/t27crtIeHh8PPzw/a2toq13v//j12796NXr16wc/PD+/fv8fevXs/Km75qOG4uLg8+9rb2+c4+r2wwsPD4ejoiCZNmohtUVFReP36NUaMGKHQd+TIkUhKSsLBgwfz3K58hgEdncLdYx8QEABjY2M8ffoUnTt3hrGxMSwtLTFhwgRkZmYq9I2IiICbmxtMTExgamqKGjVqYPny5QCypmbv0aMHAMDT01Oc5v3EiRMAss5/hw4dcPToUdSrVw8GBgZYv369+LiBkJAQpdgkEgkCAwMV2p4+fYpBgwbBxsYG+vr6cHR0xPDhw5GWlpZnDERERERERFRysChORERERET0GenTpw/Onz+PBw8eiG3h4eHo3r07dHV1lfo/ffoULi4umDx58qcMEwDw7NkzREVFoXfv3gCA3r17Y8eOHUhLS1PZ39DQEJ06dcIvv/witl29ehU3btxAnz59ctzPvn378O7dO/Tq1QvW1tbw8PDItfiuSkZGBmJjY/Hs2TMcO3YMU6dOhYmJCRo0aFCg7RSFy5cv49atW0rHfPnyZQBAvXr1FNrd3NygpaUlLv9QbGwsYmNj8fLlS/zxxx/45ptvULZsWXTo0KHQ8WVmZsLLywtly5bFokWL4O7ujsWLF+PHH38U+0RGRqJ3796wsLDAggULMH/+fHh4eODMmTMAgBYtWmD06NEAgClTpiA0NBShoaFwcXERt3Hnzh307t0bbdq0wfLly1G7du0Cxfns2TM0aNAAERER6NmzJ1asWIH+/fvj999/R3Jycr5iICIiIiIiopKB06cTERERERF9Rlq2bAlra2v88ssvmDp1Km7duoUrV65g+fLl+Oeff9QdnoJffvkF+vr66NSpEwCgV69emD59Og4dOqRyynAgq+jv6+uLJ0+ewM7ODmFhYahcuXKuU5hv2bIFTZo0gZ2dnbifESNG4NWrV7C0tMxXrBcvXkTjxo3F11WrVsW+fftQpkyZfB5t0ZEX9D+cOh0Anj9/Dm1tbVhZWSm06+npoWzZsnj27JlCe1JSktLx29ra4tixY/k+L6qkpKSgZ8+emDZtGgBg2LBhqFu3Ln766ScMHz4cAHDw4EGYmpri6NGjKkf4V65cGc2bN8eKFSvQpk0beHh4KPW5f/8+jhw5ojADQnR0dL7jnDx5Ml68eIFz584p3Egwa9YsCIIAc3PzPGMgIiIiIiKikoEjxYmIiIiIiD4j2tra8PPzE0dTh4WFwc7ODs2bN1fZ38HBAYIgqJxuuriFhYXBx8cHJiYmAABnZ2e4ubnlOoq7bdu2KFOmDCIiIiAIAiIiIsSR5qq8fv0aR48eVejTrVs3SCQSbNu2Ld+xurq6IjIyEnv27MHEiRNhZGSEd+/e5Xv9oiKTyRAREYE6deoojVh+//499PT0VK4nlUrx/v17pbbIyEhERkbi6NGjWL9+PYyNjdG+fXvcvXv3o+IcNmyYwuvmzZsr3JRhbm6OpKQkREZGFnofjo6OKh8JkB8ymQx79uyBr6+v0sh6AEU+3T0REREREREVL44UJyIiIiIi+sz06dMHK1aswNWrVxEeHo5evXqVuCLfrVu3cPnyZQwYMAD3798X2z08PLB69WokJCTA1NRUaT1dXV306NED4eHhaNCgAZ48eZLr1Olbt25Feno66tSpo7Cfhg0bIiwsDCNHjgQAvHnzRmHadgMDA5iZmYmvTU1N0bp1awBZzzYPDw9Hp06dcOnSJdSqVavwJ+L/vX//HvHx8Qpt8ud7f+j333/H06dP8c033ygtMzAwyHHq+ZSUFBgYGCi0aWtri8ck1759ezg7O2Py5MnYuXMnMjMz8erVK4U+ZcqUybH4DmQV27OPNLewsMDbt2/F1yNGjMC2bdvg7e0NW1tbtG3bFn5+fmjXrl2O283O0dEx332ze/XqFRISElC9evVCb4OIiIiIiIhKDo4UJyIiIiIi+sw0bNgQTk5OGDt2LB4+fJhr0VhdtmzZAgD45ptv4OzsLH4tXrwYKSkp2LlzZ47r9unTB1euXEFgYCBq1aoFV1fXHPvKR503bdpUYT+nT5/GH3/8IY5e7tq1KypUqCB+jRkzJtf4u3btCgCIiIgo0HHnZOvWrQr7r1ChQo7Ho6WlpXJ0fIUKFZCZmYmYmBiF9rS0NLx+/Ro2NjZ5xlGxYkVUrVoVJ0+eBAA8efJEKa6zZ8/mug1V06FnZ2VlhStXrmDfvn3o2LEjoqKi4O3tDX9//zzXlcte5AdyHuGdmZmZ7+0SERERERGR5uFIcSIiIiIios9Q7969MXv2bLi4uKB27drqDkeBIAgIDw+Hp6cnRowYobQ8KCgIYWFhGDhwoMr1mzVrhkqVKuHEiRNYsGBBjvt5+PAhzp49i1GjRsHd3V1hmUwmQ//+/REeHo6pU6di8eLFCiOZ8yogp6amQiaTKY3uLiwvL688pxJPTU3Fzp074eHhoTI++ft88eJFtG/fXmy/ePEiZDJZvj8HGRkZ4tTw1tbWSnEVxch4IOtZ576+vvD19YVMJsOIESOwfv16TJs2DVWqVCnU7AYWFhYAgLi4OIX2R48eKby2tLSEqakprl+/nuv2StoMC0RERERERKQai+JERERERESfocGDB0NbWxsNGzbMtV96ejoePHgAMzOzHEcnF7UzZ84gOjoas2bNQvfu3ZWW3717F9OmTcOzZ89UFn8lEglWrFiBy5cvo3///jnuRz5KfOLEibCzs1NavnHjRoSFhWHq1Klwc3NTuY24uDgYGRlBV1dXaV0ACs+jTk5OxuPHj1GuXDmUK1cux7hUyW10uNyhQ4cQFxeHvn37qlzesmVLlClTBmvXrlUoiq9duxaGhobw8fHJM467d+/izp074vmQSqVKU6wXhdevX6Ns2bLiay0tLdSsWRNAVvEfAIyMjAAoF7hzY2pqinLlyuHkyZMYO3as2L5mzRqFflpaWujcuTO2bNmCixcvKj1XXBAESCSSQsVAREREREREnx6L4kRERERERCXczz//jCNHjii1fziF92+//YaUlBSlPp07d1b5XGR7e3sEBgbmue+nT5/CxcUF/v7+CAkJyVe8S5YsgaGhoUKblpYWpkyZIr7euXMnbt++rbSuv78/wsLCoK2tnWORtmPHjvj+++8RERGBcePGqezTqVMndOrUKdc4w8LCULt2bZUFcfl+vv76a1y6dAl169ZV2efEiRMYPXo0unfvDmdnZ6SlpeHUqVPYtWsX6tWrh379+ol9z58/D09PT8yYMUPh3O/fvx9Xr14FkHUTwrVr1zB79mwxBnkxOC9hYWHQ19dHt27dVC43MDBAUFAQRo4ciR49esDLywunTp3Cli1bMGfOHJQpU0ahf0ZGhjiNvUwmQ3R0NNatWweZTIYZM2bkK6bCGjx4MN68eYOWLVuiYsWKePToEVauXInatWvDxcUFQNbId21tbSxYsADx8fHQ19dHy5YtYWVllee258+fj8GDB6NevXo4efIk7t69q9Rv7ty5OHbsGNzd3TFkyBC4uLjg+fPn2L59O06fPg1zc/NCx0BERERERESfFoviREREREREJdzatWtVtgcEBIg/HzlyRGXh3MHBQWVRvDjNmzdPqU1bW1uhKJ7Ts7bd3d2xfft2NGnSRKlIK1e9enU4Ojpiy5YtORbF83Lp0iXcvn0b06ZNy7GPr68vvv76a2zZsiXHoniNGjXg6emJvXv34vnz5xAEAU5OTpg+fTq+/fZb6Onp5RnLzp07sWnTJvH15cuXcfnyZQBZz/DOT1E8ISEBBw8ehI+PD8zMzHLsN2LECOjq6mLx4sXYt28f7OzssHTpUpXPSE9NTVUYaW9qaor69esjNDQUrVq1yjOmj9GvXz/8+OOPWLNmDeLi4mBtbY2ePXsiMDAQWlpaALKmbl+3bh3mzZuHQYMGITMzE1FRUXkWpKdPn45Xr15hx44d2LZtG7y9vXH48GGl9WxtbXHu3DlMmzYNYWFhSEhIgK2tLby9vcWbPgobAxEREREREX1aEkEQBHUHQUREREREREREREREREREVBy01B0AERERERERERERERERERFRcWFRnIiIiIiIiIiIiIiIiIiISi0WxYmIiIiIiIiIiIiIiIiIqNRiUZyIiIiIiIiIiIiIiIiIiEotFsWJiIiIiIiIiIiIiIiIiKjUYlGciIiIiIiIiIiIiIiIiIhKLRbFiYiIiIiI6LMUEhICiUSCixcvFvu+JBIJAgMDi30/RERERERERKSMRXEiIiIiIiIqdvIC9IdfVlZW8PT0xOHDhwu93blz52LPnj1FF2gBnT59Gt7e3rC1tYVUKkWlSpXg6+uL8PBwtcVU1Nq0aQOJRIJRo0Z91Hb++usvdOjQAdbW1jA2NkbNmjWxYsUKZGZmFlGkRERERERERKrpqDsAIiIiIiIi+nzMmjULjo6OEAQBL1++REhICNq3b4/9+/ejQ4cOBd7e3Llz0b17d3Tu3Lnog83D9u3b0bNnT9SuXRtjxoyBhYUFHj58iJMnT2LDhg3o06eP2Pf9+/fQ0dG8X8F37dqFP/7446O389dff6FJkyZwdnbGpEmTYGhoiMOHD2PMmDF48OABli9fXgTREhEREREREammeb+RExERERERkcby9vZGvXr1xNeDBg1C+fLl8csvvxSqKK5OgYGBcHV1xZ9//gk9PT2FZTExMQqvpVLppwytSKSkpGD8+PGYNGkSpk+f/lHbWr9+PQDg5MmTKFOmDABg6NChcHd3R0hICIviREREREREVKw4fToRERERERGpjbm5OQwMDJRGUS9atAhNmjRB2bJlYWBgADc3N+zYsUOhj0QiQVJSEjZt2iROyR4QECAuf/r0KQYNGgQbGxvo6+vD0dERw4cPR1pamsJ2UlNTMW7cOFhaWsLIyAhdunTBq1ev8oz9wYMHqF+/vlJBHACsrKyUYpU/Uzw6OlppKvkPvz507tw5tGvXDmZmZjA0NIS7uzvOnDmj0CcxMRFjx46Fg4MD9PX1YWVlhTZt2uDSpUtin+TkZNy+fRuxsbF5HpfcDz/8AJlMhgkTJuR7nZwkJCRAKpXC3Nxcob1ChQowMDD46O0TERERERER5YYjxYmIiIiIiOiTiY+PR2xsLARBQExMDFauXIl3796hX79+Cv2WL1+Ojh07om/fvkhLS0NERAR69OiBAwcOwMfHBwAQGhqKwYMHo0GDBhgyZAgAwMnJCQDw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5MVCtlff/01LCwsMGPGDERHR2PZsmUYNWoUtm7dmuux2Nvb47fffsO///6LihUr5vscWFpaIjQ0VKEtPT0d33zzjUJcx48fh7e3N9zc3DBjxgxoaWkhODgYLVu2xKlTp9CgQQMAwLBhw7Bjxw6MGjUKrq6ueP36NU6fPo1bt26hbt26AIDz58/D09MTM2bMEIvzuXn8+DHmz5+Pn3/+uUiK1h4eHti6dSuGDh2KcePGidOn79q1CwsXLvzo7RMRERERERHlhkVxIiIiIiIi+mRat26t8FpfXx8///wz2rRpo9B+9+5dhWLsqFGjULduXSxZskQsivfr1w/Dhg1D5cqVlYrqkydPxosXL3Du3DmF6dpnzZoFQRAU+pYtWxbHjh0TR2nLZDKsWLEC8fHxMDMzy/FYJk2ahEGDBsHJyQlNmzZFs2bN0LZtWzRp0gRaWjlPzGZkZKQU78iRI/Hu3TtERkYCAARBwLBhw+Dp6YnDhw+LsQ0dOhTVqlXD1KlTcezYMQDAwYMH8dVXX2Hx4sXi9iZOnJjj/vNj/PjxqFOnDnr16vVR25H76quvcOPGDaxfvx4bN24EAGhra2PVqlUYNmxYkeyDiIiIiIiIKCcsihMREREREdEns3r1anzxxRcAgJcvX2LLli0YPHgwTExM0LVrV7HfhwXxt2/fIjMzE82bN8cvv/yS5z5kMhn27NkDX19fhYK4XPYpyocMGaLQ1rx5cyxduhSPHj1CzZo1c9zPl19+CVtbWyxZsgRRUVGIiopCUFAQKleujNDQUDRp0iTPWAFg8+bNWLNmDRYvXgxPT08AwJUrV3Dv3j1MnToVr1+/VujfqlUrhIaGQiaTQUtLC+bm5jh37hyePXsGGxsblfvw8PBQuhkgJ1FRUdi5cyfOnTuXr/75oa2tDScnJ3h5eaFHjx6QSqX45Zdf8PXXX8Pa2hqdO3cusn0RERERERERZceiOBEREREREX0yDRo0UChU9+7dG3Xq1MGoUaPQoUMHcfrwAwcOYPbs2bhy5QpSU1PF/tkL2qq8evUKCQkJqF69er5iqlSpksJrCwsLAFnF+Lx4eXnBy8sLycnJ+Ouvv7B161asW7cOHTp0wO3bt5WeLZ7dlStXMGzYMPTu3Rvjxo0T2+/duwcA8Pf3z3Hd+Ph4WFhY4IcffoC/vz/s7Ozg5uaG9u3bY8CAAahcuXKe8WeXkZGB0aNHo3///qhfv36B18/J/PnzsXz5cty7dw/GxsYAAD8/P3h6emLkyJHo0KGD0nPliYiIiIiIiIpKzvO5ERERERERERUzLS0teHp64vnz52Ih+NSpU+jYsSOkUinWrFmDQ4cOITIyEn369Mn3aOeC0NbWVtlekH0ZGhqiefPmWLVqFaZOnYq3b9/i8OHDua7z9u1bdOvWDV988YU4pbicTCYDACxcuBCRkZEqvz4sLv/zzz9YuXIlbGxssHDhQlSrVi3P/auyefNm3LlzB0OHDkV0dLT4BQCJiYmIjo5GcnJygbe7Zs0atGzZUoxZrmPHjnj27Jm4DyIiIiIiIqLiwNuwiYiIiIiISK0yMjIAAO/evQMA7Ny5E1KpFEePHoW+vr7YLzg4WGldVSPHLS0tYWpqiuvXrxdTxLmTj4R//vx5jn1kMhn69u2LuLg4/PrrrzA0NFRY7uTkBAAwNTVVeg67KhUqVMCIESMwYsQIxMTEoG7dupgzZw68vb0LFPvjx4+Rnp6Opk2bKi3bvHkzNm/ejN27dxd4uvOXL18iMzNTqT09PR3Af58BIiIiIiIiouLAkeJERERERESkNunp6Th27Bj09PTg4uICIGvktkQiUSiiRkdHY8+ePUrrGxkZIS4uTqFNS0sLnTt3xv79+3Hx4kWldYpqtPlvv/2msv3QoUMAgKpVq+a47syZM3H06FH88ssvcHR0VFru5uYGJycnLFq0SLxZ4EOvXr0CAGRmZiI+Pl5hmZWVFWxsbBSmnU9OTsbt27cRGxub6zH16tULu3fvVvoCgPbt22P37t1o2LBhrttQ5YsvvkBkZKTC89EzMzOxbds2mJiYiDcBEBERERERERUHjhQnIiIiIiKiT+bw4cO4ffs2ACAmJgbh4eG4d+8evvvuO5iamgIAfHx8sGTJErRr1w59+vRBTEwMVq9ejSpVquDatWsK23Nzc8Ovv/6KJUuWwMbGBo6OjmjYsCHmzp2LY8eOwd3dHUOGDIGLiwueP3+O7du34/Tp0zA3N//oY+nUqRMcHR3h6+sLJycnJCUl4ddff8X+/ftRv359+Pr6qlzv77//RlBQEFq0aIGYmBhs2bJFYXm/fv2gpaWFjRs3wtvbG9WqVcPAgQNha2uLp0+fIioqCqampti/fz8SExNRsWJFdO/eHbVq1YKxsTF+/fVXXLhwAYsXLxa3ef78eXh6emLGjBkIDAzM8Zj+97//4X//+5/KZY6OjkojxD08PPD777/neaPBd999h379+qFhw4YYMmQIDAwM8Msvv+Cvv/7C7Nmzoaurm+v6RERERERERB+DRXEiIiIiIiL6ZKZPny7+LJVK8b///Q9r167F0KFDxfaWLVvip59+wvz58zF27Fg4OjpiwYIFiI6OViqKL1myBEOGDMHUqVPx/v17+Pv7o2HDhrC1tcW5c+cwbdo0hIWFISEhAba2tvD29laaqrywNm7ciL1792Lbtm149uwZBEFA5cqV8f3332PSpEnQ0VH9K/fr168hCAJ+//13/P7770rL+/XrByCr4PzHH38gKCgIq1atwrt372BtbY2GDRuK58vQ0BAjRozAsWPHsGvXLshkMlSpUgVr1qzB8OHDi+Q4cyOPKS99+/ZFuXLlMG/ePCxcuBAJCQmoWrUq1q1bp/DeExERERERERUHiVBU88YRERERERER0WcjMTERZcqUwbJlyzBy5Eh1h0NERERERESUIz5TnIiIiIiIiIgK7OTJk7C1tcVXX32l7lCIiIiIiIiIcsWR4kREREREREREREREREREVGpxpDgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosihMRERERERERERERERERUanFojgREREREREREREREREREZVaLIoTEREREREREREREREREVGpxaI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcWiOBERERERERERERERERERlVosilOpFxgYCIlEUqz7cHBwQEBAQLHuQxMtXLgQlStXhra2NmrXrg0AyMjIwMSJE2FnZwctLS107twZACCRSBAYGCiuGxISAolEgujo6E8eNxEVL16XiYgoO+YGIiLKjrmBiIg+xLxARB+LRXEqMdasWQOJRIKGDRuqO5RiI5FIFL5MTU3h7u6OgwcPFnqb4eHhWLZsWdEFWUSOHTuGiRMnomnTpggODsbcuXMBAD///DMWLlyI7t27Y9OmTfjmm2/UHCkR5YTX5cL51NflY8eOYdCgQahevTq0tbXh4ODwyfatCW7evInAwMCPvslq/PjxcHV1LZqgiDQYc0PhfMrckJycjNWrV6Nt27aoUKECTExMUKdOHaxduxaZmZmfJIaSjrmBqGgxNxSOOv+eExcXBysrK0gkEuzYsUMtMZQ0Z8+eRWBgIOLi4j5qO926dUP79u2LJigiDcW8UDifOi/IZDKsW7cOtWvXhrGxMcqXLw9vb2+cPXv2k8VQkjEvlE4silOJERYWBgcHB5w/fx73798vsu1OnToV79+/L7Ltfaw2bdogNDQUmzdvxsSJE3H//n34+vri6NGjhdpeSS2KHz9+HFpaWvjpp58wYMAA8cJ//Phx2NraYunSpejfvz/c3d1Vrt+/f3+8f/8e9vb2nzJsIvoAr8uacV0ODw9HeHg4zMzMYGNj88n2qylu3ryJmTNnfnTh4+DBg/Dx8SmaoIg0GHNDyc8N//zzD77++msIgoBx48Zh0aJFcHR0xIgRI/Dll19+khhKOuYGoqLF3FDyc0N206dPR3Jyslr2XVKdPXsWM2fO/KjiR3p6OiIjI5kb6LPHvKAZeeHbb7/F8OHDUaNGDSxZsgTjx4/H3bt34e7ujvPnz3+yOEoq5oXSiUVxKhEePnyIs2fPYsmSJbC0tERYWFi+1svIyEBaWprKZUlJSQAAHR0dSKXSIov1Y33xxRfo168f+vfvj6lTp+LXX3+FIAhYvny5ukMrUjExMTAwMICenp5Su7m5eZ7ra2trQyqVFvuUOESkGq/LmnNdnjt3LhISEnDmzBnUqlXro7eX23v4ufrnn39w584d/hJDnz3mBs3IDdbW1vj7778RGRmJb7/9FkOHDsWuXbswcOBAbN68uVB/mGRuUMbcQJSFuUEzcsOHrl+/jrVr12LSpEkftZ2UlBTIZLIiiqp0OHXqFBITE5kb6LPGvKAZeSEjIwNr165F9+7dERoaiiFDhmDixIn49ddfkZGRke/37UPMC8qYF0oeFsWpRAgLC4OFhQV8fHzQvXt3lRfd6OhoSCQSLFq0CMuWLYOTkxP09fXFqe8kEglu3ryJPn36wMLCAs2aNQOg/KyR6tWrw9PTU2n7MpkMtra26N69u9i2aNEiNGnSBGXLloWBgQHc3NyKfFopFxcXlCtXDg8ePFBo37t3L3x8fGBjYwN9fX04OTkhKChIYcpDDw8PHDx4EI8ePRKnavlw2tzU1FTMmDEDVapUgb6+Puzs7DBx4kSkpqYq7Cs2Nha3b9/O113CGRkZCAoKEs+/g4MDpkyZorBNiUSC4OBgJCUliXHJnxEeFRWFGzduiO0nTpxQuR9VzxR3cHBAhw4dcPr0aTRo0ABSqRSVK1fG5s2bldaPi4vD2LFjYWdnB319fVSpUgULFixgYibKJ16XNee6bGNjA11d3UIda27vIQDcvn0b3bt3R5kyZSCVSlGvXj3s27dPaTs3btxAy5YtYWBggIoVK2L27Nn4+eefla7jEokEgYGBSuuremZXfq/jERERcHNzg4mJCUxNTVGjRg3xF9CQkBD06NEDAODp6amUey5evAgvLy+UK1cOBgYGcHR0VDmK8uDBgzAzMxM/w4mJiRg7diwcHBygr68PKysrtGnTBpcuXVJY79y5c2jXrh3MzMxgaGgId3d3nDlzRmn7T58+xaBBg8TPlqOjI4YPH84CFJU4zA2akRvKlSuHatWqKbV36dIFAHDr1q1c12duYG4gKgjmBs3IDR8aM2YMunTpgubNm+d7nRMnTkAikSAiIgJTp06Fra0tDA0NkZCQACD/17bTp0+jfv36kEqlcHJywvr165XeZ/nnJSQkRGl9VTnj6dOn+PLLL1G+fHno6+ujWrVq+Pnnn5XWXblyJapVqwZDQ0NYWFigXr16CA8PB5D1Wfv2228BAI6OjuJ7Is9XkZGRaNasGczNzWFsbIyqVatiypQpSvs4ePAgXF1dxffyxYsXGDhwICpWrAh9fX1UqFABnTp1Upqp5PDhw2jevDmMjIxgYmICHx8f3LhxQ2n7t2/fhp+fHywtLWFgYICqVavi+++/V+pHpE7MC5qRF9LT0/H+/XuUL19eod3KygpaWlowMDDIdX3mBeYFTaWj7gCIgKxk2bVrV+jp6aF3795Yu3YtLly4gPr16yv1DQ4ORkpKCoYMGQJ9fX2UKVNGXNajRw84Oztj7ty5EARB5b569uyJwMBAvHjxAtbW1mL76dOn8ezZM/Tq1UtsW758OTp27Ii+ffsiLS0NERER6NGjBw4cOFBkd/fEx8fj7du3cHJyUmgPCQmBsbExxo0bB2NjYxw/fhzTp09HQkICFi5cCAD4/vvvER8fj3///RdLly4FABgbGwPISv4dO3bE6dOnMWTIELi4uODvv//G0qVLcffuXezZs0fc16pVqzBz5kxERUXBw8Mj13gHDx6MTZs2oXv37hg/fjzOnTuHefPm4datW9i9ezcAIDQ0FD/++CPOnz+PjRs3AgDq1KmD0NBQzJkzB+/evcO8efMAZP1noSDu37+P7t27Y9CgQfD398fPP/+MgIAAuLm5iX/8S05Ohru7O54+fYqhQ4eiUqVKOHv2LCZPnoznz5+XyOnmiUoaXpc157pcFFS9hzdu3EDTpk1ha2uL7777DkZGRti2bRs6d+6MnTt3isWVFy9ewNPTExkZGWK/H3/8Mc9foHKT3+t4ZGQkevfujVatWmHBggUAsoo9Z86cwZgxY9CiRQuMHj0aK1aswJQpU8Sc4+LigpiYGLRt2xaWlpb47rvvYG5ujujoaOzatUspnkOHDqFNmzbQ0cn6r/OwYcOwY8cOjBo1Cq6urnj9+jVOnz6NW7duoW7dugCyHhfi7e0NNzc3zJgxA1paWggODkbLli1x6tQpNGjQAADw7NkzNGjQAHFxcRgyZAj+97//4enTp9ixYweSk5OVZlwhUifmBs3ODS9evACQVTTPD+YG5gai/GBu0KzcsH37dpw9exa3bt0q1CMkgoKCoKenhwkTJiA1NRV6enr5vrb9/fff4jU2MDAQGRkZmDFjhlJBpiBevnyJRo0aQSKRYNSoUbC0tMThw4cxaNAgJCQkYOzYsQCADRs2YPTo0ejevTvGjBmDlJQUXLt2DefOnUOfPn3QtWtX3L17F7/88guWLl0q5kpLS0vcuHEDHTp0QM2aNTFr1izo6+vj/v37Kos7hw4dQocOHcTX3bp1w40bN/D111/DwcEBMTExiIyMxOPHj8UCSWhoKPz9/eHl5YUFCxYgOTkZa9euRbNmzXD58mWx37Vr19C8eXPo6upiyJAhcHBwwIMHD7B//37MmTOn0OeQqKgxL2hGXjAwMEDDhg0REhKCxo0bo3nz5oiLi0NQUBAsLCwwZMiQfB0z8wLzgsYRiNTs4sWLAgAhMjJSEARBkMlkQsWKFYUxY8Yo9Hv48KEAQDA1NRViYmIUls2YMUMAIPTu3Vtp+/Jlcnfu3BEACCtXrlToN2LECMHY2FhITk4W2z78WRAEIS0tTahevbrQsmVLhXZ7e3vB398/z2MFIAwaNEh49eqVEBMTI1y8eFFo166dAEBYuHChQt/s+xYEQRg6dKhgaGgopKSkiG0+Pj6Cvb29Ut/Q0FBBS0tLOHXqlEL7unXrBADCmTNnxDb5OYqKiso1/itXrggAhMGDByu0T5gwQQAgHD9+XGzz9/cXjIyMlLbh7u4uVKtWTakdgDBjxgzxdXBwsABAePjwodhmb28vABBOnjwptsXExAj6+vrC+PHjxbagoCDByMhIuHv3rsI+vvvuO0FbW1t4/PhxrsdJ9LnjdVlzrsvZ5bTvnOT2HrZq1UqoUaOGwrHJZDKhSZMmgrOzs9g2duxYAYBw7tw5sS0mJkYwMzNTuo5nv9bLZX+/8nsdHzNmjGBqaipkZGTkeIzbt29XeS53794tABAuXLiQ47qCIAhJSUmCVCoVgoODxTYzMzNh5MiROa4jk8kEZ2dnwcvLS5DJZGJ7cnKy4OjoKLRp00ZsGzBggKClpaUyjg/XJVI35gbNzQ2CIAipqamCq6ur4OjoKKSnp+fal7mBuYEov5gbNCs3JCcnC5UqVRImT54sCIIgREVFCQCE7du357muvG/lypUVjq8g17bOnTsLUqlUePTokdh28+ZNQVtbW+F9ln9ePrzGymXPGYMGDRIqVKggxMbGKvTr1auXYGZmJsbaqVMnlX+L+tDChQuVcpQgCMLSpUsFAMKrV69yXf+ff/5ReC/evn2r8vPxocTERMHc3Fz46quvFNpfvHghmJmZKbS3aNFCMDExUTh/gsC8QCUL84Jm5YV79+4JdevWFQCIX5UrVxZu376d57rMC8wLmorTp5PahYWFoXz58uJUJxKJBD179kRERITCFCJy3bp1g6WlpcptDRs2LM/9ffHFF6hduza2bt0qtmVmZmLHjh3w9fVVGL3w4c9v375FfHw8mjdvrjQFXkH89NNPsLS0hJWVFerVq4fffvsNEydOxLhx4xT6fbjvxMRExMbGonnz5khOTsbt27fz3M/27dvh4uKC//3vf4iNjRW/WrZsCQCIiooS+wYGBkIQhDzvKj506BAAKMU6fvx4AFnTgRQ3V1dXhSm+LC0tUbVqVfzzzz9i2/bt29G8eXNYWFgoHHvr1q2RmZmJkydPFnucRJqM12XNuS4Xlezv4Zs3b3D8+HH4+fmJxxobG4vXr1/Dy8sL9+7dw9OnTwFk5YZGjRqJd/oCWdfmvn37Fjqe/F7Hzc3NkZSUhMjIyALvw9zcHABw4MABpKen59jv+PHjSE1Nhbe3t8K6586dw7Nnz1Suc+XKFdy7dw99+vTB69evxfiTkpLQqlUrnDx5EjKZDDKZDHv27IGvry/q1auntJ0PpwsjUjfmBs3ODaNGjcLNmzexatUqcWRzXpgbmBuI8sLcoFm5Yf78+UhPT1c5vWt++fv7Kxxffq9tmZmZOHr0KDp37oxKlSqJ67u4uMDLy6tQsQiCgJ07d8LX1xeCICicKy8vL8THx4vvt7m5Of79919cuHChwPuR54a9e/fm+ki+7I/VMDAwgJ6eHk6cOIG3b9+qXCcyMhJxcXHo3bu3Qvza2tpo2LCh+F6/evUKJ0+exJdffqlw/gDmBSpZmBc0Ky+YmJigWrVqGDlyJHbt2oU1a9YgIyMDnTt3RmxsbL7OAfMC84Km4fTppFaZmZmIiIiAp6cnHj58KLY3bNgQixcvxm+//Ya2bdsqrOPo6Jjj9nJb9qGePXtiypQpePr0KWxtbXHixAnExMSgZ8+eCv0OHDiA2bNn48qVK0rPzC6sTp06YdSoUUhLS8OFCxcwd+5cJCcnQ0tL8R6VGzduYOrUqTh+/Lj4LA65+Pj4PPdz79493Lp1K8f/WMTExBQ49kePHkFLSwtVqlRRaLe2toa5uTkePXpU4G0WVPaLPABYWFgoJJJ79+7h2rVrRXrsRJ8LXpc167pcVLK/T/fv34cgCJg2bRqmTZumcp2YmBjY2tri0aNHaNiwodLyqlWrFjqe/F7HR4wYgW3btsHb2xu2trZo27Yt/Pz80K5duzz34e7ujm7dumHmzJlYunQpPDw80LlzZ/Tp0wf6+vpiv4MHD6JevXoK03f98MMP8Pf3h52dHdzc3NC+fXsMGDAAlStXFuMHsn45zEl8fDzS0tKQkJCA6tWr531SiNSIuUGzc8PChQuxYcMGBAUFoX379vlej7mBuYEoN8wNmpUboqOjsXDhQqxevVqcjrcwsr9P+b22paam4v3793B2dlZaXrVqVXEQRkG8evUKcXFx+PHHH/Hjjz+q7CM/V5MmTcKvv/6KBg0aoEqVKmjbti369OmDpk2b5rmfnj17YuPGjRg8eDC+++47tGrVCl27dkX37t0V3vuDBw+ibdu24s1n+vr6WLBgAcaPH4/y5cujUaNG6NChAwYMGCBO8yw/f/LCVnampqYAIA4EYW6gkox5QbPyQkZGBlq3bg0PDw+sXLlSbG/dujWqVauGhQsXio8iyg3zAvOCpmFRnNTq+PHjeP78OSIiIhAREaG0PCwsTClZ5vYcuvw+o65nz56YPHkytm/fjrFjx2Lbtm0wMzNT+EPJqVOn0LFjR7Ro0QJr1qxBhQoVoKuri+DgYISHh+fzCJVVrFgRrVu3BgC0b98e5cqVw6hRo+Dp6YmuXbsCAOLi4uDu7g5TU1PMmjULTk5OkEqluHTpEiZNmpTrHUhyMpkMNWrUwJIlS1Qut7OzK/QxqPNuI21tbZXtwgfPlpHJZGjTpg0mTpyosu8XX3xRLLERlQa8LmvmdfljZX+f5MczYcKEHO/QzX6D1MfIfsd4fq/jVlZWuHLlCo4ePYrDhw/j8OHDCA4OxoABA7Bp06Zc9ymRSLBjxw78+eef2L9/P44ePYovv/wSixcvxp9//in+sfDQoUMYOHCgwrp+fn5o3rw5du/ejWPHjom/LO7atQve3t7i+Vu4cCFq166tcv/GxsZ48+ZNnueGqCRgbtDc3BASEoJJkyZh2LBhmDp1aoHWZW5gbiDKDXODZuWG6dOnw9bWFh4eHuKzxF+8eAEgq4gQHR2NSpUqKRVyssspN+R1bfuwAJWXnP7mpCovAEC/fv1yLL7UrFkTQNbIwzt37uDAgQM4cuQIdu7ciTVr1mD69OmYOXNmrvEYGBjg5MmTiIqKwsGDB3HkyBFs3boVLVu2xLFjx6CtrY3k5GScOHECa9euVVh37Nix8PX1xZ49e3D06FFMmzYN8+bNw/Hjx1GnTh3xGEJDQxWehyyX39ldiEoC5gXNygsnT57E9evXlbbp7OwMFxcXlc/HVoV5gXlB0/AMklqFhYXBysoKq1evVlq2a9cu7N69G+vWrct3EswvR0dHNGjQAFu3bsWoUaOwa9cudO7cWeHu/507d0IqleLo0aMK7cHBwUUay9ChQ7F06VJMnToVXbp0gUQiwYkTJ/D69Wvs2rULLVq0EPt+eJedXE5JwcnJCVevXkWrVq2KrIhtb28PmUyGe/fuwcXFRWx/+fIl4uLiYG9vXyT7+VhOTk549+6d+J8SIso/Xpc167pcXOSj2nR1dfO8ltrb24t3sn7ozp07Sm0WFhaIi4tTaEtLS8Pz588V2gpyHdfT04Ovry98fX0hk8kwYsQIrF+/HtOmTUOVKlXyPNeNGjVCo0aNMGfOHISHh6Nv376IiIjA4MGDcf36dTx+/Bg+Pj5K61WoUAEjRozAiBEjEBMTg7p162LOnDnw9vaGk5MTgKw7eHM7BktLS5iamuL69et5HieROjE3aGZu2Lt3LwYPHoyuXbuqfO8KirmBuYHoQ8wNmpUbHj9+jPv374vX8g+NGDECQNZ0wvIpYfOrINc2AwODfOUGCwsLAFDKDdlnJ7S0tISJiQkyMzPzlRuMjIzQs2dP9OzZE2lpaejatSvmzJmDyZMnQyqV5nqutbS00KpVK7Rq1QpLlizB3Llz8f333yMqKgqtW7dW+VgNOScnJ4wfPx7jx4/HvXv3ULt2bSxevBhbtmwRz5+VlVWuxyB/35gbqCRjXtCsvPDy5UsAyoVlAEhPT0dGRkahtsu8wLxQ0vGZ4qQ279+/x65du9ChQwd0795d6WvUqFFITEzEvn37imX/PXv2xJ9//omff/4ZsbGxSlOqaGtrQyKRKCSG6Oho7Nmzp0jj0NHRwfjx43Hr1i3s3btX3DegOPo5LS0Na9asUVrfyMhI5TQrfn5+ePr0KTZs2KC07P3790hKShJfx8bG4vbt20hOTs41Vvl0i8uWLVNol99RpuoPQ+rg5+eHP/74A0ePHlVaFhcXV+ikTlTa8bqcRZOuy8XFysoKHh4eWL9+vVJRAsgaTSLXvn17/Pnnnzh//rzC8rCwMKX1nJycxGe+yv34449Kv4Tl9zr++vVrhWVaWlriXb/yu46NjIzE9T709u1bhfcTgHgXs3zdQ4cOoXz58grPdM3MzFR6f62srGBjYyOu5+bmBicnJyxatAjv3r1TOgb5+dPS0kLnzp2xf/9+XLx4Ualf9viI1IG5IYum5YaTJ0+iV69eaNGiBcLCwvIc+ZcfzA3MDURyzA1ZNCk3zJ49G7t371b4CgoKAgBMnDgRu3fvFq+NBZHfa5u2tja8vLywZ88ePH78WFx+69Ytpeu6qakpypUrp5Qbsp9DbW1tdOvWDTt37lRZFPgwL2XPDXp6enB1dYUgCEhPTweQc25QNYOHqtyQ/bEaycnJSElJUVjPyckJJiYm4npeXl4wNTXF3LlzxThUHYOlpSVatGiBn3/+WeH8AcwLVDIwL2TRpLwgn2Up+6j+S5cu4c6dO6hTp06u6+eEeYF5oaTjSHFSm3379iExMREdO3ZUubxRo0awtLREWFiYUiIrCn5+fpgwYQImTJiAMmXKKN154+PjgyVLlqBdu3bo06cPYmJisHr1alSpUgXXrl0r0lgCAgIwffp0LFiwAJ07d0aTJk1gYWEBf39/jB49GhKJBKGhoSovaG5ubti6dSvGjRuH+vXrw9jYGL6+vujfvz+2bduGYcOGISoqCk2bNkVmZiZu376Nbdu24ejRo+IfclatWoWZM2ciKioKHh4eOcZZq1Yt+Pv748cffxSnfjl//jw2bdqEzp07w9PTs0jPS2F9++232LdvHzp06ICAgAC4ubkhKSkJf//9N3bs2IHo6GiUK1dO3WESlTi8Lv9HU67LAHDt2jXxF8v79+8jPj4es2fPBpB13fb19S3UOVi9ejWaNWuGGjVq4KuvvkLlypXx8uVL/PHHH/j3339x9epVAFl/RAsNDUW7du0wZswYGBkZ4ccff4S9vb3S+zJ48GAMGzYM3bp1Q5s2bXD16lUcPXpU6Zqc3+v44MGD8ebNG7Rs2RIVK1bEo0ePsHLlStSuXVuc0aR27drQ1tbGggULEB8fD319fbRs2RLh4eFYs2YNunTpAicnJyQmJmLDhg0wNTUVbwI7ePAgvL29Fe4OTkxMRMWKFdG9e3fUqlULxsbG+PXXX3HhwgUsXrwYQFZBY+PGjfD29ka1atUwcOBA2Nra4unTp4iKioKpqSn2798PAJg7dy6OHTsGd3d3DBkyBC4uLnj+/Dm2b9+O06dPF3i0DlFRY274j6bkhkePHqFjx46QSCTo3r07tm/frrC8Zs2aYpG4oJgbmBuIAOaGD2lKbmjWrJlSm/xaUr9+fXTu3LlQx1+Qa9vMmTNx5MgRNG/eHCNGjEBGRgZWrlyJatWqqcwN8+fPx+DBg1GvXj2cPHkSd+/eVdr//PnzERUVhYYNG+Krr76Cq6sr3rx5g0uXLuHXX38VCxdt27aFtbU1mjZtivLly+PWrVtYtWoVfHx8YGJiAiDr/QCA77//Hr169YKuri58fX0xa9YsnDx5Ej4+PrC3t0dMTAzWrFmDihUriudV1WM17t69i1atWsHPzw+urq7Q0dHB7t278fLlS/Tq1QtAVqFn7dq16N+/P+rWrYtevXrB0tISjx8/xsGDB9G0aVOsWrUKALBixQo0a9YMdevWxZAhQ+Do6Ijo6GgcPHgQV65cKdT7R1RUmBf+oyl5wc3NDW3atMGmTZuQkJCAtm3b4vnz51i5ciUMDAwwduzYQh0/8wLzQoknEKmJr6+vIJVKhaSkpBz7BAQECLq6ukJsbKzw8OFDAYCwcOFCpX4zZswQAAivXr3KcZkqTZs2FQAIgwcPVrn8p59+EpydnQV9fX3hf//7nxAcHKxye/b29oK/v38uR5sFgDBy5EiVywIDAwUAQlRUlCAIgnDmzBmhUaNGgoGBgWBjYyNMnDhROHr0qEIfQRCEd+/eCX369BHMzc0FAIK9vb24LC0tTViwYIFQrVo1QV9fX7CwsBDc3NyEmTNnCvHx8Urn6MPt5iQ9PV2YOXOm4OjoKOjq6gp2dnbC5MmThZSUFIV+/v7+gpGRkdL67u7uQrVq1VSemxkzZoivg4ODBQDCw4cPxTZ7e3vBx8dH5Tbd3d0V2hITE4XJkycLVapUEfT09IRy5coJTZo0ERYtWiSkpaXleZxEnyNelxVpynVZfr1U9ZXXOcjtPRQEQXjw4IEwYMAAwdraWtDV1RVsbW2FDh06CDt27FDod+3aNcHd3V2QSqWCra2tEBQUJPz0009K1/HMzExh0qRJQrly5QRDQ0PBy8tLuH//vsr3Kz/X8R07dght27YVrKysBD09PaFSpUrC0KFDhefPnytsa8OGDULlypUFbW1t8bxeunRJ6N27t1CpUiVBX19fsLKyEjp06CBcvHhREARBiIuLE3R0dIRt27YpbCs1NVX49ttvhVq1agkmJiaCkZGRUKtWLWHNmjVK5+/y5ctC165dhbJlywr6+vqCvb294OfnJ/z2228K/R49eiQMGDBAsLS0FPT19YXKlSsLI0eOFFJTU3N+84g+EeYGRZqQG6KionLMC9n/z60KcwNzA1FemBsUaUJuUEWeL7Zv3/7RffN7bfv9998FNzc3QU9PT6hcubKwbt06le9LcnKyMGjQIMHMzEwwMTER/Pz8hJiYGJV57OXLl8LIkSMFOzs7QVdXV7C2thZatWol/Pjjj2Kf9evXCy1atBDjc3JyEr799luFcykIghAUFCTY2toKWlpaYr767bffhE6dOgk2NjaCnp6eYGNjI/Tu3Vu4e/euIAiCcP36dQGAcP78eYVtxcbGCiNHjhT+97//CUZGRoKZmZnQsGFDpRwiP79eXl6CmZmZIJVKBScnJyEgIEDMP3LXr18XunTpIpibmwtSqVSoWrWqMG3aNJXvCdGnxLygSFPyQnJysjBr1izB1dVVMDAwEMzMzIQOHToIly9fznNd5gXmBU0lEQSOpSciIiKiohUSEoKBAwfi4cOHcHBwUHc4BbZt2zb07dsXsbGxMDMzU3c4RESlAnMDERFlFxgYiJkzZ2rsdK8//PADlixZgufPnxfZs36JiD5nzAtUnPhMcSIiIiKibMzNzbFixQoWPYiISMTcQERE2Tk4OGDp0qUsfBAREQDmhZKOzxQnIiIiIsqmbdu26g6BiIhKGOYGIiLKzs/PT90hEBFRCcK8ULJxpDgREREREREREREREREREZVafKY4ERERERERERERERERERGVWhwpTkREREREREREREREREREpRaL4kREREREREREREREREREVGrpqDuAkkAmk+HZs2cwMTGBRCJRdzhERJ8FQRCQmJgIGxsbaGmVrHu0mBeIiNSDuYGIiD5UkvMCwNxARKQOzA1ERJRdfnMDi+IAnj17Bjs7O3WHQUT0WXry5AkqVqyo7jAUMC8QEakXcwMREX2oJOYFgLmBiEidmBuIiCi7vHIDi+IATExMAGSdLFNTUzVHk7f09HQcO3YMbdu2ha6urrrDKRBNjV1T4wYYuzpoatzAp409ISEBdnZ24jW4JJHH9PDhQ/zxxx8a915q6mdQU+MGNDd2xv1pMe68aUJu0JTfGeT4ufu0NDVuQHNjZ9yf1qeOuyTnBeC/3LBx40Z07txZo95LQHM/hwBjVxdNjV1T4wYYuyqakhs06fcGTf6cAZodvybHDjB+dWP8/8lvbmBRHBCnMTE1NdWIRJWeng5DQ0OYmppq3AddU2PX1LgBxq4Omho3oJ7YS+JUUvKYTExMNPK91NTPoKbGDWhu7Iz702Lc+VeSc4Om/M4gx8/dp6WpcQOaGzvj/rTUFXdJzAvAf3Fp4nsJaO7nEGDs6qKpsWtq3ABjz01Jzw2a9HuDJn/OAM2OX5NjBxi/ujF+ZXnlhpL30A0iIiIiIiIiIiIiIiIiIqIiwqI4ERERERERERERERERERGVWiyKExERERERERERERERERFRqcVnin/GMjMzkZ6e/kn3mZ6eDh0dHaSkpCAzM/OT7vtjaGrcAGNXB02NGyja2HV1daGtrV1EkRERERERERERERERERUOi+KfIUEQ8OLFC8TFxall39bW1njy5EmeD7wvSTQ1boCxq4Omxg0Ufezm5uawtrbWuPNARERERERERERERESlB4vinyF5QdzKygqGhoaftFglk8nw7t07GBsbQ0tLc2bv19S4AcauDpoaN1B0sQuCgOTkZMTExAAAKlSoUFQhEhERERERERERERERFQiL4sVNlgk8Ogu8ewkYlwfsmwBa6ptOODMzUyyIly1b9pPvXyaTIS0tDVKpVKOKhZoaN8DY1UFT4waKNnYDAwMAQExMDKysrDiVOhERERERERER0ccoYfUGIiJNwqJ4cbq5DzgyCUh49l+bqQ3QbgHg2lEtIcmfIW5oaKiW/RPR50V+rUlPT2dRnIiIKD/4Ry4iIiIiIlKlBNYbiIg0CYvixeXmPmDbAACCYnvC86x2v81qTVR8vi8RfQq81hARERUA/8hFRER54c1TRESfpxJebyAi0gSaNa+vppBlZv0xK3uCAv5rO/JdVj8iDXbixAlIJBLExcXlex0HBwcsW7as2GIiIiIi0kjyP3J9WBAH/vsj18196omLiIhKjpv7gGXVgU0dgJ2Dsr4vq84cQURU2rHeQERUJFgULw6Pzir/MUuBACQ8zepHVIwCAgKgra2Nb775RmnZyJEjIZFIEBAQ8OkDIyIiIqL/8I9cRESUF948RUT0+WK9gYioSLAoXhzevSzafkQfwc7ODrt27cL79+/FtpSUFISHh6NSpUpqjIyIAGQVOB6eAv7ekfWdBQ8ios8P/8hFRES54c1TRESfN9YbiIiKBIvixcG4fNH2I/oIderUga2tLXbt2iW27dq1C5UqVUKdOnXEttTUVIwePRpWVlaQSqVo1qwZLly4oLCtQ4cO4YsvvoCBgQE8PT0RHR2ttL/Tp0+jefPmMDAwgJ2dHUaPHo2kpKRiOz4ijcbpD4mICOAfuYiIKHe8eYqI6PPGegMRUZFgUbw42DcBTG0ASHLoIAFMbbP6EX0C/fr1w6ZNm8TXP//8MwYOHKjQZ+LEidi5cyc2bdqES5cuoUqVKvDy8sKbN28AAE+ePEHXrl3h6+uLK1euYPDgwfjuu+8UtvHgwQO0a9cO3bp1w7Vr17B161acPn0ao0aNKv6DJNI0nP6QiIjk+EcuIiLKDW+eIiL6vLHeQERUJFgULw5a2kC7Bf//Inui+v/X7eZn9SP6BPz8/HD69Gk8evQIjx49wpkzZ9CvXz9xeVJSEtauXYuFCxfC29sbrq6u2LBhAwwMDPDTTz8BANauXQsnJycsXrwYVatWRd++fZWeRz5v3jz07dsXY8eOhbOzM5o0aYIVK1Zg8+bNSElJ+ZSHTFSycfpDIiL6EP/IRUREueHNU0REnzeFekN2rDcQEeUXi+LFxbUj4LcZMLFWbDe1yWp37aieuOizVK5cObRv3x4hISEIDg6Gj48PypUrJy5/8OAB0tPT0bRpU7FNV1cXDRo0wK1btwAAt27dQsOGDRW227hxY4XXV69eRUhICIyNjcUvLy8vyGQyPHz4sBiPkEjDcPpDIiL6EP/IRUREueHNU0REJK836BoqtrPeQESUbzrqDqBUc+0I2NYDlrpkvfbfD9g35R+zSC0GDhyI0aNHAwBWr15dLPt49+4dhg4dKu7nQ5UqVSqWfRJpJE5/SERE2cn/yLV7CJD+/r92U5usgjj/yEVE9PmS3zy1bYCKhbx5iojos+HaEbi+E7i5B6jZG6jTN+uGKF7/iYjyhSPFi1vG//9BS88EcGzBBEVq065dO6SlpSE9PR1eXl4Ky5ycnKCnp4czZ86Ibenp6bhw4QJcXV0BAC4uLjh//rzCen/++afC67p16+LmzZuoUqWK0peenl4xHRmRBuL0h0REpIprR8DBPevnOv0B/wPA2L9ZECciov9untLO9rs1RwgSEX1e0pOzvjs2Axybs95ARFQALIoXt7R3Wd/1jNQbB332tLW1cevWLdy8eRPa2or/WTIyMsLw4cPx7bff4siRI7h58ya++uorJCcnY9CgQQCAYcOG4d69e/j2229x584dhIeHIyQkRGE7kyZNwtmzZzFq1ChcuXIF9+7dw969ezFq1KhPdZhEmoHTHxIRUU7SErO+O7XkH7mIiEiRa0fA7P9nYWvxLW+eIiL6HKXK6w3G6o2DiEgDsShe3NKSsr7rM0mR+pmamsLU1FTlsvnz56Nbt27o378/6tati/v37+Po0aOwsLAAkDX9+c6dO7Fnzx7UqlUL69atw9y5cxW2UbNmTfz++++4e/cumjdvjjp16mD69OmwsbEp9mMj0igKz47NXhjn9IdERJ+1lISs71LV/2cjIqLP3Ps3Wd+rd+PNU0REnyP5TbSsNxARFRifKV7c5EVxjhQnNQgJCYFMJkNCQoLK5Xv27BF/lkqlWLFiBVasWJHj9jp06IAOHTootA0cOFDhdf369XHs2LEctxEdHZ134ESfA/n0h0cmAQnP/mvns2OJiD5vqfFZ3/XN1BsHERGVPLJM4P3brJ8Ny6o3FiIiUg9xpLiJeuMgItJAah0pPm/ePNSvXx8mJiawsrJC586dcefOHXH5mzdv8PXXX6Nq1aowMDBApUqVMHr0aMTHxytsRyKRKH1FRER86sNRLY3TmRARUQ5cOwLD//zvdd+dnP6QiOhzl/L/v+twpDgREWX3Pg6AkPWzgYU6IyEiInWR1xs4UpyIqMDUOlL8999/x8iRI1G/fn1kZGRgypQpaNu2LW7evAkjIyM8e/YMz549w6JFi+Dq6opHjx5h2LBhePbsGXbs2KGwreDgYLRr1058bW5u/omPJgepfKY4ERHlIvX/Z3LQ1gOqtAIkOT1nnIiISj1BAFLl0yGyKE5ERNkkv876LjUDtHXVGwsREakH6w1ERIWm1qL4kSNHFF6HhITAysoKf/31F1q0aIHq1atj586d4nInJyfMmTMH/fr1Q0ZGBnR0/gvf3Nwc1tbWnyz2fOP06URElBv59IcGFiyIExF97tLeAYIs62eOFCciouzkRXFOnU5E9HnKzAAy3mf9zOnTiYgKrEQ9U1w+LXqZMmVy7WNqaqpQEAeAkSNHYvDgwahcuTKGDRuGgQMHQpJDcSE1NRWpqania/nzltPT05Genv6xh6FAKyUB2gBkOobILKJty2MsTKzp6ekQBAEymQwymaxI4ikIQRDE7+rYf2FpatwAY1cHTY0bKPrYZTIZBEFAeno6tLW1FZYV9fX2Y+SWFz78Xhwk715BB4AgNUdGCcgT6qSpcQOaGzvj/rQYd/73VRJ8yt8ZRO/eQBeAoKWDDOgCRbAffu4+LU2NG9Dc2Bn3p/Wp4y5p5yen3AB8mlgliTHQASAzKFMkf2PS1M8hwNjVRVNj19S4Acae23ZLik/6e0NKAuTzhKRr6RfJ7wuAZn/OAM2OX5NjBxi/ujF+5W3lRSLIKyBqJpPJ0LFjR8TFxeH06dMq+8TGxsLNzQ39+vXDnDlzxPagoCC0bNkShoaGOHbsGGbMmIEffvgBo0ePVrmdwMBAzJw5U6k9PDwchoaGRXNA/8/l2XZ88XI/Hli2xfWK/Yp024Who6MDa2tr2NnZQU9PT93hEFEpl5aWhidPnuDFixfIyMhQWJacnIw+ffqINzup06fMC9lViLuABg9X4rXRFzj9xdRi3RcRUUn3uecGk/f/ouXtKUjVNsaRmmuKZR9ERJqkJOUFQL2/NwBApdgTqPPkZ7wwrYVzTuOLfX9ERCXR55wbpGlv4HVjLGQSbeyvHVyk2yYi0mT5zQ0lpig+fPhwHD58GKdPn0bFihWVlickJKBNmzYoU6YM9u3bB13dnJ+dNH36dAQHB+PJkycql6u6e8vOzg6xsbFFnki1jn4H7Ysbkdl0HGQeU4pkm+np6YiMjESbNm1yPQ+qpKSk4MmTJ3BwcIBUKi2SeApCEAQkJibCxMQkx5H8JZGmxg0wdnXQ1LiBoo89JSUF0dHRsLOzU7rmJCQkoFy5ciXil5ic8sLz589x7ty5Ql1v80tyeTN0Do2DzNkLmX5hRbLNj8kT6qSpcQOaGzvj/rQYd940ITcUx+8McpJ/z0NnU3sI5g7IGHmxSLbJz92npalxA5obO+P+tD513CUpLwA554bw8HB06tSp2M+J1tnl0I4KgqxmL2T6rvro7Wnq5xBg7OqiqbFratwAY1dFU3JDsfzeEHsXuuubQDCwQMa4e0W2WU3+nAGaHb8mxw4wfnVj/P/Jb24oEdOnjxo1CgcOHMDJkydVFsQTExPRrl07mJiYYPfu3XmenIYNGyIoKAipqanQ19dXWq6vr6+yXVdXt+g/OP//jA9tqQm0i3jbhYk3MzMTEokEWlpa0NLSKtJ48kM+HbM8Bk2hqXEDjF0dNDVuoOhj19LSgkQiUXm9KkmJOre8IP9ebPGmZU2rpWVYFlolIE+UBJoaN6C5sTPuT4tx576PkuKT/s4gl54EAJBITYt8H/zcfVqaGjegubEz7k/rU8Vd0s5NTrkB+ETnJOUtAEDLqFyR/u6gqZ9DgLGri6bGrqlxA4w9+/ZKkk/6e0NmCgBAomdSLOdBkz9ngGbHr8mxA4xf3Rh//nODWqs1giBg1KhR2L17N44fPw5HR0elPgkJCWjbti309PSwb9++fI1uvnLlCiwsLHL8ReWTSnuX9V3PWL1xEBFRyfQ+6w9bMLBQbxxERKR+qf//bFqpmXrjICKikin5TdZ3w7LqjYOIiNQjLTHruz5rDUREhaHWkeIjR45EeHg49u7dCxMTE7x48QIAYGZmBgMDA7EgnpycjC1btiAhIQEJCVl/KLK0tIS2tjb279+Ply9folGjRpBKpYiMjMTcuXMxYcIEdR7af9KyRnswURERkUrv47K+syhOREQp8Vnf9dU/DSQREZVA71kUJyL6rKXKB+AZqTcOIiINpdaR4mvXrkV8fDw8PDxQoUIF8Wvr1q0AgEuXLuHcuXP4+++/UaVKFYU+8ueF6+rqYvXq1WjcuDFq166N9evXY8mSJZgxY4Y6D+0/8qI4E1WxO3HiBCQSCeLi4tQdikoeHh4YO3asWmNwcHDAsmXL1BoDEWUjjhQ3V2sYRERUAnCkOBER5Sb5ddZ3FsWJiD5PnJWWiOijqH36dFVfAQEBALKKiDn1cXBwAAC0a9cOly9fRmJiIt69e4crV65g6NChJec5vrx7q0jIC945fXl6eqo7xDzt2rULQUFBxbqPwMBA1K5du1j3UVzu3LkDT09PlC9fHlKpFJUrV8a0adOQnp6e63pr165FzZo1YWpqClNTUzRu3BiHDx9W6DN06FA4OTnBwMAAlpaW6NSpE27fvq3Q5/Hjx/Dx8YGhoSGsrKzw7bffIiMjQ1weEhICiUQCFxcXpRi2b98OiUQiXpeICoTTpxMRkVyKvCjOkeJERKQCi+JERJ83eVGcs9ISERWKWqdP/yyU4ru3MmUCzj98g5jEFFiZSNHAsQy0tSTFsq8mTZrg+fPnSu379u3DsGHDMGLEiGLZb1EqU6aMukMo0XR1dTFgwADUrVsX5ubmuHr1Kr766iu8f/8eixYtynG9ihUrYv78+XB2doYgCNi0aRM6deqEy5cvo1q1agAANzc39O3bF5UqVcKbN28QGBiItm3b4uHDh9DW1kZmZiZ8fHxgbW2Ns2fP4vnz5xgwYAB0dXUxd+5ccV9GRkaIiYnBH3/8gcaNG4vtP/30EypVqlR8J4dKN3H6dHN1RkFERCWBfKQ4p08nIiJVWBQnIvq8iQPwTNQbBxGRhiohw6lLsVI6ffqR68/RbMFx9N7wJ8ZEXEHvDX+i2YLjOHJduXBdFPT09GBtba3w9fbtW0yYMAFTpkxBjx49xL5//fUX6tWrB0NDQzRp0gR37twRlwUEBKBz584K2x47diw8PDzE16mpqRg9ejSsrKwglUrRrFkzXLhwQVwuH7V+9OhR1KlTBwYGBmjZsiViYmJw+PBhuLi4wNTUFH369EFycrK4Xvbp0x0cHDB37lx8+eWXMDExQaVKlfDjjz8qxPbvv/+id+/eKFOmDIyMjFCvXj2cO3fuI8/mf5YsWYIaNWrAyMgIdnZ2GDFiBN69eycuDwkJgbm5OQ4cOICqVavC0NAQ3bt3R3JyMjZt2gQHBwdYWFhg9OjRyMzMFNcLDQ1FvXr1YGJiAmtra/Tp0wcxMTG5xlK5cmUMHDgQtWrVgr29PTp27Ig+ffrgjz/+yHU9X19ftG/fHs7Ozvjiiy8wZ84cGBsb488//xT7DBkyBC1atICDgwPq1q2L2bNn48mTJ4iOjgYAHDt2DDdv3sSWLVtQu3ZteHt7IygoCKtXr0ZaWpq4HR0dHfTp0wc///yz2Pbvv//ixIkT6NOnT77OOZGSlLis7xwpTkRE8meKc6Q4ERFll5n+X55gUZyI6PPEkeJERB+FRfHiJhbFS0+iOnL9OYZvuYTn8SkK7S/iUzB8y6ViK4x/KC4uDp06dYKHh4fSlOTff/89Fi9ejIsXL0JHRwdffvllgbY9ceJE7Ny5E5s2bcKlS5dQpUoVeHt74+3btwr9AgMDsWrVKpw9exZPnjyBn58fli1bhvDwcBw8eBDHjh3DypUrc93X4sWLUa9ePVy+fBkjRozA8OHDxSL+u3fv4O7ujqdPn2Lfvn24evUqJk6cCJlMVqDjyY2WlhZWrFiBGzduYNOmTTh+/DgmTpyo0Cc5ORkrVqxAREQEjhw5ghMnTqBLly44dOgQDh06hNDQUKxfvx47duwQ10lPT0dQUBCuXr2KPXv2IDo6WnwsQn7dv38fR48eRdOmTfO9TmZmJiIiIpCUlKQwkvtDSUlJCA4OhqOjI+zs7AAAf/zxB2rUqIHy5cuL/by8vJCQkIAbN24orP/ll19i27Zt4g0PISEhaNeuncK6RAUinz5daq7WMIiIqARI4UhxIiLKgfz3Bkg4yxQR0ecqtfTOSktE9Clw+vTiJJMB6SW/KC4IAt6nZ+bdEVlTps/YdwOCqu0AkAAI3HcTTauUUzmVukwmw/u0TOikZUBLSwsGutqQSAo25bpMJkOfPn2go6ODsLAwpfXnzJkDd3d3AMB3330HHx8fpKSkQCqV5rntpKQkrF27FiEhIfD29gYAbNiwAZGRkQgNDcXUqVPFvrNnzxYLtoMGDcLkyZPx4MEDVK5cGQDQvXt3REVFYdKkSTnur3379uLU75MmTcLSpUsRFRWFqlWrIjw8HK9evcKFCxfEqderVKmS39OUL9lHrs+ePRvDhg3DmjVrxPb09HSsXbsWTk5O4nGFhobi5cuXMDY2hqurKzw9PREVFYWePXsCgMKNCJUrV8aKFStQv359vHv3DsbGuf9baNKkCS5duoTU1FR89dVXmDJlSp7H8ffff6Nx48ZISUmBsbExdu/eDVdXV4U+a9aswcSJE5GUlISqVasiMjISenp6AIAXL14oFbXlr1+8eKHQXqdOHVSuXBk7duxA//79ERISgiVLluCff/7JM04iJRlp/93ly5HiRESUymeKExFRDuRTpxtYAFra6o2FiIjUIy0x6ztHihMRFQqL4sVJXhAHSvT06e/TM+E6/WiRbEsA8CIhBTUCj+Wr/81ZXjDUK9jHcMqUKfjjjz9w/vx5mJgoPz+lZs2a4s8VKlQAAMTExOTrmc8PHjxAenq6wuhkXV1d1K9fH3fv3s1xP+XLl4ehoaFYEJe3nT9/Ptf9fbgNiUQCa2trcZrxK1euoE6dOiqfRf748WOFou+UKVPyVTzO7tdff8W8efNw+/ZtJCQkICMjAykpKUhOToahoSEAwNDQUCyIy4/LwcFBobhdvnx5henR//rrLwQGBuLq1at4+/atOLpdHne1atXw6NEjAEDz5s1x+PBhcd2tW7ciMTERV69exbfffgtbW1tMmzYNp06dEm9UAID169ejb9++AICqVaviypUriI+Px44dO+Dv74/ff/9d4Rz17dsXbdq0wfPnz7Fo0SL4+fnhzJkz+bpZIrsvv/wSwcHBqFSpEpKSktC+fXusWrWqwNshEqdOhwSQmqkzEiIiKgnkI8WZE4iIKDs+T5yIiDhSnIjoo7AoXpzkU6dLtABdA/XGUkpERERg0aJFOHjwIJydnVX20dXVFX+WjyKXF2W1tLQgCIrj3NPT0wsVS/b9fPha3pbXVOe5rWNgkPNnxsbGBleuXBFfqyqc5yU6OhodOnTA8OHDMWfOHJQpUwanT5/GoEGDkJaWJhbFVcWYW9xJSUnw8vKCl5cXwsLCYGlpicePH8PLy0t8PvehQ4fE8579OOVTmru6uiI9PR3Dhg3DlClTUK9ePYVj/nBkt56enjiK3s3NDRcuXMDy5cuxfv16sY+ZmRnMzMzg7OyMRo0awcLCArt370bv3r1hbW2tdAPDy5cvAQDW1tZK565v376YOHEiAgMD0b9/f+jo8FJKhfQ+Luu71IyjPYiI6L+R4vosihMRUTYsihMRURqL4kREH4OVnOL04fPECzhF+KdkoKuNm7O88tX3/MM3CAi+kGe/kIH10cBRuVArk8mQmJAIE1MTcfr0/Lpy5QoGDRqE+fPnw8srf/FmZ2lpievXryttV17kdXJygp6eHs6cOQN7e3sAWUXzixcvYujQoYXaZ2HVrFkTGzduxJs3b5SK3jo6Oh89lfpff/0FmUyGxYsXQ0tLCwCwbdu2j9omANy+fRuvX7/G/PnzxQL3xYsXFfrIz21eZDIZ0tPTIZPJYGBgkO9jlslkSE1NzXG5IAgQBEHs07hxY8yZMwcxMTGwsrICAERGRsLU1FRpGnYg6yaEjh07Ytu2bVi3bl2+YiJSSf5cQD4TkIiIgA9GinP6dCIiykYsihf8pngiIiol5PUGTp9ORFQoLIoXJ/HOrZI7dTqQNco3v1OYN3e2RAUzKV7Ep6h8rrgEgLWZFM2dLXN8pniGnjYM9XTEQmx+xMbGonPnzvDw8EC/fv2UnvOsrZ2/4nrLli2xcOFCbN68GY0bN8aWLVtw/fp11KlTBwBgZGSE4cOH49tvv0WZMmVQqVIl/PDDD0hOTkb//v3zHW9R6N27N+bOnYvOnTtj3rx5qFChAi5fvgwbGxs0btw4x/Xev3+vMKJaJpNBIpGgVq1aCv2qVKmC9PR0rFy5Er6+vjhz5kyRFHgrVaoEPT09rFy5EsOGDcP169cRFBSU53phYWHQ1dVFjRo1oK+vj4sXL+L7779Hly5dlEamf2jy5Mnw9vZGpUqVkJiYiPDwcJw4cQJHj2Y9EuCff/7B1q1b0bZtW1haWuLff//F/PnzYWBggPbt2wMA2rZtC1dXV/Tv3x8//PADXrx4galTp2LkyJHQ19dXud+QkBCsWbMGZcvyLn36CGJRnM8TJyL67MlkH4wUZ1GciIiyYVGciIg4fToR0UdhUbw4iSPFS3ZRvCC0tSSY4euK4VsuQQIoFMblJfAZvq4qC+If4+DBg3j06BEePXokPif8Q/b29ggJCclzO15eXpg2bRomTpyIlJQUfPnllxgwYAD+/vtvsc/8+fMhk8nQv39/JCYmol69ejh8+DDMzc2L8Ijypqenh2PHjmH8+PFo3749MjIy4OrqitWrV+e63t27d8Uiv5y7uzuOHz+u0FarVi0sWbIECxYswOTJk9GiRQvMmzcPAwYM+Ki4LS0tERISgilTpmDFihWoW7cuFi1ahI4dO+a6no6ODhYsWIC7d+9CEATY29tj5MiR+PLLL3NdLyYmBgMGDMDz589hZmaGmjVr4ujRo2jTpg0AQCqV4tSpU1i2bBnevn2L8uXLo0WLFjh79qw4KlxbWxsHDhzA8OHD0bhxYxgZGcHf3x+zZs3Kcb8GBga5TnFPlC8sihMRkVxaIsT/XXOkOBERZZf8Jus7p08nIvp8pSVmfdc3UW8cREQaikXx4pSqGSPFC6pd9QpY268uZu6/iefxKWK7tZkUM3xd0a66ctH6Y/n7+8Pf3z/PftmfF167dm2ltpkzZ2LmzJk5bkMqlWLFihVYsWKF2CaTyZCQkDVyx8PDQ2mbAQEBCAgIUGgLDAxEYGCg+PrEiRMKy6Ojo5X2/eEIbyCr2L9jx44cY80u+z6zx559n9988w2++eYbhbYPR8Tn57gAKN2Q0Lt3b/Tu3VuhLfs5y65nz57o2bNnjrHn5Keffsp1uY2NDQ4dOpRrHyDrXOfWT9W5+NDYsWMxduzYPPdDpCAlLus7i+JERCSfOl1LF9CRqjcWIiIqefhMcSIi4khxIqKPUqii+OPHj/Ho0SMkJyfD0tIS1apVy3GK4c9aWulNUu2qV0AbV2ucf/gGMYkpsDKRooFjmSIfIU5ElB8am5fkI8Wl5moNg4ioNNK43CCfOl1qBkj4f2oioqKmcXkhOxbFiYiKnMblBnm9gc8UJyIqlHwXxaOjo7F27VpERETg33//VRj1qaenh+bNm2PIkCHo1q1bgZ4VXaqJ06eXziSlrSVBYyf+MkZE6lEq8hKnTyciKlIanRvkI8U5dToRUZHR6LyQHadPJyIqEhqbGzIzgIz/n7W1lNYbiIiKW76u6qNHj0atWrXw8OFDzJ49Gzdv3kR8fDzS0tLw4sULHDp0CM2aNcP06dNRs2ZNXLhwobjj1gyl8JniREQlQanJSyyKExEVGY3PDfKR4vosihMRFQWNzwvZcaQ4EdFH0+jcIH+eOMCiOBFRIeVrpLiRkRH++ecflC2r/B9vKysrtGzZEi1btsSMGTNw5MgRPHnyBPXr1y/yYDVOWul8pjgRkbqVmrz0Pi7rO4viREQfTeNzQ0p81neOFCciKhIanxey40hxIqKPptG5QT4AT1sP0NFTbyxERBoqX0XxefPm5XuD7dq1K3QwpU4pfqY4EZE6lZq8JI4UN1drGEREpYHG5wZ5UZwjxYmIioTG54UPZaT+N0LQsIx6YyEi0mAanRtSWWsgIvpYBX4oxsOHD3Hv3j2l9nv37iE6OrooYio9OH06EVGx0+i8xOnTiYiKhUbmBvn06VJztYZBRFQaaWRe+JB8lLhEG9A3U28sRESlhMblBvkAPH0WxYmICqvARfGAgACcPXtWqf3cuXMICAgoiphKD3lRnImKiKjYaHReYlGciKhYaGRuSJEXxTlSnIioqGlkXviQ+DzxMoBWgf+UR0REKmhcbkj9/xlD9EzUGwcRkQYr8P+kL1++jKZNmyq1N2rUCFeuXCmKmEoPTp9ORFTsNDYvyWRASlzWzyyKExEVKY3MDfKR4pw+nYioyGlkXviQWBTn88SJiIqKxuUGjhQnIvpoBS6KSyQSJCYmKrXHx8cjMzOzSIIqNTh9OhFRsdPYvJSWCAiyrJ85VS4RUZHSyNzAkeJERMVGI/PCh1gUJyIqchqXG8RnirPWQERUWAUuirdo0QLz5s1TSAyZmZmYN28emjVrVqTBaTwmqs+Kh4cHxo4dK752cHDAsmXL1BbPxwgICEDnzp3VHQZRvmhsXpJPna5jAOhK1RsLEVEpo5G5ISU+6ztHihMRFTmNzAsf+nD6dCIiKhIalxs4Ky0R0UfTKegKCxYsQIsWLVC1alU0b94cAHDq1CkkJCTg+PHjRR6gRivtI8VlmcCjs8C7l4BxecC+CaClre6oCsTDwwO1a9fW2OJ1YZ04cQKenp54+/YtzM3N1R1OgXXs2BFXrlxBTEwMLCws0Lp1ayxYsAA2NjYq+7958wYzZszAsWPH8PjxY1haWqJz584ICgqCmZlZjvtJSUnBsGHD8Ndff+HWrVvo0KED9uzZo9AnJCQEAwcOFF8bGRmhatWqGDt2LPr27ZvrcTg4OGDs2LEKN1MUVnR0NBwdHXH58mXUrl37o7eXl8DAQOzZs6dETCelsXmJzxMnIio2GpkbUjlSnIiouGhkXvhQ8pus7wYsihMRFRWNyw3i9Ol8pjgRUWEVeKS4q6srrl27Bj8/P8TExCAxMREDBgzA7du3Ub169eKIUXOJd2+VwkR1cx+wrDqwqQOwc1DW92XVs9qJipmnpye2bduGO3fuYOfOnXjw4AG6d++eY/9nz57h2bNnWLRoEa5fv46QkBAcOXIEgwYNynU/mZmZMDAwwOjRo9G6desc+5mamuL58+d4/vw5Ll++jLZt22LgwIG4c+dOoY+xuKSlpak7hCKnsXnpfVzWdxbFiYiKnEbmBnH69Jxv2CMiosLRyLzwIU6fTkRU5DQuN6RypDgR0ccqcFEcAGxsbDB37lwcPHgQO3bswPTp01GmDO9WVVJaR4rf3AdsGwAkPFNsT3ie1V5MhXEPDw98/fXXGDt2LCwsLFC+fHls2LABSUlJGDhwIExMTFClShUcPnxYXOf69evw9vaGsbExypcvj/79+yM2NhZA1hThv//+O5YvXw6JRAKJRILo6GhkZmZi0KBBcHR0hIGBAapWrYoVK1Z8dPxLlixBjRo1YGRkBDs7O4wYMQLv3r0Tl4eEhMDc3BwHDhxA1apVYWhoiO7duyM5ORmbNm2Cg4MDLCwsMHr0aIVpfUJDQ1GvXj2YmJjA2toaffr0QUxMzEfH+6EjR46gWbNmMDc3R9myZdGhQwc8ePBAXB4dHQ2JRIJt27ahefPmMDAwQP369XH37l1cuHAB9erVg7GxMby9vfHq1StxvQsXLqBNmzYoV64czMzM4O7ujkuXLuUZzzfffINGjRrB3t4eTZo0wXfffYc///wT6enpKvtXr14dO3fuhK+vL5ycnNCyZUvMmTMH+/fvR0ZGRo77MTIywtq1a/HVV1/B2to6x34SiQTW1tawtraGs7MzgoKCoKWlhWvXruV5LNm3s3HjRnTp0gWGhoZwdnbGvn3//Xt6+/Yt+vbtC0tLSxgYGMDZ2RnBwcEAAEdHRwBAnTp1IJFI4OHhAeC/qfDnzJkDGxsbVK1aVdxX9lHv5ubmCAkJEV//+++/6N27N8qUKQMjIyPUq1cP586dQ0hICGbOnImrV6+K/3Y+XE8dNDIviSPFzdUaBhFRaaVxuUE+UpzTpxMRFQuNywsfYlGciKhYaFRuEEeKsyhORFRYhSqKnzp1Cv369UOTJk3w9OlTAFmFudOnTxdpcBpPU4rigpAVa36+UhKAwxMBCKo2lPXtyKSsfjltIz35v58FVdvJ2aZNm1CuXDmcP38eX3/9NYYPH44ePXqgSZMmuHTpEtq2bYv+/fsjOTkZcXFxaNmyJerUqYOLFy/iyJEjePnyJfz8/AAAy5cvR+PGjfHVV1+Jo3zt7Owgk8lQsWJFbN++HTdv3sT06dPx/fffY/fu3R91mrW0tLBixQrcuHEDmzZtwvHjxzFx4kSFPsnJyVixYgUiIiJw5MgRnDhxAl26dMGhQ4dw6NAhhIaGYv369dixY4e4Tnp6OoKCgnD16lXs2bMH0dHRCAgI+KhYs0tKSsK4ceNw8eJF/Pbbb9DS0kKXLl0gk8kU+s2YMQNTp07FpUuXoKOjgz59+mDixIlYvnw5Tp06hfv372P69Oli/8TERPj7++P06dP4888/4ezsjPbt2yMxMTHfsb158wZhYWFo0qQJdHV1871efHw8TE1NoaNT4KdI5CozMxObNm0CANStW7fA68+cORN+fn64du0a2rdvj759+/4fe/cd31S9/gH8k6Tp3ruVljIKpVCwyFYRZIMoilcEfwrKRcUNF+cFBQeiV1HR68A9QK5e50VFAcGBbASBslcZHbSlMx1pcn5/pOfQTdImOeebft6vl6/QND15Oj+ePOf7fFFQYBuVN2/ePGRkZOCHH37Avn378MYbbyAyMhIAsGXLFgDAmjVrkJWVhS+//FI55tq1a3HgwAGsXr0aK1eutKuO0tJSXHHFFTh9+jS+/fZb7Nq1Cw899BCsVismTZqEf/zjH+jevbvyuzNp0iSHP1dnEjKXOD6diMilhMsGrhQnInIp4XKhNjbFiYhcQqhs4EpxIqJWc7gb9MUXX+Dmm2/GTTfdhB07dqCyshKArcG0cOFCfP/9904vUkhWK2CWm+IaDyqzCVjY+F7MjpNsK8gXJTT6Xj2A0Np3PHbGoYsGevXqhblz5wIAHn30USxatAiRkZGYMWMGAODxxx/HG2+8gb/++gtr1qxBeno6Fi5cqHz8e++9h4SEBBw8eBBdunSBt7c3/P3966wCNhgMWLBggfJ2hw4d8Mcff+Drr7/G1KlT7a61vtr7RiclJeHpp5/GnXfeiddff12532w244033kCnTp0AANdffz0+/vhj5OTkIDAwEKmpqRg6dCjWrVunNCFvu+025eM7duyIJUuWoG/fvigtLUVgoHN+9iZOnFjn7ffeew9RUVHIyMioM05ozpw5GDVqFADg/vvvx+TJk7F27VpceumlAIDp06fXWVF85ZVX1jnu0qVLERoail9++QVXXXVVszU9/PDDeO2112AymTBgwAC7m70AkJeXh6eeegq333673R/TnKKiIuVrXV5eDqPRiJdffln5Pjpi2rRpmDx5MgBg4cKFWLJkCbZs2YLRo0cjMzMT6enp6NOnDwDbz5EsKioKABAREdFgVXtAQADeeecdeHt7213H8uXLcfbsWWzdulW5QrZz587K+wMDA+Hl5dXsCnp3ETaXuFKciMhlhMsGqwWoqrkokCvFiYicTrhcqK+8Zk9xNsWJiJxGuGxQzhc8cKtWIiI3cXil+NNPP40333wTb7/9dp1VmZdeeqldY4/bDLkhDmh/pbhAevbsqfzbYDAgIiICaWlpyn0xMTEAgNzcXOzatQvr1q1DYGCg8l9KSgoA1Bn93Zh///vfuOSSSxAVFYXAwEC8/fbbOHXqFADbFYS1j7ls2TK7al+zZg2GDRuGiy66CEFBQbj55puRn58Pk8mkPMbf379OIzUmJgZJSUl1mtsxMTF1xqNv374d48ePR2JiIoKCgnDFFVcAADIzMwEAaWlpaNeuHYKDgzFmzBi7aq3v0KFDmDx5Mjp27Ijg4GClGSs/h6z290f+XtT//tSuPScnBzNmzEBycjJCQkIQHByM0tJS5bgzZ85Uaq/f4H/wwQfx559/4qeffoLBYMAtt9wCyY7JA8XFxRg3bhxSU1Mxf/585f7u3bsr31NHv05BQUHYuXMndu7ciT///BPPPPMMZs+ejf/9738AbM3t2j8z9b9utdX+GgYEBCA4OFj5ms2cORMrVqzAxRdfjIceegh//PGHXfWlpaU51BAHgF27diE9PV27I6NqETaXKgptt1wpTkTkdMJlgzw6HQB82RQnInI24XKhPhOb4kREziZcNigrxdlrICJqKYdXih84cACDBw9ucH9ISAgKCwudUZNnkEen6/SA0U/dWi7E6G9bsW2PE38Ay66/8ONu+i/QflCDu61WK4pLShAcFAS9Xm97bkdKrTceW6fT1blPp9Mpz1NaWorx48fjueeea3CcuLi4Jp9jxYoVmDNnDl588UUMHDgQQUFBeP7557Fx40YAQJ8+fbBz507l8XLztznHjx/HVVddhZkzZ+KZZ55BeHg4fv/9d0yfPh1VVVXw9/e36/OT75PHlpeVlWHUqFEYNWoUli1bhqioKGRmZmLUqFGoqqoCAKxcuRLnzp1DYGAgAgJa9j9N48ePR/v27fH2228jPj4eVqsVPXr0UJ5D1tj3ov59tUeuT506Ffn5+XjllVfQvn17+Pj4YODAgcpxFyxYgDvuuAOBgYG2n5daIiMjERkZiS5duqBbt25ISEjApk2bMHDgwCY/j5KSEowePRpBQUH46quv6tT2/fffK3uS+/k59jur1+vrrKLu0aMHfvjhB/zrX//CNddcgzvvvFMZ2w/Y9itqSnPf7zFjxuDEiRP4/vvvsXr1agwbNgx33303XnjhhWbra+z7rtPpGlxEUHtPdke/BmoSNpfkleK+oaqWQUTkiYTLBnl0usEH8PJRtxYiIg8kXC7Up4xP1/5Fy0REohAuG6o4Pp2IqLUcborHxsbi8OHDdcb2AsDvv/+Ojh07Oqsu8VXVGp1e0xzULJ3O/ivMOl0JBMcDxVlofF9xne39na4E9IaG77ZaAaPF9nz6Fm1pb7fevXvjiy++QFJSUpP7Rnt7e8NisdS5b8OGDRg0aBDuuusu5b6jR48q//bz86vTALXH9u3bYbVa8eKLLyrN3c8++8yhYzRm//79yM/Px6JFi5CQYBtZv23btjqPad++PcLCwhAcHNygsWyP/Px8HDhwAG+//TYuv/xyAHDavjobNmzA66+/jrFjxwIATp48iby8POX90dHR8PX1vWDtctNYHnPUmOLiYowaNQo+Pj749ttv4evrW+f97du3b82n0oBer0d5eTkAIDw83GkrrqOiojB16lRMnToVl19+OR588EG88MILykrw+j/PzR0nKytLefvQoUN1phakpaXh3XffRUFBQaO1N/a7oxZhc6m80HbLleJERE4nXDZUcj9xIiJXEi4Xaqsy2ba9A7hSnIjIiYTLBrnf4MOmOBFRSzncIZsxYwbuv/9+bN68GTqdDmfOnMGyZcswZ84czJw50xU1iqnKQ8eZ6A3AaHnldf1mf83boxc13hB3s7vvvhsFBQWYPHkytm7diiNHjuDHH3/ErbfeqjTzkpKSsHnzZhw/fhx5eXmwWq1ITk7Gtm3b8OOPP+LgwYOYN28etm7d2qpaOnfuDLPZjFdffRVHjx7Fxx9/jDfffLPVn2NiYiK8vb2V43777bd46qmn7P743bt3K2O/d+7ciV27djV4TFhYGCIiIrB06VIcPnwYP//8M2bPnt3q2gEgOTkZH3/8Mfbt24fNmzfjpptuuuAK5c2bN+O1117Dzp07ceLECfz888+YPHkyOnXqpKwSP336NFJSUrBlyxYAtob4yJEjUVZWhnfffRfFxcXIzs5Gdnb2BRu7GRkZ2LlzJwoKClBUVKR8rWqTJEk53rFjx7B06VL8/PPPuPrqq1v+xWnE448/jm+++QaHDx/G3r17sXLlSnTr1g2A7QICPz8/rFq1Cjk5OSgqKmr2WFdeeSVee+01/Pnnn9i2bRvuvPPOOqvUJ0+ejNjYWEyYMAEbNmzA0aNH8cUXXygTE5KSknDs2DHs3LkTeXl5zV6Q4GrC5pKypzib4kREziZcNsgrxTk6nYjIJYTLhdrk/cT1Ru4jS0TkRMJlQ2XNnuLezAIiopZyuCn+yCOPYMqUKRg2bBhKS0sxePBg/P3vf8cdd9yBe++916FjPfvss+jbty+CgoIQHR2NCRMm4MCBA3UeU1FRgbvvvhsREREIDAzExIkTkZOTU+cxmZmZGDduHPz9/REdHY0HH3wQ1dXVjn5qzuXJe3ykXg3c8BEQXG8EeXC87f5U5zYCWyo+Ph4bNmyAxWLByJEjkZaWhgceeAChoaHKquM5c+bAYDAgNTVVGT1+xx134LrrrsOkSZPQv39/5Ofnt/p/hHr16oXFixfjueeeQ48ePbBs2TI8++yzrf4co6Ki8MEHH+Dzzz9HamoqFi1adMFR2rUNHjwY6enpyn+XXHJJg8fo9XqsWLEC27dvR48ePTBr1iz861//anXtAPDuu+/i3Llz6N27N26++Wbcd999iI6ObvZj/P398eWXX2LYsGHo2rUrpk+fjp49e+KXX36Bj49t3KjZbMaBAweUlc87duzA5s2bsXv3bnTu3BlxcXHKfydPnmz2+caOHYv09HT873//w/r165WvVW3FxcXK8bp164aXXnoJjz76KB577LFWfHUa8vb2xqOPPoqePXti8ODBMBgMWLFiBQDAy8sLS5YswVtvvYX4+Hhcc801zR7rxRdfREJCAi6//HJMmTIFc+bMUcb4y8/1008/ITo6GmPHjkVaWhoWLVoEg8F2wcvEiRMxevRoDB06FFFRUfj000+d+rk6wpm55FZcKU5E5DLCZYO8UtyHTXEiIlcQLhdqU0anR2h/EiERkUCEywZ5ER5XihMRtZjD49N1Oh3++c9/4sEHH8Thw4dRWlqK1NRUBAY6/sf4l19+wd13342+ffuiuroajz32GEaOHImMjAxlD9xZs2bhu+++w+eff46QkBDcc889uO6667BhwwYAtlHB48aNQ2xsLP744w9kZWXhlltugdFoxMKFCx2uyWmU8eke2BQHbI3vlHG2PcZLc4DAGNse4i5cIb5+/foG9x0/frzBfbX3SU5OTsaXX37Z5DG7dOmirHyt7f3338f777+vvG21WvHII484VF/92mbNmoVZs2bVue/mm29W/j1t2jRMmzatzvvnz5+P+fPn17nvgw8+qPP25MmTMXny5Dr31d8rur4hQ4Y0+5j6zzF8+HBkZGQ0+RxJSUkNjtfYc9T/HNPT0xuswr/++ub3rE9LS8PPP//c7GPq13Ohz7c5jf2M1dbY981qtaK4uFhpINt77MZqrL2H0dy5czF37twmj/f3v/8df//73+vcV/97KYuPj8ePP/7Y4Lnk2gHbSPn//ve/jX68j49Pk+9zN2fmklspK8VDVS2DiMgTCZcNXClORORSwuVCbbWb4kRE5DTCZUMl9xQnImoth5viMm9vb6SmpqK4uBhr1qxB165dlTG+9lq1alWdtz/44ANER0dj+/btGDx4MIqKivDuu+9i+fLluPLKKwHYmpXdunXDpk2bMGDAAPz000/IyMjAmjVrEBMTg4svvhhPPfUUHn74YcyfP1/ZZ9ftlPHpHjzORG8AOlyudhVERACck0tuxfHpREQuJ0w2VNRse8KV4kRELiVMLtRmqhmf7h+ubh1ERB5KiGywmAFLzdaFXClORNRiDjfFb7jhBgwePBj33HMPysvL0bdvXxw7dgySJGHFihWYOHFii4uR98AND7f9j/727dthNpsxfPhw5TEpKSlITEzExo0bMWDAAGzcuBFpaWmIiYlRHjNq1CjMnDkTe/fubTDmGAAqKyvr7H8rr4o0m80wm80trr82XXkxvABYjX6wOOmYMrnGltRqNpshSRKsViusVqtT67KHvBJWrkEUotYNsHY1iFo34PzarVYrJEmC2WxusHLeWX9vnZFLzeWCM2tVVFfAWF1uO7ZXEKChnFCTqHUD4tbOut2Lddv/XK3l6mxw9tdCbzoHAwCrd5Cmzh3UxLrdT9TaWbd7ubtuLeUC0HQ2OLPW+vQlubaM8AtnRtTC2tUhau2i1g2w9uaO21quzgannjeUn4Ox5p9mnQ9fR6pH5PpFrh1g/Wpj/Q2PdSE6ycGZwrGxsfjxxx/Rq1cvLF++HE888QR27dqFDz/8EEuXLsWff/7ZooKtViuuvvpqFBYW4vfffwcALF++HLfeemudUAGAfv36YejQoXjuuedw++2348SJE3XGAJtMJgQEBOD777/HmDFjGjzX/PnzsWDBggb3L1++vM6euq3RMfcnpJ3+BKdC+2N7h7udckxn8PLyQmxsLBISEtRbRU9EbUZVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhODglq+Mc0YuuSMXavMxF2L0nvsgQYdvL34f0Omd/hxERCJqq9mQenoFknO/x+Go0djbbopTj01EJDIt5QLg/vMGAOia9SVSsr/Gscgr8VfCNJc8BxGRSNpiNvhV5WHk3tmw6IxYefG7TjkmEZEnsTcbHF4pXlRUpKzkXrVqFSZOnAh/f3+MGzcODz74YIsLvvvuu7Fnzx6lIe5Kjz76KGbPnq28XVxcjISEBIwcObJVQVqb/vf9wGkgvn1nxIwd65RjysxmM1avXo0RI0bAaDRe+ANqqaiowMmTJxEYGAhfX1+n1mUPSZJQUlKCoKAg6HQ6tz9/S4laN8Da1SBq3YDza6+oqICfnx8GDx7c4G9O7VUVreGMXGoqF4YOHYrNmze36O9ts87uB/YA8AvD2HFXOe+4NVqTE2oStW5A3NpZt3ux7gsTIRucec4g03+/BsgFOnTrhfaXa+fcQU2s2/1ErZ11u5e769ZSLgBNZwMAl31N9KvWA9lAYtd0tBvCjJCxdnWIWruodQOsvTGiZINTzxvO7gf2Anq/YIx1cq8BEPvnDBC7fpFrB1i/2lj/efZmg8NN8YSEBGzcuBHh4eFYtWoVVqxYAQA4d+5ci5us99xzD1auXIlff/0V7dq1U+6PjY1FVVUVCgsLERoaqtyfk5OD2NhY5TFbtmypc7ycnBzlfY3x8fGBj49Pg/uNRqPzfnAstrG4et9g6F30w9iSei0WC3Q6HfR6PfR6969MlMcxyzWIQtS6AdauBlHrBpxfu16vh06na/TvlbP+3jojl5rLBfnWqf9jYS4BAOj8wlz6PyxOr9tNRK0bELd21u1erLv553AGV2eD078OVaUAAIN/GAwaOnfQAtbtfqLWzrrdy111aykXgKazQa7VJV+TinMAAENQFDOiEaxdHaLWLmrdAGuvfzxncHU2OLfXUAEA0HkH8nWkZohcv8i1A6xfbazf/mxwuOPxwAMP4KabbkK7du0QHx+PIUOGAAB+/fVXpKWlOXQsSZJwzz334KuvvsLPP/+MDh061Hn/JZdcAqPRiLVr1yr3HThwAJmZmRg4cCAAYODAgdi9ezdyc3OVx6xevRrBwcFITU119NNznqoy261PoHo1EBG1Ac7MJbcpt72wBb8wdesgIvJQwmVDZc0VzT7OXYFOREQ2wuVCbaZ8261/hLp1EBF5GKGyocq2uAI+QerWQUQkOIdXit91113o168fTp48iREjRigrCTt27Iinn37aoWPdfffdWL58Ob755hsEBQUhOzsbABASEgI/Pz+EhIRg+vTpmD17NsLDwxEcHIx7770XAwcOxIABAwAAI0eORGpqKm6++WY8//zzyM7Oxty5c3H33Xc3efWuW8hNce8A9WogImoDnJlLblNeaLv1C1WzCiIijyVcNlQU2W592RQnInIF4XKhNlOB7dY/XN06iIg8jFDZUGmbLAXvtrMAz2K1YEfuDpw1nUWUfxR6R/eGQW9QuywiEpzdTfHLL78c11xzDa655hr06dMHffr0qfP+cePGOfzkb7zxBgAoV2HJ3n//fUybNg0A8NJLL0Gv12PixImorKzEqFGj8PrrryuPNRgMWLlyJWbOnImBAwciICAAU6dOxZNPPulwPU5VWXP1VhsKKiIid3JFLrlNG1opzpMYInInYbOhou2sFGcuEJE7CZsLtSlNca4UJyJyBiGzoWa7pbYylXbNiTVYtGURckw5yn0x/jF4pN8jGN5+uIqVEZHo7G6Kz5gxA9988w0WLFiAdu3a4eqrr8bVV1+NQYMGQafTtejJJUm64GN8fX3x73//G//+97+bfEz79u3x/ffft6gGl+FKcSIil3JFLrlNG2mK8ySGiNxN2GyQx6f7hqhbh4sxF4jI3YTNBZkktanx6bxwiojcQchsUFaKe36vYc2JNZi9fjYk1O0d5ZpyMXv9bCwespjnDkTUYnbvKX7LLbfgiy++QF5eHl588UUUFhbib3/7G2JjY3Hbbbfh66+/Rnl5uStrFQub4pAkCWXmMhRVFqHMXGbXRRAiGzJkCB544AHl7aSkJLz88suq1dMa06ZNw4QJE9Qug6hZQudSG2iKyycxtRsfwPmTmDUn1qhUGRF5MmGzQV4p7sHj05kLRKQGYXNBVlUGWCpt//bwpviaE2sw6otRuO3H2/Dwbw/jth9vw6gvRjEfiMjphMwGeU9xb8/eU9xitWDRlkUNGuIAlPue2/IcLFaLu0sjIg9hd1Nc5uPjg7Fjx+Ktt97CmTNn8O233yIuLg7z5s1DREQErrrqKmzYsMEVtYpFaYp77kgTi9WCrdlb8f3R77E1e2udMCquLMbBcwdxvOg4TpWcwvGi4zh47iBKzCUqVtxQ/UZ2W7F+/XrodDoUFhaqXUqLXH311UhMTISvry/i4uJw880348yZM81+zNKlSzFkyBAEBwc79Llv3boVw4YNQ2hoKMLCwjBq1Cjs2rVLeb/8tZT/8/PzQ1paGj744IMLHtvZP386nQ5ff/21047XnA8++AChoaFuea4LETKXKgptt76halbhMjyJISK1CZUNlmrAXHPu4OOZK8WZC0SkNqFyoTZ5lbiXL2D0V7cWF+KFU0SkBqGyQe41ePj49B25OxpkQW0SJGSbsrEjd4cbqyIiT+JwU7y+/v3745lnnsHu3buxe/duDBs2DFlZWc6oTWzyPh8e2hRv7gre4spinCw5iWprdZ2PqbZW41TJKZRbNXalHQln6NCh+Oyzz3DgwAF88cUXOHLkCK6//vpmP8ZkMmH06NF47LHH7H6e0tJSjB49GomJidi8eTN+//13BAUFYdSoUTCbzXUee+DAAWRlZSEjIwO33347/vGPf2Dt2rUt+vxcqaqqSu0SXE6IXPLwleI8iSEirdF0Nsij0wGPXSnOXCAirdF0LtRWe3S6Vkf6thIvnCIirdB0NlR6dq9BdtZ01qmPIyKqz+Gm+MmTJ3Hq1Cnl7S1btuCBBx7A0qVL0alTJ8yaNeuCzak2ocpz9/m40BW8Xx3+qtmPL7IWteh5hwwZgnvvvRcPPPAAwsLCEBMTg7fffhtlZWW49dZbERQUhM6dO+OHH35QPmbPnj0YM2YMAgMDERMTg5tvvhl5eXkAbCPCf/nlF7zyyivKKt/jx4/DYrFg+vTp6NChA/z8/NC1a1csWbKkRTXXtnjxYqSlpSEgIAAJCQm46667UFpaqrxfXnm7cuVKdO3aFf7+/rj++uthMpnw4YcfIikpCWFhYbjvvvtgsZw/Gfz444/Rp08fBAUFITY2FlOmTEFubm6r661t1apVuOyyyxAaGqpcKXnkyBHl/cePH4dOp8Nnn32Gyy+/HH5+fujbty8OHjyIrVu3ok+fPggMDMSYMWNw9uz5/2nZunUrRowYgcjISISEhOCKK67Ajh0XfjF01qxZGDBgANq3b49BgwbhkUcewaZNmxo0qmt74IEH8Mgjj2DAgAF2f9779+9HQUEBnnzySXTt2hXdu3fHE088gZycHJw4caLOY6OjoxEbG4sOHTrg3nvvRfv27fHnn3/a/VyAbeT+woULcdtttyEoKAiJiYlYunSp8v6qqircc889iIuLg6+vL9q3b49nn31W+VgAuPbaa6HT6ZS358+fj4svvhjvvPMOOnToAF9fX+Xx9cf7X3zxxViwYIHydmFhIe644w7ExMTA19cXPXr0wMqVK7F+/XrceuutKCoqUn535s+f79Dn6kxC5pKHN8V5EkNEahMqGypq/t/Yyw8wGNWtxUWYC0SkNqFyoTZTge3WP1zdOlyIF04RkVqEyga51+DhK8Wj/KOc+jgiovocbopPmTIF69atAwBkZ2dj+PDh2LJlC/75z3/iySefdHqBwhJoT3FJkmAym+z6r6SyBM9uebbJK3glSHhvz3swmU2oqK5o9L+y6jLkV+TDZDY5vM/4hx9+iMjISGzZsgX33nsvZs6cib/97W8YNGgQduzYgZEjR+Lmm2+GyWRCYWEhrrzySqSnp2Pbtm1YtWoVcnJycMMNNwAAXnnlFQwcOBAzZsxAVlYWsrKykJCQAKvVinbt2uHzzz9HRkYGHn/8cfzzn//EV1813+y/EL1ejyVLlmDv3r348MMP8fPPP+Ohhx6q8xiTyYQlS5ZgxYoVWLVqFdavX49rr70W33//Pb7//nt8/PHHeOutt/Df//5X+Riz2YynnnoKu3btwtdff43jx49j2rRpraq1vrKyMsyePRvbtm3D2rVrodfrce2118JqtdZ53BNPPIG5c+dix44d8PLywpQpU/DQQw/hlVdewW+//YbDhw/j8ccfVx5fUlKCqVOn4vfff8emTZuQnJyMsWPHoqTE/jH7BQUFWLZsGQYNGgSj0bkvJHft2hURERF49913UVVVhfLycrz77rvo1q2b0nSuT5IkrFq1CqdOnUK/fv0cfs4XX3wRffr0wZ9//om77roLM2fOxIEDBwAAS5Yswbfffquskl+2bJlSx9atWwEA77//PrKyspS3AeDw4cP44osv8OWXX2Lnzp121WG1WjFmzBhs2LABn3zyCTIyMrBo0SIYDAYMGjQIL7/8MoKDg5XfnTlz5jj8uTqLkLnk4U1xnsQQkdqEyoZKz99PnLlARGoTKhdqq71S3EPxwikiUotQ2VAp7ynu2U3x3tG9EeMfAx0an46igw6x/rHoHd3bzZURkafwcvQD9uzZozR6PvvsM6SlpWHDhg346aefcOedd9ZpeLVZVgtgNtn+LUBQlVeXo//y/k47XkFFAW776Ta7Hrt5ymb4O7AvVq9evTB37lwAwKOPPopFixYhMjISM2bMAAA8/vjjeOONN/DXX39hzZo1SE9Px8KFC5WPf++995CQkICDBw+iS5cu8Pb2hr+/P2JjY5XHGAyGOqtlO3TogD/++ANff/01pk6danet9dXeOzopKQlPP/007rzzTrz++uvK/WazGW+88QY6deoEALj++uvx8ccfIycnB4GBgUhNTcXQoUOxbt06TJo0CQBw223nv9YdO3bEkiVL0LdvX5SWliIw0Dk/fxMnTqzz9nvvvYeoqChkZGSgR48eyv1z5szBqFGjAAD3338/Jk+ejLVr1+LSSy8FAEyfPr3OXttXXnllneMuXboUoaGh+OWXX3DVVVc1W9PDDz+M1157DSaTCQMGDMDKlStb8yk2KigoCOvXr8eECRPw1FNPAQCSk5Px448/wsur7p/Pdu3aAQAqKythtVrx6KOPYvDgwQ4/59ixY3HXXXcBsH2OL730EtatW4euXbsiMzMTycnJuOyyy6DT6dC+fXvl46KibC8ih4aG1vl5BmwrzD/66CPlMfZYs2YNtmzZgn379qFLly4AbD9fspCQEOh0ugbPpQYhc6m80HbrF6pmFS4jn8TkmnIbvYhKBx1i/GN4EkNELiNUNlTITXHP3E8cYC4QkfqEyoXa2kBTnBdOEZFahMoGZaV4kLp1uJhBb8Aj/R7B7PWzG7xPbpQ/3O9hGPQGd5dGRB7C4ZXiZrMZPj4+AGxNk6uvvhoAkJKSop09NtQmN8QBjx9p4m49e/ZU/m0wGBAREYG0tDTlvpiYGABAbm4udu3ahXXr1iEwMFD5LyUlBQDqjP5uzL///W9ccskliIqKQmBgIN5++21lnM5vv/1W55jLli2zq/Y1a9Zg2LBhuOiiixAUFISbb74Z+fn5MJnO/7z4+/srDXH580lKSqrT3I6JiakzHn379u0YP348EhMTERQUhCuuuAIAkJmZCQBIS0tDu3btEBwcjDFjxthVa32HDh3C5MmT0bFjRwQHByurk+XnkNX+/sjfi/rfn9q15+TkYMaMGUhOTkZISAiCg4NRWlqqHHfmzJlK7fUb/A8++CD+/PNP/PTTTzAYDLjlllscnjxQmzxmPzAwEN27dwcAlJeXY/r06bj00kuxadMmbNiwAT169MC4ceNQXl5e5+N/++037Ny5Ezt37sTSpUvx0ksv4Y033gAALFu2rM7PzG+//dZkHbW/hnLTWf6aTZs2DTt37kTXrl1x33334aeffrLrc2vfvr1DDXEA2LVrF9q1a6c0xLVMuFyyWs6PyvXQleLySUxjeBJDRO4gVDbIK8V9PHelOHOBiNQmVC7U1gaa4lwVSERqESobKj13q9b6hrcfjsVDFsOorzsRNMY/BouHLMbw9sNVqoyIPIHDK8W7d++ON998E+PGjcPq1auV1ZNnzpxBRITn/k+6Q+TR6To94OWrbi128PPyw+Ypm+167Pac7bhr7V0XfNzDfR9Gt/Bujb7PAAM6h3WGXq+Hn5efQ7XWH4+t0+nq3KfT2U6irFYrSktLMX78eDz33HMNjhMXF9fkc6xYsQJz5szBiy++iIEDByIoKAjPP/88Nm7cCADo06dPnRHUcvO3OcePH8dVV12FmTNn4plnnkF4eDh+//13TJ8+HVVVVfD397fr85Pvk8eWl5WVYdSoURg1ahSWLVuGqKgoZGZmYtSoUaiqqgIArFy5EufOnUNgYCACAlr2P07jx49H+/bt8fbbbyM+Ph5WqxU9evRQnkPW2Pei/n21R65PnToV+fn5eOWVV9C+fXv4+Phg4MCBynEXLFiAO+64A4GBgdDr617DExkZicjISHTp0gXdunVDQkICNm3ahIEDB7boc3znnXeURrdc8/Lly3H8+HFs3LhRef7ly5cjLCwM33zzDW688Ubl4zt06IDQ0FAAQLdu3fD777/j2Wefxd13342rr74a/fufn8Zw0UUXNVlHc9/v3r1749ixY/jhhx+wZs0a3HDDDRg+fHidcfqNaez7rtfrG1xEUHtPdj8/x3431SRcLlUUAfIqOd9QNStxKfkkZs4vc2CRLMr9Mf4xeLjfwzyJISKXEiobKjx/fDpgy4WH+z6MRVsX1bmfuUBE7iBULtQmN8X9PHdPca4KJCK1CJUN8kpxAabSOsMVCVdAX7Oe8x99/oHuEd3RO7o3s4CIWs3hpvhzzz2Ha6+9Fv/6178wdepU9OrVCwDw7bfftmj/XI9UWSukdI1f6aolOp3O7hHmg+IHXXD0YZR/FHpF9YJe1/gggnB9OPyN/g2anM7Wu3dvfPHFF0hKSmow6lrm7e0Ni8VS574NGzZg0KBByghrADh69Kjybz8/P3Tu3NmhWrZv3w6r1YoXX3xR+bw/++wzh47RmP379yM/Px+LFi1CQkICAGDbtm11HtO+fXuEhYUhODi4RV/z/Px8HDhwAG+//TYuv/xyAMDvv//e6toB29f69ddfx9ixYwEAJ0+eRF5envL+6Oho+Pr6XrB2uWlcWVnZ4loaa1SbTCbo9XqlwQ9Aebv+fur1GQwGpckeFBSEoCDnjDcKDg7GpEmTMGnSJFx//fUYPXo0CgoKEB4eDqPR2ODnuSlRUVF1rnotLi7GsWPHlLfT0tJw6tQpZauB+hr73VGLcLkk7yfuHQh4eatbi4tdetGlsEq235XH+j2GzmGdeRJDRG4hVDa0gZXisqCacY/Jocn4e9rfEeUfxVwgIrcQKhdqawMrxYHzF9T+8/d/wlR9fpoeL5wiIlcSKhvayPh02ZHCI6i0ViLQGIhbUm9pss9AROQoh5viQ4YMQV5eHoqLixEWdn7s6+23366sdm3zqjx3nEntK3h10NVpjMtX8D7a71HEB8Yjuyy7zsca9UbEBMRAKm/5iGtH3H333Xj77bcxefJkPPTQQwgPD8fhw4exYsUKvPPOOzAYDEhKSsLmzZtx/PhxBAYGIjw8HMnJyfjoo4/w448/okOHDvj444+xdetWJCYmtriWzp07w2w249VXX8X48eOxYcMGvPnmm63+HBMTE+Ht7Y1XX30Vd955J/bs2aNc1WiP3bt312nW6nQ65X8AZWFhYYiIiMDSpUsRFxeHzMxMPPJI4+MvHZWcnIyPP/4Yffr0QXFxMR588MELrlDevHkztm7dissuuwxhYWE4cuQI5s2bh06dOimrxE+fPo1hw4bho48+Uv4nNjs7G9nZ2Th8+HCdzz0xMRHh4Y1fdT9ixAg8+OCDuPvuu3HvvffCarVi0aJF8PLywtChQ+s8Njc3FxUVFaisrMSmTZvw2WefNdiLvbUWL16MuLg4pKenQ6/X4/PPP0dsbKyyQj0pKUnZw93Hx6fO3+j6rrzySnzwwQcYP348QkND8fjjj8NgOP+C9BVXXIHBgwdj4sSJWLx4MTp37oz9+/dDp9Nh9OjRSEpKQmlpKdauXYtevXrB399ftQwQLpcqCm23HrxKXHbw3EFIkBDhG4EbU26sc4EJEZErCZUN8pYaHr5SHAD25O0BAAyIH4CxHceqXA0RtSVC5UJtpgLbrb/nrhSXDW8/HB/t/Qh/nv0Tf+vyN4zpMIYXThGRSwmVDZVta6W4fN7QPbI7G+JE5FQt+otiMBgaNFuSkpIQHR3tlKKEJ49P99CQkq/gjfav+/1ubF8PXy9ftAtqh6SQJCSHJSPI6L6r2eLj47FhwwZYLBaMHDkSaWlpeOCBBxAaGqqsOp4zZw4MBgNSU1OV0eN33HEHrrvuOkyaNAn9+/dHfn4+Zs6c2apaevXqhcWLF+O5555Djx49sGzZMjz77LOt/hyjoqLwwQcf4PPPP0dqaioWLVqEF154we6PHzx4MNLT05X/LrnkkgaP0ev1WLFiBbZv344ePXpg1qxZ+Ne//tXq2gHg3Xffxblz59C7d2/cfPPNuO+++y74d8Tf3x9ffvklhg0bhq5du2L69Ono2bMnfvnlF2UfILPZjAMHDtTZr/3NN99Eeno6ZsyYUedz//bbb5t8rpSUFPzvf//DX3/9hYEDB+Lyyy/HmTNnsGrVqgYj+Lt27Yq4uDh07twZjz76KKZOnYolS5a09EvTKHmUf58+fdC3b18cP34c33//vfLz/OKLL2L16tVISEhAenp6s8d69NFHccUVV+Cqq67CuHHjMGHChDr72QPAF198gb59+2Ly5MlITU3FQw89pKwOHzRoEO68805MmjQJUVFReP755536uTpKqFySV4p76H7itWXkZwAAukV0Y0OciNxOmGxQmuIh6tbhBvKLWz0ieqhcCRG1RcLkQm1tZKU4AFisFuw/tx8AcFO3m9A3ti8b4kTkcsJkg7JS3DP7DfXxvIGIXMXhleI5OTmYM2cO1q5di9zc3AZ70mplnK6qlKa4560Ulw1vPxxDE4ZiR+4OnDWdbTD6sMxs+xoEewcjxOf8C3z1f14csX79+gb3HT9+vMF9tZ8jOTkZX375ZZPH7NKli7JXeG3vv/8+3n//feVtq9V6wdXR9eurX9usWbMwa9asOvfdfPPNyr+nTZuGadOm1Xn//PnzMX/+/Dr3ffDBB3Xenjx5MiZPnlznvgt9nYcMGdLsY+o/x/Dhw5GRkdHkcyQlJTU4XmPPUf9zTE9Px9atW+s85vrrr2+29rS0NPz888/NPqaxehr7WtpjxIgRGDFiRJPvb+zztFqtKC4uvuAe7hf6mQFQZ//6GTNmKE39xowfPx7jx4+vc19Tn3dwcDBWrFhR576pU6cqtQNAeHg43nvvvSaf74033sAbb7zR5PvdRbhcKi+03fqFqlmFW8hN8dSIVJUrIaK2RqhsUMane3ZT3GwxY3+BrdnRI5IvbhGRewmVC7WVyyvFPb8pfrToKMqry+Hv5Y+k4CS1yyGiNkCYbKiuAixVtn976CK8+uSmeFpkmsqVEJGncbgpPm3aNGRmZmLevHmIi4vjyq/GVLWNcSYGvQF9Y/s2uF+SJKUpHujhXwMiUp9wudSGVorvy98HgE1xInI/obKhoqYp7uHj0w8WHoTZakawdzASghLULoeI2hihckEmSW1qpbjcAEmNSOUKcSJyC2GyQe41AB7fbwAAk9mEw4W27S95MS0ROZvDTfHff/8dv/32Gy6++GIXlOMhPHhPcXuUV5fDKllh0Bvga/BVuxwi8nDC5VIbWSleaanEkcIjAIDuEd1VroaI2hqhskFZKe7ZTfG9eXsB2F7Y0uwLjkTksYTKBVllMWCttv27DewpzlWBRORuwmRDZYnt1ssXMDjczhHO/oL9sEgWRPlFISYgRu1yiMjDOLyneEJCQqtGYLcJbWB8enNKzbaLAgKMAXzBi4hcTrhcaiMrxQ8WHES1VI1w33DE+PMkhojcS6hsaCMrxeVmBy+UIiI1CJULMnmVuDEAMPqpW4sb7M7bDQDoHsmcICL3ECYb2shUWpmynzhXiRORCzjcFH/55ZfxyCOPNLr3LdWQg8qnbQRVffLo9ABj27wogIjcS7hcaiNNcXk/8W7h3XiBFBG5nVDZUFFku/XwleJys4MvbhGRGoTKBZmp7ewnXmmpxKFzhwBwpTgRuY8w2SAvwGsjvQY2xYnIlRyetzFp0iSYTCZ06tQJ/v7+MBqNdd5fUFDgtOKEpawUbxtBVZvFakF5dTkAINDY9j5/InI/4XKpotB26+FN8X0F3E+ciNQjVDbI49N9Q9Stw4VMZhOOFh0FwGYHEalDqFyQKfuJe/7o9AMFB5QpU3EBcWqXQ0RthDDZII9P9w5Stw432ZPPpjgRuY7DTfGXX37ZBWV4mDY8Pt1UbYIkSTDqjTDqjRf+ACKiVhIul+SV4r6hqpbhavJKcTbFiUgNQmVDGxifvq9gH6ySFdH+0Yjyj1K7HCJqg4TKBZnSFPf8leK1p4lwyhQRuYsw2dCGptIWVhTiZMlJANx2iYhcw+Gm+NSpU11Rh2eplPf5aHtNcWV0ujf3Eyci9xAul9rA+PQqSxUOFdrGH7IpTkRqECYbLGagZsqSJ49PV0YgRnC1BxGpQ5hcqK0NNcX35u0FwJwgIvcSJhsq286e4nvzbXnQPrg9Qnw8d5IWEanH4T3FAeDIkSOYO3cuJk+ejNzcXADADz/8gL179zq1OGFVtZ2gqq/UbPvcOTqdiNxJqFxqA03xQ+cOodpajRCfEI4/JCLVCJEN8ipxoG00xTkCkYhUJEQu1NaGmuLySvHukVwVSETuJUQ2tKGV4rUnhxARuYLDTfFffvkFaWlp2Lx5M7788kuUltr+KO/atQtPPPGE0wsUUhvdU9xsNaOyuhIAEGBse6vkiUgdQuWSJAHlhbZ/e3BTPKOgZnR6eCqnhhCRKoTJhsoi260xADA4PMRLGHJTnM0OIlKLMLlQWxvZU7ykqgTHi48DYBOEiNxLmGxQ9hT3/NfbOWGKiFzN4ab4I488gqeffhqrV6+Gt7e3cv+VV16JTZs2ObU4YbXRPcVNZhMAwNfLF156z31Rj4i0RahcMpcDFtvFQ/ALVbUUV+J+4kSkNmGyoaKmKe7B+4kXVhTiVOkpANwXkIjUI0wu1GYqsN16eFNcHpV7UeBFCPf17M+ViLRFmGxQptIGqVuHi0mSxAlTRORyDjfFd+/ejWuvvbbB/dHR0cjLy3NKUcJrI01xyWJB2eYtKFr5Hco2b0FJzYt6XCXufGvXrkW3bt1gsVgAAPPnz8fFF1+sak0ffPABQkNDlbddUdOqVatw+eWXw2q1OvW45FmEyiV5dLrey6OnibApTkRqEyYb5PHpvp67Xx73BSQiLRAmF2prI+PT2QAhIrUIkw1yr8HDx6dnl2UjvyIfXjovpISnqF0OEXkoh5vioaGhyMrKanD/n3/+iYsuusgpRQmvSh5p4rlBVfzTTzg8bDgyp07FmTlzkDl1KkrGTwF+2eSypvi0adMwYcIElxy7OcePH4fBYEBYWBgMBgOCgoLQvXt33H333Th06JBbanjooYcwd+5cGAwGtzxfS8yZMwdr16516jFHjx4No9GIZcuWOfW45FmEyqXa+4l76Fhxs8WMQ+dsfxu7RXRTuRoiaquEyYbKmqZ4G9hPnKvEiUhNwuRCbW2kKb43z3bxFEflEpG7CZMNlfJKcc/tNQDAnnzbeUNyWDJ8vXxVroaIPJXDTfEbb7wRDz/8MLKzs6HT6WC1WrFhwwbMmTMHt9xyiytqFI+HrxQv/uknnL7/AVRnZ9e5X8rNh+6fL8CyboNKlbWOxWJpdkXy119/jdOnT2PXrl1YuHAh9u3bh169ejm9EVzf77//jiNHjmDixIkufZ7WCgwMRESE80/WJ0+ejNdee83pxyXPIVQuVRTabj14P/HDhYdhtpoR7B2MdoHt1C6HiNooYbJBWSnu+U1xrgAkIjUJkwu1tZGm+O683QCYE0TkfsJkg7wAz8NXist50D2SF9MSkes43BRfuHAhUlJSkJCQgNLSUqSmpmLw4MEYNGgQ5s6d64oaxSPYSBNJkmA1mez6z1JSgpynnwEkqcFxdDX/nV24CJaSkqaPU16u/Ftq5Dj2GjJkCO677z489NBDCA8PR2xsLObPn1/nMYWFhbjjjjsQExMDX19f9OjRAytXrgRwfvz3t99+i9TUVPj4+CAzM7PJ55Ofo2PHjrjmmmuwZs0a9O/fH9OnT1fGmgPAN998g969e8PX1xcdO3bEggULUF1dbVdNjVmxYgVGjBgBX9+GV8i99dZbSEhIgL+/P2644QYUFRUp79u6dStGjBiB6OhoJCYmYujQodixY4fyfkmSMH/+fCQmJsLHxwfx8fG47777lPdXVlZizpw5uOiiixAQEID+/ftj/fr1TdZZf3y6vLL/hRdeQFxcHCIiInD33XfDbDY79ByjR4/Gtm3bcOTIkSafm9o2oXJJXinuG6pqGa4kj07vFtENOg9dDU9E2idMNnj4SnFJkpQVH2mRaSpXQ0RtmTC5ILNaz587eHBT/KzpLHJMOdDr9Nx6iYjcTphsqGwbe4rLk0N43kBEruTl6Ad4e3vj7bffxuOPP47du3ejtLQU6enpSE5ORnl5Ofz8/FxRpzisFsBssv1bkJEmUnk5DvS+xGnHq87JwcG+/Zp9TE7Nbdcd26Hz92/xc3344YeYPXs2Nm/ejI0bN2LatGm49NJLMWLECFitVowZMwYlJSX45JNP0KlTJ2RkZNQZQW4ymfDcc8/hnXfeQUREBKKjo+1+br1ej/vvvx/XXnsttm/fjn79+uG3337DLbfcgiVLluDyyy/HkSNHcPvttwMAnnjiCbtqqu+3337DlClTGtx/+PBhfPbZZ/jf//6H4uJiTJ8+HXfddZcyarykpARTp07FK6+8gpKSEixduhRjx47FoUOHEBQUhC+++AIvvfQSVqxYge7duyM7Oxu7du1Sjn/PPfcgIyMDK1asQHx8PL766iuMHj0au3fvRnJysl1fo3Xr1iEuLg7r1q3D4cOHMWnSJFx88cWYMWOG3c+RkJCAmJgY/Pbbb+jUqZN93xxqU4TKpdrj0z0U9xMnIi0QJhs8fKV4jikHeeV5MOgM6BreVe1yiKgNEyYXZBWFgFQzyc4vXNVSXEmeJtIxpCP8jS1/bYiIqCWEyYaqmqa4IAvwWsIqWbE3v2Y7DU4OISIXcrgpft9992HJkiVISEhAQkKCcn9ZWRmuuuoqrFu3zqkFCkduiAMeOz5dS3r27IknnngCAJCcnIzXXnsNa9euxYgRI7BmzRps2bIF+/btQ5cuXQAAHTt2rPPxZrMZr7/+Onr16tWi509JSQFg23e8X79+WLBgAR555BFMnTpVeb6nnnoKDz30EJ544gm7aqrvxIkTiI+Pb3B/RUUFPvroI2WPm1dffRXjxo3Diy++iNjYWFx55ZUAAKvViuLiYrz11lsIDw/HL7/8gquuugqZmZmIjY3F8OHDYTQakZiYiH79bBczZGZm4v3330dmZqby3HPmzMGqVavw/vvvY+HChXZ9fcLCwvDaa6/BYDAgJSUF48aNw9q1azFjxgyHniM+Ph4nTpyw6zmp7REql9pSUzycTXEiUo8w2VBRM+XHQ1eKy6s9Ood2hp+XRl5UJKI2SZhckJkKbLc+wYCXt7q1uJA8TYQNECJSgzDZoKwU99xew/Gi4ygzl8HPyw8dQ5p/rZyIqDUcbop/9913CAsLw4IFC5T7ysrKMHr0aKcWJiw5pHR6wKvhuGst0vn5oeuO7XY91rRtG07efscFH5ew9C349+nT4H6r1YrikhIEBwVBr9dD18or7nr27Fnn7bi4OOTm5gIAdu7ciXbt2inN58Z4e3s3OIYj5PHv8ojgXbt2YcOGDXjmmWeUx1gsFlRUVMBkMtlVU33l5eWNjk5PTExUGuIAMHDgQFitVhw4cACxsbHIycnB3LlzsX79euTk5MBqtcJkMikj4v/2t7/h5ZdfRseOHTF69GiMHTsW48ePh5eXF3bv3g2LxdKgzsrKSof2De/evXudVfBxcXHYvdu2P4wjz+Hn5weTyQSixgiVS+WFtlsPbYqbrWYcPHcQAFeKE5G6hMmGypqmuG+IunW4CPeJJSKtECYXZMp+4p67Shw4v1K8RwRzgojcT5hsqPL88enyeUO38G7w0jvcsiIispvDf2F++uknXH755QgLC8MDDzyAkpISjBo1Cl5eXvjhhx8cOtavv/6Kf/3rX9i+fTuysrLw1VdfYcKECcr7m9qL9Pnnn8eDDz4IAEhKSmqwgvTZZ5/FI4884tgn5izyfuLegYAge6nqdDq7R5gHXHopvGJjUZ2T0+i+4tDp4BUTg4BLL4WusZHgViv01dXQ+/tDr3d4S/sGjEZjvafXwWq1jRizZ8SNn59fq/a83bdvHwCgQ4cOAIDS0lIsWLAA1113XYPH+vr6tmjsTmRkJM6dO+fwx02dOhX5+fl46aWXEBERgYiICFx66aWoqqoCYBtLfuDAAaxZswarV6/GXXfdhX/961/45ZdfUFpaCoPBgO3btzcY7R4YaP+onua+P448R0FBAaKioux+XmpbnJlLLqesFA9VtQxXOVp4FFXWKgQZg5AQlHDhDyAichFhskEZn+6ZTXF5BWD3yO4qV0JEbZ0wuSBTmuKeu5+4JEnnR+VGsSlORO4nTDa0gfHpvJiWiNzF4aZ4p06dsGrVKgwdOhR6vR6ffvopfHx88N133yEgwLERHmVlZejVqxduu+22RpuIWVlZdd7+4YcfMH36dEycOLHO/U8++aSyRzEABAWpeNWUcuWWZ4aUzmBAzGOP4vT9D9ia/rUb4zW95ZjHHm28Ie5mPXv2xKlTp3Dw4EGHVmbby2q1YsmSJejQoQPS09MBAL1798aBAwfQuXNnp9WUnp6OjIyMBvdnZmbizJkzyujxTZs2Qa/Xo2tX236NGzZswOuvv46xY8eiuLgYRUVFyMvLq3MMPz8/jB8/HuPHj8fdd9+NlJQU7N69G+np6bBYLMjNzcXll19u99fEEfY+R0VFBY4cOaJ8jYnqc2YuuZyHj0+XR6d3i+jWqguOiIhaS5hsqKxpinvg+HSrZEVGni0X0iLTVK6GiNo6YXJB1gaa4qdKTqGosghGvRFdQp3/mg0R0YUIkw2Vnt1vAM5vu8TzBiJytRbNoujZsydWrlyJESNGoH///li5cmWLVsCOGTMGY8aMafL9sbGxdd7+5ptvMHTo0AZ7MAcFBTV4rGqUleIaCk4nCx45EnjlZeQsfBbV2dnK/YaYGMQ+9pjt/RpwxRVXYPDgwZg4cSIWL16Mzp07Y//+/dDpdC0ag1NQUIDs7GxUVFRgz549ePnll7FlyxZ89913ykrnxx9/HFdddRUSExNx/fXXQ6/XY9euXdizZw+efvrpFtU0atQofPjhhw3u9/X1xdSpU/HCCy+guLgY9913H2644QbldyE5ORkff/wxevfujaysLDz55JN1fk8/+OADWCwW9O/fH/7+/vjkk0/g5+eH9u3bIyIiAjfddBNuueUWvPjii0hPT8fZs2exdu1a9OzZE+PGjXP461dfly5d7HqObdu2wcfHBwMHDmz1c5LnclYuuZyHN8XllR7dwrupXAkRkSDZoKwU97ymeGZxJkrMJfAx+KBTaCe1yyEiEiMXZG2gKS6vCkwJT4HRYLzAo4mIXEPz2VBdCVjNtn976ErxKksV9p/bD4ATpojI9exqiqenpze64svHxwdnzpzBpZdeqty3Y8cO51VXS05ODr777rtGm4OLFi3CU089hcTEREyZMgWzZs2Cl1fTn1plZSUqKyuVt4uLbS9Gmc1mmM3mVtWpKy+CFwCr0R+WVh6rKXKNLanVbDZDkiRYrVZljHVLBA4fjoChQ1GwZQNyMw/AEBWBTldcA53B0Oxx5T245RocIUlSg49r7O3a933++ed48MEHMXnyZJSVlaFz585YuHBhnc//QnXI75dH+/v7+6N9+/YYMmQI3nzzTXTu3Fl5zIgRI/Dtt9/i6aefxnPPPQej0YiUlBTcdtttdtXUmMmTJ+Ohhx7Cvn37lFXgkiShc+fOmDBhAsaOHYuCggKMGzcOr732mnKct99+G3feeSf69OmDiy66CAsXLsRDDz2kfH2Cg4Px/PPPY/bs2bBYLEhLS8M333yDsLAwWK1WvPvuu3jmmWfwj3/8A6dPn0ZkZCT69++PsWPHNvr1k7+3td9u7PtT+zEXeg5JkvDf//4XU6ZMga+vb6t+Zt2pNT/nanN27fL30Ww2NxiT35q/t87OpeZyobW11uZVfg46ANXGIEguygjA+XXbS76yt2to1xZnVO1bkYhaO+t2L9Zt/3O1hDuzwWm5UFFkywUvf5flglo/d7tydgEAuoZ1BSyA2eLY8/P3xf1ErZ11u5e769ZSLgBNZ0Nra61PX3oWBgAW31BYPfC8AQD+yv0LAJAanspzB4GIWruodQOsvbnjtoQ7s8Ep5w2mc5AvGzLrfAAPO2cAbFMHq63VCPUJRYxPDDNBMKxfXay/4bEuRCdJjW0MXdeCBQvsfuInnnjC7sfWKUSna7CneG3PP/88Fi1ahDNnzsDX11e5f/HixejduzfCw8Pxxx9/4NFHH8Wtt96KxYsXN/lc8+fPb/RzWr58Ofzt3Fu7KfHnNqHv8deRF5iCDcmPtepYruDl5YXY2FgkJCTA29u71ccrshah1FoKf50/wgyeufJRC+bNm4eSkhK8/PLLapfiVvn5+ejbty/WrVuH9u3bq10OtUBVVRVOnjyJ7OxsVFdX13mfyWTClClTUFRUhOBgx1bIOTuXXJkLtQ3fOxsBVXn4tcvjOBfQ+DYLorJIFjxV9BSqUY0Hgh5ApCFS7ZKISEBtLRtG7b4HvtXFWNf1aRT7JzrlmFrxnek7bKzaiIHeAzHOv/VThoiobdJSLgDuO2+4+MTbaF/wGzLi/oZDseOddlwtWVqyFJmWTEz0n4h0b26XRkT2a0vZ4F95FiMy/oFqvTe+6/VOq46lVZsqN2Fl+UokeyVjauBUtcshIkHZmw12NcXd4UJN8ZSUFIwYMQKvvvpqs8d57733cMcdd6C0tBQ+Pj6NPqaxq7cSEhKQl5fncJDWp9v5Cby+ewDWziNgmfRpq47VFLPZjNWrV2PEiBEwGh0bMVVRUYGTJ08iKSmpzsUFLXWs+BgqqisQHxiPEO+QCz5ekiSUlJQgKChIqP1m1a67sLAQb7zxBh5++GHo9XqHPlbt2ltj69at2Lt3L6ZOnSpU7SJ/zZ1de0VFBY4fP46EhIQGf3OKi4sRGRnZopMYZ2sqF7KysrB58+YW/b1tjNcLHaCrLIH5zk1AhOua4q3JiZY6VHgIk76fhACvAPzyt1+g1zn2twpQp25nEbV21u1erPvCRMgGZ5wzyLwWXQSdpRLme/4EQhKccsz61Pq5m/bTNPyV9xeeGvgUxnVwvCnO3xf3E7V21u1e7q5bS7kANJ0Ny5cvxzXXXOO0r4nhs5ugP/Qjqse8CKm36xoEav0cVlurMfjzwaiwVOCLcV+gQ0gHh48h6u8QwNrVIGrdAGtvjCjZ4JTzhpy9ML5zBaSAKFQ/sK+VlTZNzZ+zJzY+gf8d+x9m9JiBmT1ntugY/D1RD+tXF+s/z95saNGe4gCwfft27Ntn+0PcvXt3pKe77qrO3377DQcOHMB//vOfCz62f//+qK6uxvHjx5VR0/X5+Pg02jA3Go2t/8GxVAAA9L7B0Lv4h7Al9VosFuh0Ouj1eoebq/VVW6tRUW37fAONgXYdTx7HLNcgCrXrDg8Pxz//+c8WfazatbdG37590bVrV+FqF/lr7uza9Xo9dDpdo3+vnB3Urcml5nJBvm19PlQDlSW24wVFA274HxWn1G2ng0UHAQApESnw8W78ojR7ubNuZxO1dtbtXqy7+edwJldlg1PqrK4ELLYXz4wB4S7PBXf+3JmtZhw4dwAAcHHMxa16Xv6+uJ+otbNu93JX3VrKBaDpbACc/DUpPwcA8PLA8wYAOFpwFBWWCgQaA9E5onOLLqiVifo7BLB2NYhaN8Da6x/PmVyVDU75vK2219513oFuy113/5ztLbBtxdfa8waAvydqYv3qYv32Z4PDTfHc3FzceOONWL9+PUJDQwHYVrEOHToUK1asQFRUlKOHvKB3330Xl1xyCXr16nXBx+7cuRN6vR7R0dFOr8MuVaW2W+8AdZ7fjcrMZQAAHy8fGA3i/sIRkdjUyKUWqSg6/2/fC0/WEE1GfgYAIDUiVeVKiIgEyYaK8/vQwidIvTpc4EjhEVRaKhFkDEJisGeNhSciMQmRC7WZ8m23/hHq1uEie/L2AAC6R3RvVUOciKg1hMgGudfgE6huHS5SWlWKY0XHANgygYjI1Rz+P897770XJSUl2Lt3LwoKClBQUIA9e/aguLgY9913n0PHKi0txc6dO7Fz504AwLFjx7Bz505kZmYqjykuLsbnn3+Ov//97w0+fuPGjXj55Zexa9cuHD16FMuWLcOsWbPwf//3fwgLU2l/6ypboxjenhlUtclN8QCj518AQETa5cxccqma1R7wCQYMLR7Uoln78m1XVbMpTkRaIEQ2VNY0xb2DAL1B3VqcbHfebgBAamQqmx1EpAlC5EJtnt4Uz69pikeyAUJE6hEiGyrlBXiedRGtLCM/AxIkxAfEI8LPMzOPiLTF4VflV61ahTVr1qBbt27Kfampqfj3v/+NkSNHOnSsbdu2YejQocrbs2fPBgBMnToVH3zwAQBgxYoVkCQJkydPbvDxPj4+WLFiBebPn4/Kykp06NABs2bNUo6jikoxVoo7Yyt5uSkeaPT8CwCIqGWc8bfmQpyZSy4lN8X9QlUtwxUsVosyJpdNcSLSAiGyQZ4g4qv+PojOtjfPNgKxR0QPlSshIrIRIhdklurzGeGpTfGaleI9IpkTRKQeIbLBw1eKyxfT8iIpInIXh5viVqu10dnsRqNR2YvWXkOGDLlgw+T222/H7bff3uj7evfujU2bNjn0nC6nrBTXZlNc/t6ZTCb4+fm1+DhVlipUWaoAAP5e/k6pjYg8j8lkAuD8/Z5qc2YuuZTSFFdpkokLHSs6hvLqcvh7+SMpOEntcoiIxMgGpSnueVtqyM2OtMg0lSshIrIRIhdkFYUAal4r88Bzh4rqChw6dwgAc4KI1CVENigL8DyzKb4333YxLfOAiNzF4ab4lVdeifvvvx+ffvop4uPjAQCnT5/GrFmzMGzYMKcXKJwqbQeVwWBAaGgocnNzAQD+/v7Q6XQOH6eosghWsxV+Xn4wV5lhhtmuj7NaraiqqkJFRQX0enFGKYpaN8Da1SBq3YDzapckCSaTCbm5uQgNDYXB4LqxsMLkUkWh7dYDX9jKKLDtJ54SnsIxuUSkCUJkgzw+3cezVoqXV5fjcOFhAFzxQUTaIUQuyOTR6b6hHrnt0v6C/bBIFkT4RiDGP0btcoioDRMiG6pKbLcevlKck0OIyF0c/r/r1157DVdffTWSkpKQkJAAADh58iR69OiBTz75xOkFCkeAPcVjY2MBQGmMt8S5inMory5HkHcQzN72NcQBW6OsvLwcfn5+LWrGq0XUugHWrgZR6wacX3toaKjyN8dVhMkleaW4b6iqZbgC9xMnIq0RIhsqapriHjY+XW52RPpFstlBRJohRC7IPH0/8VrTREQ7XyYizyJENnjwSvG88jxkl2VDBx1fTyIit3G4KZ6QkIAdO3ZgzZo12L9/PwCgW7duGD58uNOLE1KV9vcU1+l0iIuLQ3R0NMxm+xvaMqtkxbzv56G4qhjPXv4sOkR2sPtjzWYzfv31VwwePNil45SdTdS6AdauBlHrBpxbu9FodOkKcZkwueTB49Mz8m0rxXkSQ0RaIUQ2eOhKcWWf2IgebHYQkWYIkQsyT2+K59tygtNEiEhtQmSDxqfStoZ83tAptBMCjNrtpRCRZ3G4Kf7RRx9h0qRJGDFiBEaMGKHcX1VVhRUrVuCWW25xaoHC0fie4rUZDIYWNaz2F+zHgdID8PPyQ8+4njAa7G+cGQwGVFdXw9fXV6hmoah1A6xdDaLWDYhZuzC55KFNcYvVgn0FXClORNoiRDZ46Epx+cUtNjuISEuEyAWZpzfFa60UJyJSkxDZIPcaPHB8unLeEMHzBiJyH4c3/rz11ltRVFTU4P6SkhLceuutTilKaB589ZZs05lNAIA+MX0caogTEbmCMLlUXmi79bCm+IniEyivLoeflx+SgpPULoeICIAg2eChK8X35u8FwGYHEWmLELkg8+CmeFFlEU4UnwDAJggRqU+IbKis2VPcA3sNvEiKiNTgcFNckqRGx+CdOnUKISEhTilKaB589ZZsU5atKT4gboDKlRARCZRLykrxUFXLcLaMAtvo9K5hXWHQu35cPhGRPYTIhoqaF+B8NVKPE7DZQURaJUQuyEwFtlv/cHXrcAH5wql2ge0Q6huqbjFE1OYJkQ3yAjyfIHXrcDJJkpTtNHpE9lC5GiJqS+wen56eng6dTgedTodhw4bBy+v8h1osFhw7dgyjR492SZFCEWh8ektUWaqwPWc7AGBAPJviRKQe4XLJQ8encz9xItISobJBaYp7zkpxNjuISGuEygWZB68U35vHaSJEpD6hsqHSM6fSnio5haLKIhj1RnQJ66J2OUTUhtjdFJ8wYQIAYOfOnRg1ahQCA8//Ifb29kZSUhImTpzo9AKFYrUAZpPt3x4WVLJdZ3ehwlKBcN9wJIcmq10OEbVhwuUSm+JERC4nVDYo49M1sgrFCeRmB1d7EJFWCJULMqUp7nkrxXfn7QYAdI/kNBEiUo9Q2aCsFPesXoOcBynhKdyelYjcyu6m+BNPPAEASEpKwqRJk+Dr6+uyooQlrxIHPHal+MYzGwHYRqc3Nl6GiMhdhMulikLbrQc1xa2SFfsL9gMAukV0U7kaIiLBsqGipinuQSvF5X0B2RQnIq0QKhdkXClORORSQmWDh64U5+h0IlKLXU3x2vtrTJ061aUFCU1uiuv0gJeGw7QVNmdtBsD9xIlIXcLlkiSdXynuQeNkM4szUWYug6/BFx1DOqpdDhG1ccJlg7JS3IOa4nxxi4g0RLhckHloUzynLAe55bnQ6/RICU9RuxwiaqOEy4aqEtutpzXFeTEtEalEb8+DunfvjhUrVqCqqqrZxx06dAgzZ87EokWLnFKccJT9xIMAD1xFXVxVrLzQxaY4EalJuFyqKgWs1bZ/e9BKcXl0epfwLvDS2z18hojIJYTLBg9bKZ5rykWuydbs6BbO6SFEpD7hckFmKrDdelhTXH49qXNoZ/gb/VWuhojaKqGyQZLO9xs8aHx6tbUa+/L3AWBTnIjcz65XsF999VU8/PDDuOuuuzBixAj06dMH8fHx8PX1xblz55CRkYHff/8de/fuxT333IOZM2e6um5tkvf48NDR6Vuzt8IqWZEUnIS4wDi1yyGiNky4XJJXiRt8AKOfurU4kbKfeDj3Eyci9QmVDZIEVBTZ/u3rGXuKy6s9OoZ0ZLODiDRBqFyQWcznJ4l4WFNcHp3OBggRqUmobKiuPL/AwoNWih8pPIIKSwUCjYFICk5SuxwiamPsaooPGzYM27Ztw++//47//Oc/WLZsGU6cOIHy8nJERkYiPT0dt9xyC2666SaEhXnOCjiHeXhTfNOZTQCA/nH9Va6EiNo64XKpvNB26xfmUZNEMgpqmuIRbIoTkfqEyobqCsBqtv3bQ8ancwQiEWmNULkgk1eJ6/Qec9GUbHfebgBA94juKldCRG2ZUNkg9xoAj2qK184Dvc6uQcZERE7j0KzTyy67DJdddpmrahGfMj7dM5vim7Nt+4kPjBuociVERDbC5JK8UtwvVNUynMkqWZVxV2yKE5GWCJEN8uh06DzmBa69+bYVgGmRaSpXQkRUlxC5IJP3E/cLA/QGdWtxIqtkZU4QkaYIkQ2VNfuJGwMAvec0j3kxLRGpyXP+mmqBfPWWT5C6dbhAdlk2jhUdg16nR5/YPmqXQ0QkFqUprpEVKE5wquQUSs2l8NZ7o2NoR7XLISISizwa1yfYI17gkiRJeXGreyRXABIRtZjcFPew0emZxZkoqSqBj8EHncM6q10OEZEYlF6DZ1xEK2NTnIjUJP4rMFriwSvFN2fZVol3j+iOEB/PGuFFRORyHtgUl/cT7xreFUa9UeVqiIgEI68U9/WM0eknS06iuKoYRr0RXUK7qF0OEZG4PLQpviff1gBJCU/huQMRkb0q5a1aPacpXl5djsOFhwGwKU5E6mBT3Jk8uCm+Kcu2n/iAuAEqV0JEJKCKQtutBzbFu4V3U7kSIiIBVRbZbj1sP/GU8BQYDWx2EBG1mIc2xffm2UanswFCROQAeaW4B/Ua9hfsh0WyINIvEjH+MWqXQ0RtEJvizlTpeUEF2MYhsilORNQK8kpx31BVy3CmjAJbU5z7iRMRtUBFTVPc1zMmMMkrALtHcHQ6EVGrmApst/7h6tbhZLvzdgNgThAROcQDt2qtPTpdp9OpXA0RtUVsijtTleeNNAGAI4VHkFeeB1+DL3pF91K7HCIi8XjY+HRJkrAvfx8ANsWJiFrEw8anyysA06LSVK6EiEhwHrhS3Gw1Y3/BfgBAWiRzgojIbh44Pl2+SIp5QERqcbgpbjAYkJub2+D+/Px8GAwGpxQlLGV8uucEFXB+dHrvmN7wMfioXA0RUV1C5JLSFA9VtQxnOVV6Stk7tnNoZ7XLISJqQPPZUFnTFPeA8enV1mplS40eERyLS0TapPlckHlgU/zwucOotFQiyBiExOBEtcshIlJoPhuUleKe02tQttPgeQMRqcThprgkSY3eX1lZCW9v71YXJDQP3VOco9OJSMuEyKXyQtuth6wUl5sfyWHJ3DuWiDRJ89ngQSvFjxQeQYWlAgHGACSFJKldDhFRozSfCzIPbIorW2xEdodex4GVRKQdms8GD1spXlRZhMySTAC2TCAiUoOXvQ9csmQJAECn0+Gdd95BYOD5P8YWiwW//vorUlJSnF+hSKo8b09xs9WMrdlbAbApTkTaIlQuKU3xUDWrcBq5Kc7R6USkNcJkgwetFN+bb1vtkRqRymYHEWmOMLkgk5vifp6zp7iyKjCSqwKJSBuEyYaqEtuth+wpLu8nnhiUiBCfEJWrIaK2yu6m+EsvvQTAdgXVm2++WWeEiLe3N5KSkvDmm286v0KReOCe4nvy9sBUbUKoTyi6hndVuxwiIoVQueRhe4pzP3Ei0iphssGDVorLL26x2UFEWiRMLshMBbZbD1opLu8fy5wgIq0QJhs8bKU4zxuISAvsboofO3YMADB06FB8+eWXCAvzjBf2ncoDx6dvOmMbnd4/rj9XfhCRpgiVSx7UFJckCRkFXClORNokTDZUFNlufcVfIaG8uMV9AYlIg4TJBZkyPt0zVoqbzCYcKTwCgDlBRNohTDZ42FRaNsWJSAvsborL1q1b54o6PIPcFPfxjKu3AO4nTkTap/lcqq4CzDX54AFN8TNlZ1BUWQQvvReSQ5PVLoeIqFGazwYPGZ9eaanEoXOHAPDFLSLSNs3nAgCYK86fN3jISvH9BfthkSyI8otCTECM2uUQEdWh+WzwoF6DJEnK5JC0yDSVqyGitszhpvhtt93W7Pvfe++9FhcjPA8bn15mLsNfZ/8CwKY4EWmX5nOporDmHzrAA/ZMkvcTTw5NhrfBW+VqiIgap/1s8IyV4vsL9qNaqka4bzjiAuLULoeIqEmazwUAKK8Zna4zCJ8PMq4KJCIt03w2VNbsKe4t/p7iOaYc5Ffkw6AzcItWIlKVw03xc+fO1XnbbDZjz549KCwsxJVXXum0woTkYePTt+dsR7VUjXaB7dAuqJ3a5RARNUrzuSSPTvcNAfTib0PB/cSJSASazwYPWSkuNzu6R3SHTqdTuRoioqZpPheAWqPTIwAP+ZvKpjgRaZnms0FegOcBK8XlPEgOS4afl5/K1RBRW+ZwU/yrr75qcJ/VasXMmTPRqVMnpxQlrErP2udj45mNAIAB8VwlTkTapflc8qD9xIHzK8XZFCciLdN8NlTUNMV9xW6K783bC4AjEIlI+zSfC0DdpriH2JPPpjgRaZfms6HSc6bSyqPTmQdEpDanLFnT6/WYPXs2XnrpJWccTkxWC1Bdbvu3BwQVwP3EiUhcmsql8kLbrQc0xSVJYlOciISlmWyQJI9ZKS6/uNU9srvKlRAROU4zuSDzsKZ4UWURTpacBGCbKEJEJAJNZYMHrhTvEcGmOBGpy2lzXI8cOYLq6mqHPubXX3/F+PHjER8fD51Oh6+//rrO+6dNmwadTlfnv9GjR9d5TEFBAW666SYEBwcjNDQU06dPR2lpaWs/HcfJo9MBj2iK55Xn4XDhYQBAv9h+KldDROS4luSSSygrxUNVLcMZssuyca7yHLx0XkgOS1a7HCIih2kiG8wmwFpTg8B7xpZUleB48XEAXPFBROLSRC7ITDV7ivuHq1uHk8gNkPbB7RHiI27eEVHbo5lsUFaKi72nuFWyYm++bcIUzxuISG0Oj0+fPXt2nbclSUJWVha+++47TJ061aFjlZWVoVevXrjttttw3XXXNfqY0aNH4/3331fe9vHxqfP+m266CVlZWVi9ejXMZjNuvfVW3H777Vi+fLlDtbSa3BTXGQAvn+YfKwB5lXi38G4I8xV/dSMReS5n5pJLeND49IwC2yrxTqGd4GMQP+uIyHNpOhvk0ek6g9DbLsmTQ+ID4hHu6xkNHCLyXJrOBZmHrRSXm+JcJU5EWqXpbJAkoKrE9m+BzxkA4HjRcZSZy+Br8EWnUA2MpSeiNs3hpviff/5Z5229Xo+oqCi8+OKLuO222xw61pgxYzBmzJhmH+Pj44PY2NhG37dv3z6sWrUKW7duRZ8+fQAAr776KsaOHYsXXngB8fHxDtXTKnJT3DsQ0Onc97wusukMR6cTkRicmUsu4UlNcY5OJyJBaDoblNHpQUKfNygjELnag4gEoOlckHloU5w5QURapelsqK4AJKvt34KPT9+Tb8uD1IhUeOkdbkcRETmVw3+F1q1b54o6mrR+/XpER0cjLCwMV155JZ5++mlERNhOEDZu3IjQ0FClIQ4Aw4cPh16vx+bNm3Httdc2eszKykpUVlYqbxcX216YMpvNMJvNLSvUdA5GAJK3P6pbegw7yTW2uNYLkCRJWSneJ7qPU5/H1bW7iqh1A6xdDaLWDbi3dmc9hzNyqblcqH3bEnpTAQwALN4hsLrpZ8JV38e9Z23jrrqGdnXJzwh/d9yPdbsX67b/uVrL1dnQmjp1ZQXwAiD5BLv8vAFw3fdv91nbfuLdwroxE2oRtW5A3NpZt3u5u24t5QLQdDYAra/VUHoWegAW31DhzxskScLuvJqcCGVO1Mfa3U/UugHW3txxW8vV2dCq84YyW68BAMw6b0DgfsOunF0AXHfeAPD3RE2sX12sv+GxLkQnSZLUkifIzc3FgQMHAABdu3ZFdHR0Sw5zvhCdDl999RUmTJig3LdixQr4+/ujQ4cOOHLkCB577DEEBgZi48aNMBgMWLhwIT788EOlDll0dDQWLFiAmTNnNvpc8+fPx4IFCxrcv3z5cvj7+7eo/oiS/bjs8EKU+MTh59TnWnQMrThrOYtXSl6BAQb8M+Sf8NZ5q10SEXkgk8mEKVOmoKioCMHBwa0+XmtyyRW5IOt9/A0knNuIPRdNxpHo5qejaJkkSVhUvAhlUhnuCLwDCV4JapdERB6oLWRDVPFfGHTkBRT5JWJ9ytMtPo7a/lX0LxRJRbgt4DZ0NHZUuxwi8lBaygXAtecNAw8/h+iSvdje/g6cCr+0VcdSW6G1EC8UvwA99JgbMpevKxGRU7WFbPCvzMGIjAdRrffFd72WtugYWvFmyZs4ZTmFG/xvQE/vnmqXQ0Qeyt5scLgpXlxcjLvvvhuffvoprFbbCA+DwYBJkybh3//+N0JCQlpUcGNN8fqOHj2KTp06Yc2aNRg2bFiLm+KNXb2VkJCAvLy8Fgep7tBP8PpsCqxxF8Ny25oWHcNeZrMZq1evxogRI2A0Gi/8AQ76z8H/4Lltz6FPTB8sHebc0HV17a4iat0Aa1eDqHUD7q29uLgYkZGRrT6JcUYuNZULWVlZ2Lx5c6u+HoZPJ0F/dC2qr3oVUq/JLTqGo1zxfcw15WL016Nh0Bnw299+g6+Xr1OOWxt/d9yPdbsX674wEbKhNecMAKDL+BpeX/0d1sRBsNz8bYuPYy9XfP/yy/Mx4qsR0EGHX//2KwKMzt/nkL8v7idq7azbvdxdt5ZyAWg6G5YvX45rrrmmVV8Tr3eGQpezG9WTVkDqPLzFx3GEq76fa0+uxYO/PYiuYV3x6ZhPnXbc2kT9HQJYuxpErRtg7Y0RJRtadd6QvRvGd4dCCoxB9f17W3YMB7jqe2W2mHHZ55fBbDXj26u/RbvAdk47dp3n4e+Jali/ulj/efZmg8Pj02fMmIE///wT3333HQYOHAjANsb8/vvvxx133IEVK1a0vOoL6NixIyIjI3H48GEMGzYMsbGxyM3NrfOY6upqFBQUNLkPOWDbp9zHx6fB/UajseVfeGsFAEDvEwS9m374WlVvM7bmbAUADIof5LJfJFfV7mqi1g2wdjWIWjfgntqddXxn5FJzuSDftrjeykIAgFdgJODmnwdnfh8PFh0EAHQM7YggvyCnHLMp/N1xP9btXqy7+edwBldnQ6vqNJcCAPR+oW47bwCc+/07kG27KLlDSAeE+oc65ZhN4e+L+4laO+t2L3fVraVcAJrOBrnWVtVbXgAA8AqOFvq8AQD2n9sPwLafuDv+30HE3yGAtatB1LoB1l7/eM7g6mxwRq9B5xPk1u+7s79XB4sOwmw1I9QnFEmhSdDpdE47dmP4e6Ie1q8u1m9/NjjcFF+5ciV+/PFHXHbZZcp9o0aNwttvv43Ro0c7ejiHnDp1Cvn5+YiLiwMADBw4EIWFhdi+fTsuueQSAMDPP/8Mq9WK/v37u7SWBqrKbLfezl8l4U7V1mpszbY1xQfEDVC5GiKiC1Mzl+xSXmi79QtTtYzWyijIAACkhqeqXAkR0YVpOhsqa/af9Wn9qEe17MnfA8DW7CAiEoGmcwEAJAkw5dv+7R+hbi1OsCfPlhNpkWkqV0JE1DRNZ0Ol7UJaeAeqW0cr7c7bDQDoHtnd5Q1xIiJ7ONwUj4iIaHR0SEhICMLCHHvBv7S0FIcPH1bePnbsGHbu3Inw8HCEh4djwYIFmDhxImJjY3HkyBE89NBD6Ny5M0aNGgUA6NatG0aPHo0ZM2bgzTffhNlsxj333IMbb7wR8fHxjn5qraMEldhN8Yz8DJSYSxBkDEJqBBsfRKR9zswllyg/Z7v1C1W1jNbKyLc1xbtFdFO5EiKiC9N0NlTUNMV9xW2Kyy9usSlORKLQdC4AgNkEVNtWBYreFLdKVuzNt436ZU4QkZZpOhuqSmy3HtIU7xHBPCAibdA7+gFz587F7NmzkZ2drdyXnZ2NBx98EPPmzXPoWNu2bUN6ejrS09MBALNnz0Z6ejoef/xxGAwG/PXXX7j66qvRpUsXTJ8+HZdccgl+++23OuNIli1bhpSUFAwbNgxjx47FZZddhqVLnbsPtl08ZKX4pqxNAIB+cf1g0BtUroaI6MKcmUtOZ7UCFYW2fwu+Unxf/j4AQPeI7ipXQkR0YZrOBsFXikuShL15Nc0OvrhFRILQdC4AgMk2Oh0Gb+EbIMeLj6PUXApfgy86hXZSuxwioiZpOhvkXoOP2JkgnzdwcggRaYXDK8XfeOMNHD58GImJiUhMTAQAZGZmwsfHB2fPnsVbb72lPHbHjh3NHmvIkCGQJKnJ9//4448XrCc8PBzLly+3s3oXqpJXirt2n1VX25y1GQBHpxOROJyZS05XWQxIVtu/fUPd+9xOdNZ0FmfLz0Kv06NreFe1yyEiuiBNZ4PgK8VPl55GYWUhvPRezAQiEoamcwGoOzpd8PGycgOkW0Q3eOkdftmRiMhtNJ0NHjA+vcxchqNFRwHYxqcTEWmBw/93OmHCBBeU4QE8YKV4eXU5/sz9EwCb4kQkDk3nkrxK3MsPMPqqWkpryKPTO4Z0hJ+Xn8rVEBFdmKazQV4p7ttwVKMI5P3Eu4R1gbfBW+VqiIjso+lcADxqP3Fl/1hOmCIijdN0NsgL8AReKZ6RnwEJEuIC4hDpF6l2OUREAFrQFH/iiSdcUYf4PKAp/mfOnzBbzYgNiEX74PZql0NEZBdN55Kyn7jYo9OV/cTDuZ84EYlB09lQUWS7FXR8OkcgEpGINJ0LwPnx6f7h6tbhBMwJIhKFprOhUvw9xZX9xCO55RIRaUeL5xhVVVUhNzcXVqu1zv3yqJE2p0oOKnGb4vJ+4gPiBkAn+LguImp7NJlLntIUL7A1xVMjUlWuhIjIMZrMBsHHp3MFIBGJTJO5AHjMSnGzxYz9BfsBsAlCROLQZDYoK8XF3ap1T55twhTzgIi0xOGm+MGDBzF9+nT88ccfde6XJAk6nQ4Wi8VpxQlFWSku7tVbtZviRESi0HQueUpTPJ9NcSISi6azoVJeKS7e+HSL1aJkAl/cIiKRaDoXAI9pih8sPIgqaxWCvYOREJSgdjlERM3SdDZ4wJ7iclOck0OISEscborfeuut8PLywsqVKxEXF8cVxTK5KS7oPh/nKs5hX8E+AED/uP4qV0NEZD9N51J5oe3WL1TNKlolrzwPuaZc6KBDSniK2uUQEdlF09kg8ErxY0XHUF5dDj8vP3QM6ah2OUREdtN0LgAe0xSXR6f3iOyhva8xEVE9ms4GeaW4oFNp88rzkFWWBR10XGBBRJricFN8586d2L59O1JS+MJ4HYLvKb45ezMAIDksGZF+kSpXQ0RkP03nkrJSPFTVMlpDXhGYFJIEf6O/ytUQEdlHs9kgSef3BxRwT/E9+bbVHqkRqTDoDSpXQ0RkP83mgsxDmuLcYoOIRKLpbBB8fLp8kVTHkI4IMIrZLyEiz6R39ANSU1ORl5fnilrEJr+4JehIk01nODqdiMSk6VzygPHp+/JtU0R4ZS8RiUSz2VBVBkg1Yxh9xRufruwLGMHR6UQkFs3mgsxDmuIclUtEItF0Ngg+Pl25SCqSF0kRkbbY1RQvLi5W/nvuuefw0EMPYf369cjPz6/zvuLiYlfXq12CrxTnfuJEJBJhckkZny5uU1zZTzycTXEi0jYhsqGiZj9xvRdg9FOvjhZSmuLcT5yIBCBELshMBbZb/3B162gFk9mEo0VHATAniEi7hMkGZaW4mE1xecIUL5IiIq2xa3x6aGhonT01JEnCsGHD6jxGkiTodDpYLBbnVigKgZviJ0tO4nTpaXjpvNAnpo/a5RARXZAwuVRRaLv1DVWvhlbKKKhpinOlOBFpnBDZUFnz4ppPMKClPQvtUGWpwoFzBwBwxQcRiUGIXJDJK8X9xG2KZ+RnwCpZEeMfgyj/KLXLISJqlDDZIPBKcUmSeDEtEWmWXU3xdevWuboOsVktQHW57d/e4u3zIa8S7xnVk/vFEpEQhMklwcenF1QUILssGwCQEq7BPbaIiGoRIhsqaprivuLtJ37w3EFUW6sR6hOKdoHt1C6HiOiChMgFAJAkjxifvjfftn8sGyBEpGXCZIPAe4qfKj2FosoiGPVGdAnronY5RER12NUUv+KKK1xdh9jkVeKAkCvFlf3E4zk6nYjEIEwuCd4Ul/cTTwpOQqCAVycTUdsiRDbUXikumNr7AuoEW+VORG2TELkAAJUlgNVs+7fATXE5J9gUJyItEyIbJOl8U1zA12LkVeIp4SnwNnirXA0RUV12NcVr++uvvxq9X6fTwdfXF4mJifDx8Wl1YUKRQ0pnALzE+tytkhWbszcDAAbGDVS5GiIix2k6lwRvisv7iXeL6KZyJUREjtFsNsh7ivuGuP+5W0kZgRjBZgcRiUezuQCcXyXu5Qd4izu9j6NyiUg0ms0GswmQrLZ/C7gAT86D7hHccomItMfhpvjFF1/c7MoAo9GISZMm4a233oKvr2+rihOGsp94oHB7A+4v2I+iyiIEGAO4NyARCUnTuVReaLv1C3Xv8zqJ3BTniQwRiUaz2SCvFBewKb43j2NxiUhcms0FACgvsN0KvEq8oKIAp0tPAwBSI1JVroaIyD6azQZlKq1O6KZ4WlSaypUQETWkd/QDvvrqKyQnJ2Pp0qXYuXMndu7ciaVLl6Jr165Yvnw53n33Xfz888+YO3euK+rVJmWciXghJe8n3jemL4x6o8rVEBE5TrO5ZC4Hqstt/xZ8pThf2CIi0Wg2G+SV4oKNTy8zl+Fo0VEAbIoTkZg0mwsAYJKb4uHuf24nkS+cSgpOQrC3WBlHRG2XZrOhssR2K+ACvGprtfJaEidMEZEWObxS/JlnnsErr7yCUaNGKfelpaWhXbt2mDdvHrZs2YKAgAD84x//wAsvvODUYjVLvnrLR7w9PuT9xPvH9Ve5EiKiltFsLsmrxHUG4ZofAFBYUYgzZWcA2PaBIiISiWazoUJeKS5WLmTkZ0CChNiAWET6RapdDhGRwzSbC8D58ekCrxTfk8/R6UQkHs1mg7wAT8Bew5HCI6iwVCDAGICkkCS1yyEiasDhleK7d+9G+/btG9zfvn177N69G4Bt9EhWVlbrqxOFMj5drJXilZZK7MjdAQAYEDdA5WqIiFpGs7mk7CceKtyVvQCQUWC7sjcxKBFB3kEqV0NE5BjNZoM8Pl2wi6W4nzgRiU6zuQB4RlOc+4kTkYA0mw2V8lRa8ZritfcT1+scbj0REbmcw3+ZUlJSsGjRIlRVVSn3mc1mLFq0CCkptpVkp0+fRkxMjPOq1LraI00EsjN3JyotlYj0i0Sn0E5ql0NE1CKazaWKQtutb6h7n9dJODqdiESm3WwQc6W48uJWZHeVKyEiahnN5gIgfFNckiQ2xYlISJrNBoFXinNyCBFpncPj0//973/j6quvRrt27dCzZ08AtquqLBYLVq5cCQA4evQo7rrrLudWqmWCrhSX9xMfEDcAOgFXMRIRARrOJWWlOPcTJyJyN81mg6Arxffm2/aK5YtbRCQqzeYCIHxTPKssCwUVBfDSeXHbJSISimazQdAFeMD5i2nTItNUroSIqHEON8UHDRqEY8eOYdmyZTh48CAA4G9/+xumTJmCoCDbeNWbb77ZuVVqndIUFyuo5P3EOTqdiESm2VzykKZ4t4huKldCROQ4zWaDslI8xP3P3UIFFQU4XXoaAC+UIiJxaTYXgFpN8XB1nr+V5AZIclgyfAw+KldDRGQ/zWZDlZjj0yuqK3Do3CEAvJiWiLTL4aY4AAQFBeHOO+90di3iEnCleFFlkbLio39cf5WrISJqHU3mksBN8aLKIqUB0i2cTXEiEpMms6GiyHYr0Ph0udmRFJyEYG9x6iYiqk+TuQAApgLbraArxTk6nYhEpslskHsNgo1P31+wHxbJgki/SMT4t6GtdYlIKA43xT/66KNm33/LLbe0uBhhCXj11tbsrZAgoUNIB8QGxKpdDhFRi2k2l8oLbbd+oeo8fyvsK9gHAGgX2A4hPuKsZiQikmk2G5Tx6eL8bd2bx9HpRCQ+zeYCIPz4dHn/WI7KJSLRaDYbKsXrNQDA7rzdAIAeET24VSsRaZbDTfH777+/zttmsxkmkwne3t7w9/dv401xcVaK195PnIhIZJrNJYFXinM/cSISnWazQRmfLs6Ka7nZwaY4EYlMs7kACN0Ut1gtyrlD98juKldDROQYzWZDVc2e4oKtFOfkECISgd7RDzh37lyd/0pLS3HgwAFcdtll+PTTT11Ro/YJOD6dTXEi8hSazSUPaIpzP3EiEpUms8FqrbVSXIymuCRJyotb3SPY7CAicWkyFwBbNijj08XbU/x48XGUmcvg5+WHjiEd1S6HiMghms0GZaV4kHo1tACb4kQkAoeb4o1JTk7GokWLGlxd1WbIK8UFuXrrTOkZnCg+Ab1Oj76xfdUuh4jI6TSRSxWFtlsBm+L78m3j07lSnIg8ierZUFUKQLL9W5CV4tll2SioKICXzgsp4Slql0NE5FSq5wIAVBYBksX2bz/xmuJyA6RbeDd46R0eRklEpDmayAbBeg0AUFRZhMySTABsihORtjmlKQ4AXl5eOHPmjLMOJxZlpbgYQbU5azMAW0AFCXbFGRGRvVTPJXmluG+oejW0QElViXIikxrOpjgReRZVs0FeJW7wBrx81anBQfK+gMlhyfAVpGYiIkeofs4grxL3DgSM4v2dVfaPZQOEiDyI6tkg4J7ie/P2AgASghIQ4hOicjVERE1z+DLOb7/9ts7bkiQhKysLr732Gi699FKnFSYUwcanc3Q6EXkSzeaSoOPT5VXiFwVehFDBGvpERDJNZkNFke3WJxjQ6dSpwUHyfuLcJ5aIRKfJXACEHp0OnG+CpEWmqVwJEZHjNJsN8kpxQXoNwPnzBl4kRURa53BTfMKECXXe1ul0iIqKwpVXXokXX3zRWXWJpVKcoJIkiU1xIvIoms0lQZviyn7i4dxPnIjEpclsqKhZKS7I6HTgfLOjRwRf3CIisWkyFwDAlG+79Y9Qr4YWqrJUYf+5/QB48RQRiUmz2aCMTxdnwqs8OYQXSRGR1jncFLdara6oQ2xV4ow0OVR4CAUVBfDz8kOvqF5ql0NE1GqazCWr5XzzQ7SmeIGtKc79xIlIZJrMBnl8uo8YTXGrZMXe/JqmOFd8EJHgNJkLgNBN8YPnDqLaWo1Qn1C0C2yndjlERA7TbDYINj5dkiTsyeNKcSISQ6v2FJckCZIkOasWcQm0p/imM7ZV4r1jesPb4K1yNUREzqWZXKooAlBTh1+ompU4TB6fzqY4EXkK7WSDWCvFjxcdR5m5DL4GX3QK7aR2OURETqOZXACEborLDZDukd2hE2RbECKipmgqG5SV4trvNQBAjikHeeV5MOgMSAlPUbscIqJmtagp/tFHHyEtLQ1+fn7w8/NDz5498fHHHzt8nF9//RXjx49HfHw8dDodvv76a+V9ZrMZDz/8MNLS0hAQEID4+HjccsstOHPmTJ1jJCUlQafT1flv0aJFLfm0Wk6gPcXl0ekD4waqXAkRkfM4K5ecRh6d7h0IGIzq1eGg0qpSHC8+DgDoFsHx6UQkNs1lQ2XNnuK+IerV4AB5X8BuEd3gpXd4wBgRkeZoLhcAoZvi8qhcbrFBRCLTZDYItlJcvkiqc2hn+Hn5qVwNEVHzHH51Y/HixZg3bx7uueceXHrppQCA33//HXfeeSfy8vIwa9Ysu49VVlaGXr164bbbbsN1111X530mkwk7duzAvHnz0KtXL5w7dw73338/rr76amzbtq3OY5988knMmDFDeTsoyI37bVgtQHW57d8aDyqzxYxtObavHfcTJyJP4cxccpryQtutYKPT9xXYVonHBsQi3Ddc5WqIiFpOk9kgrxT3EaQpLq8AjOA+sUQkPk3mAlCrKS7e/3vvzbNtscH9Y4lIVJrMBkkSbk9xjk4nIpE43BR/9dVX8cYbb+CWW25R7rv66qvRvXt3zJ8/36GwGDNmDMaMGdPo+0JCQrB69eo697322mvo168fMjMzkZiYqNwfFBSE2NhYBz8TJ5FDCtD8SvG/8v5CeXU5wn3DkRyWrHY5RERO4cxccpqKmpXioo5OD+fodCISmzazQV4pLsb4dLnZwRe3iMgTaDIXAMBUYLsVbKV4mbkMR4uOArCNTyciEpEms6GqDMp2fBpfgCdjU5yIROJwUzwrKwuDBg1qcP+gQYOQlZXllKKaUlRUBJ1Oh9DQ0Dr3L1q0CE899RQSExMxZcoUzJo1C15eTX9qlZWVqKysVN4uLrat2jCbzTCbzY4VVVYEIwBJZ0C1pAcc/fgWkGt0tNYNpzYAAPrG9IWl2gILLE6v7UJaWrvaRK0bYO1qELVuwL21O+s5nJFLzeVC7Vu7WC3QH9sAAwCrJMFSWQHoDfZ/vBO09Pson8ikhKWo8vPL3x33Y93uxbrtf67WcnU2tKROfXkhDAAsxgBY3fgz0JLvn9lixv6C/QCAlBBmgiNErRsQt3bW7V7urltLuQA0nQ1Ay2o1lOVBD6DaJxSSQH9r/8r5CxIkxPrHIsQrhDnhINbufqLWDbD25o7bWq7Ohpb1Gs7V9Br0qIaXW3oNQMu/V1bJir35totpu4V2U+3nlL8n6mH96mL9DY91ITpJkiRHDtyjRw9MmTIFjz32WJ37n376afznP//B7t27HTnc+UJ0Onz11VeYMGFCo++vqKjApZdeipSUFCxbtky5f/HixejduzfCw8Pxxx9/4NFHH8Wtt96KxYsXN/lc8+fPx4IFCxrcv3z5cvj7+ztUd2BFFobtexhVBn/80PNNhz7W3ZaWLEWmJRMT/Cagj08ftcshojbOZDJhypQpKCoqQnBwy1fNOSOXnJULcYVbkXZqGfzMBcp95cZw7G53E7JC+9p9HLW8UvwKzlrP4paAW9DF2EXtcoioDfLEbAAASFYMOvwcokr34WjkMOxudzOg0zt2DDc6XX0ab5S+AV+dL/4Z/E/odDq1SyKiNkpLuQA4PxtG7J0Nf3MBdsffhKPRIzSdDbX9VvEbfqz4Ed2N3TE5YLLa5RBRG+PJ2RBQkY3h+x6CWe+H73u95dDHquGs5SxeKXkFRhgxN2QuDDr3LgohIpLZmw0ON8W/+OILTJo0CcOHD1f22tiwYQPWrl2Lzz77DNdee22LCm6uKW42mzFx4kScOnUK69evb/YTeu+993DHHXegtLQUPj4+jT6msau3EhISkJeX53iQnt4O4wejIPmGwnL9B5ASBrp8RaDZbMbq1asxYsQIGI1Guz6m1FyKof8dCotkwcprViI+IN6lNTalJbVrgah1A6xdDaLWDbi39uLiYkRGRrb6JMYZudRULmRlZWHz5s12fT10+1fC8MWtACTUbh3Ib1kmvg8p5SqHP7+WaMn3scxchsGfD4YECauvXY0IP/ePcOTvjvuxbvdi3RcmQjY4es6g278Shp8eg67kjHKfFBQPy8iFbsmFlnz//nvov1i4dSEGxA7A61e+7uIKG8ffF/cTtXbW7V7urltLuQA0nQ3Lly/HNddcY/fXRO1skLX0+/nQbw9hzck1uO/i+zAtdZrrCmyGqL9DAGtXg6h1A6y9MaJkQ8t6DTtg/GCkW3sNQMu/VyuPrcTjGx/HxVEX470R77mwwubx90Q9rF9drP88e7PB4fHpEydOxJYtW7B48WJ8/fXXAIBu3bphy5YtSE9Pb3HBTTGbzbjhhhtw4sQJ/PzzzxcMkv79+6O6uhrHjx9H165dG32Mj49Pow1zo9Ho2Bc+41tgpW1vEV1FIbw+mQAExwOjnwNSr7b/OC3kSL27snfBIlmQGJSI9qHtXVzZhTn8tdYIUesGWLsaRK0bcE/tzjq+M3KpuVyQb5ut12oBVj8GZd+nWnSQAOjgtfqfQPer3TpK3ZHv49GCo5AgIdo/GrHBsS6urHn83XE/1u1erLv553AGV2eD3XVmfAvUXDBVm64kC15f3Arc8JFbzhsAx+red24fACAtKk31n1X+vrifqLWzbvdyV91aygWg6WwAHPiaaCgbZI5+PzMKMgAAvaJ7qf7zK+rvEMDa1SBq3QBrr388Z3B1NojWawAcr1lL5w0Af0/UxPrVxfrtzwaHmuJmsxl33HEH5s2bh08++aRFhTn6fDfccAMOHTqEdevWISLiwqvWdu7cCb1ej+joaNcWl/Et8NktaNAAKc6y3a/CSUxzNmVtAgAMiBugciVERM7j7lxq0ok/gOIzzTxAAopP2x7X4XK3leWIjHzbC1upEakqV0JE1DqayQarBVj1MBq7YAo1F0xh1SNAyji3XjBljz35ewAA3SO7q1wJEVHraSYXAKGzQZZfno8zZWegg47nDkQkLE1lAyBcr0G2J8923tAjoofKlRAR2cehzYqMRiO++OILpz15aWkpdu7ciZ07dwIAjh07hp07dyIzMxNmsxnXX389tm3bhmXLlsFisSA7OxvZ2dmoqqoCAGzcuBEvv/wydu3ahaNHj2LZsmWYNWsW/u///g9hYWFOq7OBC57EwHYSY7W4rgYHbTpT0xSPZ1OciDyHs3OpxUpznPs4FbApTkSeQjPZ4MgFUxpiMptwpPAIAL64RUSeQTO5AAibDbXtzd8LAOgQ0gGB3oEqV0NE1DKaygYBew0AYLaYsb9gPwAgLTJN5WqIiOzjUFMcACZMmKCME2mtbdu2IT09XRlHMnv2bKSnp+Pxxx/H6dOn8e233+LUqVO4+OKLERcXp/z3xx+2kwMfHx+sWLECV1xxBbp3745nnnkGs2bNwtKlS51SX5MEO4nJNeXiSNER6KBDv9h+apdDRORUzsylFguMce7jVKA0a7r5sAAAXVRJREFUxcPZFCci8WkiGwS9YGpfwT5YJSui/aIRE6Dd3CIicoQmcgEQNhtq2523GwDQI5IXThGR2DSTDYL1GmQHzx2E2WpGiE8I2gW1U7scIiK7OLyneHJyMp588kls2LABl1xyCQICAuq8/7777rP7WEOGDIEkNXYFlE1z7wOA3r17Y9OmTXY/n9MIdhKzOWszANvqvxCfEJWrISJyLmfmUou1H2Tb56k4C41f2auzvb/9INfX0gImswnHio8B4EpxIvIMmsgGQS+YkkcgcnQ6EXkSTeQCIGw21KaMymVTnIgEp5lsEKzXIKs9Ol2n06lcDRGRfRxuir/77rsIDQ3F9u3bsX379jrv0+l07gsLNQl2EiPvJ94/rr/KlRAROZ8mcklvAEY/V7P/kw51G+M1JwajF2l2X8CD5w7CKlkR5ReFKP8otcshImo1TWSDoBdM7c2zjcVls4OIPIkmcgEQNhtkkiSdzwlusUFEgtNMNgjWa5DJk0N4MS0RicThpvixY8dcUYdYBDqJkSTp/H7icdxPnIg8j2ZyKfVq4IaPbPtA1R57FRxva4inXq1ebRcg7wvIVeJE5Ck0kQ2CXjC1J//8ig8iIk+hiVwAhM0G2enS0zhXeQ5eei90De+qdjlERK2imWwQqNdQm/xaEvcTJyKROLynOOH8SQwA5aRFoa2TmGNFx5BbngtvvTfSo9PVLoeIyLOlXg08sAeYuhKY+K7t9oHdmm6IA7X2E2dTnIjIueQLpoLj6t4fHG+7X2P5UFhRiJMlJwFwxQcRkcsIlg21yRdOdQ3rCm+Dt8rVEBF5CIF6DbIycxmOFB4BwAlTRCQWh1eKWywWfPDBB1i7di1yc3NhtVrrvP/nn392WnGaJsiKwI1ZGwEA6THp8PXyVbkaIiLn01wu6Q1Ah8vd+5yttK9gHwCgW3g3lSshInIOTWVD6tVAyjjgxB+2fQADY2yrPDT0opZMXu2RGJSIEJ8QlashInIeTeUCIFQ21LbnLPcTJyLPoalsEKTXIMvIz4AECbEBsYj0i1S7HCIiuzncFL///vvxwQcfYNy4cejRowd0uvpXL7UhApzEyPuJc3Q6EXkq5lLrVFRX4GjhUQBcKU5EnkNz2SDIBVN78mzNDq4SJyJPo7lcAITJhtqULTbYFCciD6C5bBCg1yCTzxs4Op2IRONwU3zFihX47LPPMHbsWFfUIx4Nn8RUW6uxNXsrAGBg3ECVqyEicg3mUuscOHcAFsmCCN8IRPtHq10OEZFTMBtahvuJE5GnYi60nsVqUbZdYk4QkSfQZDZouNdQ2+683QCA7hG8mJaIxOLwnuLe3t7o3LmzK2ohJ9uTtwdl5jIEewcjJTxF7XKIiFyCudQ6tfcTV/2qaCIiJ2E2OE6SJGXFB1cAEpGnYS603tGioyivLoe/lz86hHRQuxwiolZjNrTc3jzbtktcKU5EonG4Kf6Pf/wDr7zyCiRJckU95ETy6PT+cf1h0OCYFSIiZ2AutY7cFO8Wwf3EiURgsUrYeCQf3+w8jY1H8mGx8m9fY5gNjssx5SCvPA8GnYEX1BKRx2EutJ584VRqRCpfYyIij8BsaJn88nycKTsDHXTcho+IhGPX+PTrrruuzts///wzfvjhB3Tv3h1Go7HO+7788kvnVUetwv3EicRisUrYcqwAuSUViA7yRb8O4TDouXK3Mcwl59mXvw8A9xMnEsGqPVlY8L8MZBVVKPfFhfjiifGpGN0jTsXKtIHZ0Dryao9OoZ3gb/RXuRoiag7PG+zDXHAuThMhEhMzoy5mQ+vtzbedN3QI6YBA70CVqyEicoxdTfGQkJA6b1977bUuKYacx2Q2YdfZXQDYFCcSARsdjmEuOUelpRJHCo8A4D5QRFq3ak8WZn6yA/XXMGQXVWDmJzvwxv/1bvN5wWxoHWU/cTY7iDSN5w32Yy44F3OCSDzMjIaYDa0n7yfOPCDyDG3t4im7muLvv/8+MjMz0a5dO+j1Dk9cJxXsyN2Bams14gPikRCUoHY5RNQMNjocx1xyjoMFB1EtVSPcNxwx/jFql0NETbBYJSz4X0aDnAAACYAOwIL/ZWBEaqyqJy6SxQLTtu2oPnsWXlFR8O9zCXQG941XZTa0jrwCkBdJEWkXzxscw1xwnkpLJQ4WHATAJgiRKJgZjWM2tB4nhxB5jrZ48ZTdf/k7dOiAvLw8V9ZCTrTpTM3o9PgB0Ok896oOItFdqNEB2Bod3DO2IeZS6yn7iYd3Y1YQadiWYwV1TlDqkwBkFVVgy7EC9xVVT/FPP+HwsOHInDoVZ+bMQebUqTg8bDiKf/rJrXUwG1rGKlmV8elpkWkqV0NEjeF5Q8swF5zjQMEB5WLa+IB4tcshogtgZjSP2dBykiQpTXGeNxCJTb54qv7rTfLFU6v2ZLmtFsliQdnmLSha+R3KNm+BZLG47LnsbopLUtsMSVFxP3EiMYjQ6NAq5lLr7SvgfuJEIsgtaTonWvI4Zyv+6Secvv8BVGdn17m/OicHp+9/wK2NcWZDy2QWZ6LEXAIfgw86h3VWuxwiaoSo5w3ufIGr0ednLjhF7WkivJiWSPu0nBlq5wLAbGiNU6WnUFhZCC+9F7qEdVG7HCJqIS1dPBW4Zw+OjxrttkUWdo1Pl/F/fMWQX56PA+cOAAD6xfZTuRoiao7WGx1NUXtEroy51DrySnE2xYm0LTrI16mPcybJYkHOwmeBxl5YkiRAp0POwmcRNGyY23KC2eA4eZ/YruFdYdQbVa6GiBoj4nlD8U8/IWfhs3UumvKKjUXMY48ieORIt9XBXGg9rgokEotWM0MruQAwG1pKni6VEpYCb4O3ytUQUUs5cvHUwE4RLqujdM0axH38CepfHiUvssArLzs9Hxxqis+bNw/+/v7NPmbx4sWtKohax2K14NP9nwIAEgITEOoTqm5BRNQsLTc6mqKlkxjmUstYrBZsztqsXEDVNbyryhURUXP6dQhHXIgvsosqGr2KVwcgNsQX/TqEu7s02wVS9VaI1yFJqM7OhmnbdgT0d8/FmswG+1msFuzI3YGVR1YCAFLDeZEUkVaJdt4gTxGpf9GUK1/gagpzofXki6e6R3ZXuRIisocWM0NLuQAwG1rCYrVg9YnVAIAo/yhYrBYY9O5fIENEraeFi6ckiwVnFz3XxDtdt8jCoab47t274e3d9BVAvMJKXWtOrMGiLYuQY8oBAJwsPYlRX4zCI/0ewfD2w1Wujogao+VGR2O0dhLDXHJc/awAgFtX3cqsINIwg16HJ8anYuYnOxq8T/4r98T4VBj07v+bV332rFMf5wzMBvs0lgffH/se/eP6Mw+INEik8watTRFhLrScxWrB76d/x7GiYwCAbuHdVK6IiOyhtczQWi4AzAZH1T93WHdyHfsORALTwsVTpm3bYcnJQZN/bV20yMKhpvhXX32F6Ohopz05Oc+aE2swe/1sSPX+VyfXlIvZ62dj8ZDFDCgiDard6NABdX6D1W501KfFkxjmkmOYFUTiGt0jDm/8X2/c++mfMFvO/w7HhvjiifGpGN0jTpW6vKKinPo4Z2A2XFhTeVBcVcw8INIokc4btDZFhLnQMo1dPDX5u8lsgBAJQGsX1Zbv2KGpXACYDY7ga0lEnkcLF0+ptchCb+8DeXWUdlmsFizasqhBMAFQ7ntuy3OwWOtP5iciLZAbHTEhda+8ig3xxRv/11u1Rkd9jry45Q7MJccwK4jEN7pHHGKCfQAADwxPxqczBuD3h69UNSf8+1wCr9hYoKm/yTodvGJj4d/nErfUw2y4sObyQMY8INIm5bwhWNvnDVqaIsJcaBm5AVK7IQ6cb4CsObFGpcqIyF5yZnh71X35X43MsGgoFwBmgyP4WhKRZ5IvngLQYKW2uy6eUmuRhd1NcamxlYGkCTtydzQ4UalNgoRsUzZ25Da8OpCItGF0jzj8cP/lytsf3tZX9UZHfVp6cQtgLjmKWUEkPkmSkFdaBQC4Lr0dBnaKUH1FoM5gQMxjjzbxTlttMY896rYJIsyGC2MeEIltdI84fHXXIAC2F6yW/b2/5s4btDRFhLngODZAiDzH6B5xaB/uBwCYeUUn1S6qNWgoFwBmgyN47kDkueSLpyIC624l4a6Lp/z7XAJDTEzTl+u7aJGF3U3x999/HyEhIU59cnKOsyb7GlD2Po6I1FFaUQ0A8DXqcUWXaNUbHfVp6cUtgLnkKGYFkfhKK6tRYbYCACKDmt7/zt2CR47ERa+8DNRrfHvFxOCiV15G8MiRbquF2XBhzAMi8eWX2S6QigzywaWdIzV33qClKSLMBcexAULkWbKLKwEAEy+5SLWLav1699ZMLgDMBkfw3IHIs43uEYcXru8FAIgL9nXrxVM6gwFRjzzcxDtdt8jC7qb41KlT4ePj49QnJ+eI8revAWXv44hIHedMthe3Qv200+ioTUsvbgHMJUcxK4jEd7bE9oJWoI8X/L29VK6mrqAhQwCrrWEf8/g8JH74ITqvXePWhjjAbLAH84BIfDnFFQCgbKmhNVqaIsJccBwbIESeo7SyGiU1CzDiQvxUq0NLuQAwGxzBcwciz5dbanutqUtskNsvngocPhxZN/8f9IGBde535SILu5vipF29o3sjxj8GugbT/2100CHWPxa9o3u7uTIickShyQwACPU3qlxJ47R2EkOOYVYQiU9uikcFae8FnKrTpwFJgs7fH2GTJyOgfz/mgUYxD4jEl1uTB9FBvhd4pHqCR47ERS++0OB+NaaIkGPYACHyHFmF5QCAYF8vBPioe1GtPF1KHxRU537mgrbx3IHI82UX2S64jQ1W59yitEcPBF13LQAg4PLLXb7Igk1xD2DQG/BIv0cAoEFAyW8/3O9hGPR8YZJIy+SV4mH+2lwpDtQakauvGx88idE+ZgWR+OT9xCMDtZcT5sxMAIB3YiJ0TU0UIU1gHhCJT+srxWW+PXva/uHlhfgX/qXaFBFyDBsgRJ7jTE2jQ81V4rUFjxyJkGtrGh9XXMFcEADPHYg8X3bNuUVsiHoX3FrPnQMA+Pfr6/JFFmyKe4jh7Ydj8ZDFiPaPrnN/jH8MFg9ZjOHth6tUGRHZq6hc2yvFZf59+yojcuOeeZonMQJhVhCJ7WyJ7URFkyvFT5xvipP2MQ+IxJarTA7R7kpxADCfOQMA8L7oIoRcdRWniAiCDRAizyGvFI8L1U5emM+cBgAEXjGYuSAInjsQeTZlpbiKTfHqvHwAgFek6ycROTw3pWPHjti6dSsiIiLq3F9YWIjevXvj6NGjTiuOHDO8/XAMTRiKHbk7cNZ0FlH+Uegd3ZsnKkSCOFcmN8W1twKwtsqDhwAAxsREhE6cqHI1zCVHMSuIxHW2Zp+nqEANNsXlleLttdEUZzZcGPOASFy5xfL4dO3lQW3VWVkAAK/4OJUrYS44Sm6ALNqyCDmmHOX+GP8YPNzvYTZAiAShtZXiAGA+eQoA4N2uncqVMBscwXMHIs+lhaa4JV9uikdc4JGt53BT/Pjx47BYLA3ur6ysxOnTp51SFLWcQW9A39i+apdBRC0gj0/X+krxygMHAAA+XZJVrsSGueQ4ZgWRmDS9p3jmCQC2C6a0gNlgH+YBkZhyS+Tx6dpZ+dcYc01T3BirflOcueA4NkCIxCevFI9XsdFRmyRJMJ88CQAwtktQuRpmg6N47kDkmeTx6XGaaIpHuvy57G6Kf/vtt8q/f/zxR4SEhChvWywWrF27FklJSU4tjoioLZHHp4dpvClecdDWFPft0lXVOphLRNTWaLopfsLWFPdObK9qHcwGImoLRFkpbj5T0xSPU68pzlxoHTZAiMSmNDpCtbFS3HLuHKwmE6DTwXhRvGp1MBuIiGwqzBYUlNkW6sWqdcGt1QpLzZ7ihggNrRSfMGECAECn02Hq1Kl13mc0GpGUlIQXX3zRqcUREbUlykpxP42PTz9wEADg01XdpjhziYjamrxSW05Eamx8umQ2w3y6Zt9YlcenMxuIyNNZrJKynYYwK8VVHJ/OXCCituyMxlaKy6vEvWJioPdR75yG2UBEZCNfbOtr1CPET52FeoayMsBqBXQ6eIWHu/z57G6KW61WAECHDh2wdetWRLphGTsRUVtSaJL3FNfuSnHJYkHlIdue4r5du6haC3OJiNoara4UN2dlAdXV0Pn4wCs6WtVamA1E5OkKyqpgsUrQ6YDIQG1fTFudXbOnuIorxZkLRNRWSZKErCJtrRSv0sh+4swGIiKbrCLbxVOxwb7Q6XSq1GAoKbHdhoVB5+Xwjt8O0zv6AceOHXNaUPz6668YP3484uPjodPp8PXXX9d5vyRJePzxxxEXFwc/Pz8MHz4ch2qaMbKCggLcdNNNCA4ORmhoKKZPn47S0lKn1EdE5E6FNSvFwwK0++JWVWYmpIoK6Pz8YExQf/8nwLm5RESkVVarhLxSbTbFq05kAgC8ExOg0zt8euESzAYi8lTyfuIRAd7wMmjjb25jJElSpogY49QbkStjLhBRW1NcXg1TlW2/bNVG4tZjPmVrivP1JCIibZC32YhVcaKIV00/18sNo9MBB1aKy5588slm3//444/bfayysjL06tULt912G6677roG73/++eexZMkSfPjhh+jQoQPmzZuHUaNGISMjA76+tm/STTfdhKysLKxevRpmsxm33norbr/9dixfvtyxT4yISGWFNXuKh6o0qsQeyuj05GToDAaVq7FxZi4REWlVYbkZ1VYJABARoLGmeKZtP3GjyvuJ18ZsICJPdX4/cW00OJpiLSmx7RsLwBgXq3I1zAUianvO1Kz+C/M3ws9bG6/fVJ2yjU83Jqi7UlzGbCCiti67ZqKImhdPySvFvaLcc5GSw03xr776qs7bZrMZx44dg5eXFzp16uRQWIwZMwZjxoxp9H2SJOHll1/G3Llzcc011wAAPvroI8TExODrr7/GjTfeiH379mHVqlXYunUr+vTpAwB49dVXMXbsWLzwwguIj1f/amQiIntYrBKK5Ka4v3ZXilcePABA/dHptTkzl4iItEpeJR7mb4S3l7ZWBpoz5ZXi6u4nXhuzgYg8lbxSPDpYWxdI1SfvJ24IC4PeT/2xvcwFImpr5JG4cSHq/w2WmeXx6RpZKc5sIKK2Tt5mI1bFrJBXihsiNNoU//PPPxvcV1xcjGnTpuHaa691SlGAbXxJdnY2hg8frtwXEhKC/v37Y+PGjbjxxhuxceNGhIaGKg1xABg+fDj0ej02b97cZD2VlZWorKysUz9gCz6z2ey0z8FV5BpFqLU+UWsXtW6AtauhJXWfM1VBsi0AhL+Xep/zhWov37cfAODVqXOra3TW5+iMXGouF2rfioJ1u5+otbNu92pN3VnnygDYxuW6+/O+UN0Vx48DAAztLmoz2SDSz15b/H1Rk6h1A+LW3tbqPnPOtvo6KtD9eQDYX3fFSdtqQK/Y2FbVqaVcAJrOBkC8n0FA3N8fgLWrRdTaRa0baHntJ/Nt5w+xwT6aeY2pqiYbdLFxbSIbRDpvEPl3BBC7fpFrB1i/2lpbf1ah7dwiOtCo2rmFocTWFNeHh7klG3SSJLdhWmf37t0YP348jte8KOYonU6Hr776ChMmTAAA/PHHH7j00ktx5swZxMXFKY+74YYboNPp8J///AcLFy7Ehx9+iAMHDtQ5VnR0NBYsWICZM2c2+lzz58/HggULGty/fPly+Pv7t6h+IqLWyC0HntnpBR+DhOf7WdQup0lJzz0P74ICnLzjdpR37NiqY5lMJkyZMgVFRUUIDg52UoXnOZJLzAUi0rptZ3X4+LABycFW3NPdqnY5dbR/4UX4nD2LU3+fDlNycquOxWwgImreZ0f12JCjx8iLrBiXqK08qC1k40bEfP0NSrun4swtt7T4OFrKBYDZQETi+C5Tj59O63FZjBV/66iBvKiuRvLcedBJEo7M/ScsQUEtPhSzgYjIORbvNuBEqQ63dbGgV4RTWsUOi12xAsF/7sTZsWNw7oorWnwce7PB4ZXiTSkqKkJRUZGzDudSjz76KGbPnq28XVxcjISEBIwcOdIlQepsZrMZq1evxogRI2A0anfv4caIWruodQOsXQ0tqfvPk4XAzi2IDPLD2LGDXVtgM5qr3VpaiqMPPwIAGHzzzTCEhLTquWqvqnAFR3KpqVwYOnQoNm/e3CZ+BrVA1LoBcWtn3e7VmrqzNhwHDh9ESlI8xo7t6ZoCm9Bc3ZLFgiNz5wEALr3+ehgvuqhVzyVCNohyziBri78vahK1bkDc2tta3SuX7wRycjEoPRVj+7t/2wp76847cBCFAC66+GJcPHZsi59PS7kANJ0NAIT7GQTE/f0BWLtaRK1d1LqBlte+/ovdwOksDOjZFWMHd3BhhU2rXbuUlYVMSYLOzxcjaxa9tZQo2SDSeYPIvyOA2PWLXDvA+tXW2vqf3fsLgEqMGzoIPdu17rX+ljCbzch4+x0AQPdLL0WwG84bHG6KL1mypM7bkiQhKysLH3/8cZP7g7dEbGwsACAnJ6fOSvGcnBxcfPHFymNyc3PrfFx1dTUKCgqUj2+Mj48PfHwa7r9lNBqF+sEXrd7aRK1d1LoB1q4GR+ouq7JdiRUW4K2Jz7Wx2k3HjgOwjUD0jWz9Hh/O+jydkUvN5YJ8q4Xvi6NYt/uJWjvrdq+W1F1gqgYAxAT7qfY5N1a3OTcXMJuhMxrhl5AAncHQ6udwBldnQ1v5udMC1u1+otbeVurOLa0CAMSFBaj6+V6obmtODgDAJ/6iVtWppVwAms4GQNyfQYC1q4W1u5+odQOO155dbMuLduHq5gVgq70yKxsA4N2uHby9vVt9PGdwdTaI+PMmYs21iVy/yLUDrF9tLam/2mLF2Zpzi3YRgap9/l6lJQAAn5hYt5w3ONwUf+mll+q8rdfrERUVhalTp+LRRx919HBN6tChA2JjY7F27VqlCV5cXIzNmzcrY9EHDhyIwsJCbN++HZdccgkA4Oeff4bVakX//v2dVgsRkaudM9kCKNSvdScGrlR50LZVhU/XLipXUpe7comISE15JbY96qKCGn8hXi1VmZkAAKMTGuLOxGwgIk91trgCABCtsTyoz5yVBQAwxsdd4JHuwVwgorYmq6gcABAb4qtyJTbmU7b9xI3tElSu5DxmAxG1ZXmlVbBYJRj0OkQGqnduIe8p7hUZ4Zbnc7gpfuzYMac9eWlpKQ4fPlzn2Dt37kR4eDgSExPxwAMP4Omnn0ZycjI6dOiAefPmIT4+Xtl3vFu3bhg9ejRmzJiBN998E2azGffccw9uvPFGxMfHO61OIiJXKzSZAQCh/tq9Iq3igK0p7tulq8qV1OXMXCIi0qqzpbamuJonKo2pOmFrinsnun+Eb3OYDUTkiaxWCbk1F0nFBGujydEUpSkep42mOHOBiNoSSZKQVWS7iCo+xE/lamyqTtY0xRPaqVzJecwGImrLsmsuto0J8oFB3/ItLVpDqq6GwWQCAHg5YTKtPVq1p/jJmjCT909y1LZt/9/encfHWdb7/39PMjNZmiZpkzRLd+gCpQeUVmpZRLpRRE6VPoAfiwcPHL4im4AogtaWehDEoyKKeNwoyiZ48AiHRWhZFCxL2SoUu1la2mxNm7VpMtv1+2PmniRN0iaQzH3dM6/n49EHzWSS+eRukjf3/bk/17VOp5xySvJtZ++NCy+8UKtWrdLXv/517du3T//v//0/NTU16cQTT9RTTz2l3Nyuk7/77rtPV1xxhebPn6+srCwtXbq019InAGC7psSk+Kh8iyfFN26SJOVMt6sp3t1HzSUAsNVuyyfFgxPtaop3RzYASBeN7SFFYvFtl2y7Sao7E4koktjqzl9p38ACuQAg3e3dF1JnJCZJKi+yIy/CH+yUJAUtmhTvjmwAkGlqEyuKlLu4oki0sVE+Y6SsLGWPGpWS18wa7AdEIhEtW7ZMRUVFmjRpkiZNmqSioiJ961vfUjgcHtTn+vSnPy1jTK8/q1atkiT5fD6tXLlStbW16ujo0OrVqzVtWs9le0ePHq37779fra2tam5u1m9+8xsVFBQM9ssCAFc1Wj4pboxRpzMpbtny6UOZSwBgK2ub4tu3S5IClk2Kkw0A0pEzJV4yIqigf9CXc1Imsnu3FI1KgYD8ZamZ+DgUcgFAJnGmxEsLcpTjt2OLo9BO+ybFyQYAmaw2kRWVbjbF9+yRJGUXF6dsS75BT4pfeeWVeuSRR3Tbbbdp7ty5kqS1a9dqxYoV2rNnj+66664hLxIA0l3TfqcpbuekeHhXtWL79skXCCg4aZLb5fRALgFId5FoTHsTK4rY1hQP74g3xYMTJrpcSU9kA4B0VJdY4tC2LDhQcun08nL5suxo3pMLADJJcun0Ynu22gjv3CVJClo0iU02AMhkNc7y6S5uyxRpaJAkZado6XTpQzTF77//fj344IM67bTTko8dffTRGj9+vM4991zCAgA+BGf59OI8OyfFOzfFp8SDU6bIF7CrRnIJQLrbuy8kY6TsLJ9V22yYWEyhHfGJD9uWTycbAKQjZ1J8jO37iVfbtZ+4RC4AyCw1iSVx3Zz+6y7a3KJYS4skKTB2rMvVdCEbAGSyOpsmxUtKUvaag75lNycnR5P6mBKcPHmygkF7LtIBgJc0JZZPHzXCroazI7l0+jS7lk6XyCUA6c9pgoweEVR2ls/larpE6utlOjslv1+BKrv2jCUbAKSjemeaw/pJ8WpJUqDKnqY4uQAgk1Q3OY2OPJcriQvvjO8n7i8rU1aeHTVJZAOAzOasKuLmpLjTFPeXWtwUv+KKK/Sd73xHnZ2dycc6Ozt1880364orrhjS4gAgUzQmJsWL8uz8n+6OjZskSTnTp7tcSW/kEoB0t7stsZ94gV1NkND2HZKkwNgq+fyDXoBqWJENANJR16S4XXlwoEhi+XS/RZPi5AKATGLbpHgk0RQPWLR0ukQ2AMhstS3u30AVaUj9pPigr169+eabWrNmjcaNG6djjjlGkvT2228rFApp/vz5OvPMM5PPfeSRR4auUgBIY83OpHi+3ZPiOdPtmxQnlwCku92JJohte8iGLN1PXCIbAKSn+pZ4Hrg5zTEQXcun27OKCLkAIJPUOJPixXZMZTuT4oFx9iydLpENADKXMUa1Ni2fbvOe4sXFxVq6dGmPx8ZbdpcXAHhJOBpTa2dEkqzaK9YR279foe3xxkeuhZPi5BKAdGdrUzy8Iz4pHpxg137iEtkAID3VtcYvXI2xLA8OFK5xmuIVLlfShVwAkEmqE5PiVZZMijtN8eA4u37vkg0AMlVTe1idkZgkd1ehiu5pkCT5R1s8KX733XcPRx0AkLGc/cR9Pqkwz75J8c4tW6VYTNklJfKn8K6tgSKXAKS7hjY7m+LO8unBifY1xckGAOnImRQvG2lHk6M/XU1xe5ZPJxcAZIpYzKiuxdJJccsazmQDgEzlLJ1eMiKoHH+2a3UkJ8VTuHz6oPcUnzdvnpqamno93tLSonnz5g1FTQCQUZr3x/cTL8wNKDvL53I1vXVuii+dnmvh0ukSuQQg/TmT4qW27SmemBQPWDgpTjYASDfGmGQelFu8p3i0bZ9iLS2S7NpTnFwAkCka2joVjhpl+aRyS26qTU6Kjx/nciU9kQ0AMpWzdLrb2zJF9uyVJGWXWtwUf/755xUKhXo93tHRob/+9a9DUhQAZJLGxKR4saX7iXc4+4lPs2/pdIlcApD+bFw+3RiTbIoHJ9q3pzjZACDdNLWHFYrGlzi0KQ8OFKmNT4lnFRYqu6DA5Wq6kAsAMkVNs7PVRq782YO+9D/0olFFnBVELJsUJxsAZKoaC/YTN+GwYo2NklI7KT7g5dPXr1+f/PuGDRtUW1ubfDsajeqpp57S2LFjh7Y6AMgATcmmuH37iUtS58ZNkqQcy/YTJ5cAZIrdzvLpFk2KRxsaZNrbpawsBS36XUs2AEhX9YkbpEblB1xd4vBQbFs6nVwAkGlqEvuJVxbbsdWGv7lZikblCwblLytzuxxJZAMAOMunV7jYFI/sjTfETVaWsouLU/a6A26Kf+xjH5PP55PP5+tz+ZC8vDz95Cc/GdLiACATNLbH70ottnA/cWOMOjfauXw6uQQgU9g4KZ5cOr2qSr6gPTd1kQ0A0pWzP+wY2/cTr7arKU4uAMg01U3xvKgqsmM/8cDe+NK4gXHj5MuyYHJdZAMA1CZuoKpwcfn0SMNuSVJ0xAj5slN30++Am+Lbtm2TMUaHHXaYXn31VZV1u7MrGAxqzJgxyk5h4QCQLpoTk+KjLFw+PVK/W9GmJik7W8HDD3e7nB7IJQCZoCMcVWtHRJJdk+Kh7Yml0y3bT5xsAJCunEnxMRbvJy5J4ZpqSVKgyo6mOLkAINM4k+JuTv91F0zsFxuwaD9xsgFApqttiZ9buJkV0T17JEmRkandcmnATfGJib0CY7HYsBUDAJkoOSlu4fLpnZviU+LByZOUlWPXBThyCUAmaEgsnR7MzlJh3oD/133YhXZslyQFJtrVFCcbAKQrr0yKO/vG+i2ZFCcXAGSaagv2ie3O3xhvigfH2bOfONkAINPVWnADVWR3gyQpWjAypa876Ctrv/3tbw/6/n/7t3/70MUAQCZq2u/sKW7fpHhy6fRpdu0n3h25BCCddV863efzuVxNl9D2eFM8OGGiy5X0jWwAkG52e2VSPLl8epXLlfRELgDIFDVN8UZHVbEly6dbOCnuIBsAZKoaC26gijiT4gWWToo7vvKVr/R4OxwOq729XcFgUPn5+YQFAAxSU2JSfJSFk+IdGzdJknKm29sUJ5cApDOnCVJq0X7ikhR2lk+3bFLcQTYASDf1rfELV+WW5cGBwjVOU7zC5Up6IhcAZAobGh3dBfc6k+L2NcXJBgCZaF9nJLlNX0WRezdQRfckJsVHpnZSPGuwH9DY2NjjT1tbmzZu3KgTTzxRDzzwwHDUCABprXGf/ZPiOdOnuVxJ/8glAOlsd2L5dJv2EzfGKLTDzj3FHWQDgHRT1+JMitvR5OiLiUYVrquTJAUsWT7dQS4AyASRaEz1iZtqrZkU3+tMituzfLqDbACQiWoT2zIV5PhVkOPeNn3O8umpnhQfdFO8L1OnTtWtt97a6+4qAMChdS2fbtekuAmF1PnPf0qSci2eFO8LuQQgXTS0xlcTKRtpT0ZEGxsVa2uTfD4rL271h2wA4GXOpPgYiyfFIw17pHBYysqSf8wYt8s5JHIBQLrZ3dapaMzIn+VTqQU31UZbW5Xd3i5JCoy1b1K8L2QDgHRXl1hRxM39xKWu5dOtnxTvj9/vV3V19VB9OgDIGM7y6cV5dk2Kd27bJkUiyioslL/CruUPB4JcApAOdrfFT1ZsmhR39hP3V1QoK8eeugaCbADgRcaY5KR4ucWT4pHa+NLp/vJy+fzuTZ0MBrkAIJ1UNyW22ijMVXaWz+VqpMiuXZKk7NGjlF0wwuVqBo5sAJDOnG02Klw+r4g0JCbFR1q+p/ijjz7a421jjGpqavTTn/5UJ5xwwpAVBgCZoqk9Pilu257iztLpudOmyedz/2SqP+QSgHTm7CleZtFkYNjypdMlsgFAemnZH1EoEpNkVx4cqGs/cbuWTpfIBQCZoaZ5vyR79hMP79wpSfJbuJ+4RDYAyEzO8uluT4pHE03xaIqXTx90U/xzn/tcj7d9Pp/Kyso0b948/eAHPxiqugAgI3SEo9ofjkqSikfYNSnekdxP3O6l08klAOnMxqZ4aLv9TXGyAUA6cZZOL8oLKDeQ7XI1/QtX29sUJxcAZIKaxKR4pSX7iTtNcVuXTicbAGSiWgsmxU0opGhzsyQpkuLl0wfdFI/FYsNRBwBkpObEfuLZWT6NzLFricHOjZskSTnTp7lcycGRSwDSWUObs6e4RU1xZ1J8or1NcbIBQDpxlk63eT9xqdukeJV9TXFyAUAmqE5MildZNikesHRSnGwAkIlqLNhTPLJ3b/wv2dmK5aX2Rq4Pvad4Q0ODGhLj7QCAD6ex237iti1Rnlw+3fJJcQe5BCDdGGOSk+KlNu0pnmiKByyeFHeQDQDSgTMpPqbQnizoS7gmvv+q38JJcQe5ACCdJSfFLWmKR5ym+Hg7m+IOsgFAJqlrcT8rIg17JEnZJSVS1oduU38og3q1pqYmXX755SotLVV5ebnKy8tVWlqqK664Qk1NTcNUIgCkr8Z98Unxony7lk6P7t2ryO7dks+nnClT3C6nX+QSgHS2L9S1xYZNTfHw9u2SpODESe4W0g+yAUC6cSbFy0fa0eToT8TS5dPJBQCZoqbFtuXTd0myc09xsgFApnImxctdXD490rBbkuQvGZ3y1x7wWr179+7V3LlztWvXLp1//vk68sgjJUkbNmzQqlWrtGbNGv3tb3/TqFGjhq1YAEg3zfvjk+Kj8oMuV9JT5+bNkqTAhPHKGjHC5Wr6Ri4BSHfOlPiIYLZGWLLFRrSpKbnvU9DCiQ+yAUA6cibFy6yfFHeWT69yuZIu5AKATFLT5Cyf7n5T3ESjClfHVxCxbfl0sgFApgpFYtqzL36tyc1J8egeZ1K8NOWvPeCraytXrlQwGNTWrVtVXl7e632LFi3SypUr9aMf/WjIiwSAdNXYHp8UL86za1I8tCm+n3juNHuXTieXAKQ7pylu1X7iH3wgSfKPGaOs/HyXq+mNbACQjupb7Z8Uj+3fr2hjoyQpUFHhcjVdyAUAmSIUiWl3W6LRUex+XkTq66VwWCY7W/4xY9wupweyAUCmqm/tkDFSMDtLo0e4N6TXY/n0FBvw8un/+7//q//6r//qFRSSVFFRodtuu01//OMfh7Q4AEh3TU5T3LZJ8U3xSfEci/cTJ5cApLuGNgub4tvj+4kHLd1PnGwAkI7qW+zfUzxcUytJysrPV1ZhocvVdCEXAGSKupZujQ4LrjE5N9OGi4vly852uZqeyAYAmarWWTq9KEc+n8+1OiINDZIsb4rX1NToqKOO6vf9M2fOVG1t7ZAUBQCZoqndWT7dzknxnOnTXK6kf+QSgHTnTIrbtJ94aPv7kqTARDub4mQDgHTkTIqPsXhSPFwTXyLXX1Xp6gW2A5ELADKFs0dsRVGusrLc/z0c/mBn/L+jU79f7KGQDQAyVW3iZtvKQne32YjuiTfF/aUWN8VLS0v1/vvv9/v+bdu2abSFIQcANuuaFLeoKR6NKrR1qyQp1+JJcXIJQLqzcfn08A5nUnyiy5X0jWwAkG6MMapLXLwqt3hSPJJoHgQq7dlPXCIXAGSOmub4fuJu7hHbXWhnYlK8xL7fsWQDgEzVNSnublZEdntgUvzUU0/VN7/5TYVCoV7v6+zs1LJly7R48eIhLQ4A0l1jYlLcpuXTA3v2yHR2ypefr8C4cW6X0y9yCUC6SzbFrZoUTzTFLZ0UJxsApJvWzog6wjFJlk+KV9dIkgKVlS5X0hO5ACBTVDfFGx1Vxe5O/zlsnhQnGwBkKqcp7vYNVJE9iT3FS0tT/tr+gT5x5cqVmj17tqZOnarLL79cRxxxhIwxeu+99/Szn/1MnZ2d+t3vfjfkBU6aNEnbt2/v9fhll12mO++8U5/+9Kf1wgsv9Hjfl770Jf385z8f8loAYKjZOCmek9gPMHfqVPmyBnzvVMq5lUsAkCq7bdxTfIfde4qTDQDSTX1LPAtG5vqVF7RrT9buwjWJpniVXU1xcgFApqi1bFI87OwpbmFTnGwAkKlqkitQudwUT+wp7i8pkRJ/T5UBN8XHjRuntWvX6rLLLtMNN9wgY4wkyefzaeHChfrpT3+q8ePHD3mBr732mqLRaPLtd955RwsXLtRZZ52VfOySSy7RypUrk2/n5+cPeR0AMBya9jt7itszKZ5TG7+glWPx0umSe7kEAKnSYFlTPNraqujevZKkgKVNcbIBQLqpT1y4GmNJFvTH2VPctklxcgFApqh2pv8smRQP7bR3UpxsAJCp6iyYFI+FQoq1tEhKTIpv3JjS1x9wU1ySJk+erCeffFKNjY3avHmzJGnKlCnDusdGWVlZj7dvvfVWHX744Tr55JOTj+Xn56uiomLYagCA4dKYmBQvyrNvUjxn+jSXKzk0N3IJAFLFWT691JLl051pj+ySEmUXFLhcTf/IBgDppD6RBTYvnS5JkcTy6X7LmuISuQAgMzh7ildZMCke27dP0cTSuOHRqd8vdiDIBgCZqCbRFK9wMSucfFAgoKzCwpS//qCa4o5Ro0bpuOOOG+paDikUCunee+/VtddeK5/Pl3z8vvvu07333quKigqdccYZWrZs2UGnxTs7O9XZ2Zl8uyVxV0I4HFY4HB6+L2CIODV6odYDebV2r9YtUbsbBlq3MUZNiT3FRwZ9Vnyd4XA4OSnuP/zwYa1pKD/3R82lg+VC9/96BXWnnldrp+7UGkzdsZhJToqPyst29Wt1Xrtj2zZJUmD8+GGrxyvZ4KXvvUz4ebGJV+uWvFt7utdd3bRPklRWELTia+yrbmNMcvl0X1nZkNZpUy5I/WeD5L3vQcm7Pz8StbvFq7V7tW5pMHkRb4qXjvC7/nV2vv++JCmrqEixvNwhr8cr2eCl8wYv/4xI3q7fy7VL1O+2wV5nqkusQlWa715WdNTGB/KyR41SJBKRNDTHf6Cfw2ec9UE84KGHHtJ5552nHTt2qKqqSpL0i1/8QhMnTlRVVZXWr1+v66+/Xscdd5weeeSRfj/PihUrdNNNN/V6/P7772fpdQAp0xmVvv5q/N6k246LKMeCLQKz9ndoyooVkqQtK5Yrljd8y261t7frvPPOU3NzswpduCusO3IBgG32haUb18Uz4gdzIvJnuVyQpNHPPqfSP/9Zzcceq7pzzh6W1yAbAKCnP76fpedrsjSvMqYlk2Jul9On7LY2Hf6d/5Tx+bT5P78j+T/U/EWfbMoFiWwAYKdwTLrulfjv3u/OjmiEy4sRjnj3XY397e/UMXasdlx15ZB/frIBAAavJSQte90vn4x+MCeqbJeuM4147z2NXXXPkGfEQLPBU03xU089VcFgUI899li/z3n22Wc1f/58bdmyRYcffnifz+nr7q3x48eroaHBiiA9lHA4rGeeeUYLFy5UIGDPkssD4dXavVq3RO1uGGjd1U37dfIP/qpAtk/vLl/QYwUMt7S++qrqLv4PZVdUaPIzTw/ra7W0tKi0tNSKk5j+cqGmpkavvPJK2n4P2sardUverZ26U2swdW+ua9Nnfvo3FecF9NqNp6Sowr45dX/8pb9p36OPavTll2v0pV8altfyQjZ45ZzBkQk/Lzbxat2Sd2tP97qvfmi9Hv97rW48bbr+/fiJKaywb33V3fHuu9r5/52r7LIyTX52zZC+nk25IPWfDffff7+WLFniqe9Bybs/PxK1u8WrtXu1bmlgtW/f064Ft7+o3ECW1i+b7/r1pabf/U4Nt31f+QsX6K0FC4b8uHslG7x03uDlnxHJ2/V7uXaJ+t02mPrf2dWiz//8ZY0ZmaOXvn7yQZ87nJr/53+0e8VNyj/pJJX9+PYhO/4DzYahu313mG3fvl2rV68+6AS4JM2ZM0eSDtoUz8nJUU5O770ZA4GAp77xvVZvd16t3at1S9TuhkPV3RpqlySNyg8qGAymqqyDiv3zn5Li+4kP9zG36d/0YLng/NemegeKulPPq7VTd2oNpO6mjqgkqWxkjjVfY3TnTklS7mGTh60mW75WKX3OGRzUnVperVvybu3pWvfutvh2SxXF+VZ9fd3r3l9fH3+sqnLIa7Tpa5b6zwbJu9+DErW7hdpTz6t1SwevvX5ffMnYqqI8K64vRavjW2oEx0+QNPTH3bZ/w3Q6b/Bizd15uX4v1y5Rv9sGUn9De3yp8sqiXHe/1qYmSVJgTNmQXnsf6MdbsBDjwNx9990aM2aMTj/99IM+76233pIkVVZWpqAqAPjwmtrjJy3F+fYEbuemzZKknGnTXK4EADLb7sR+4qUFfV94d0P4gw8kScEJ7k8qAkCm2N0az4MxI+3JgwNFEvsCBiqrXK4EADJTbXN8j9jK4lyXK4lzzhsC48a5XAkAwFHbvF+SVF7oblZEGvZIkvwlpa68vicmxWOxmO6++25deOGF8nfbm2rr1q26//779ZnPfEYlJSVav369rrnmGn3qU5/S0Ucf7WLFAHBoTfvjUx/F+e7fxesIbdokSQrSFAcAVzlNkDJLmiC+UEjR3bslScEJ412uBgAyR11LvNHh9sWrgwknJgIDDCcAgCtqnKZ4UZ7LlcSFEitMBcaNkxr3ulwNAEDqnhVuN8UbJEn+0hJXXt8TTfHVq1drx44duuiii3o8HgwGtXr1at1+++3at2+fxo8fr6VLl+pb3/qWS5UCwMA1OpPieXZMiptYTJ2bmRQHABvY1hQP7InfyZtdXKzsoiKXqwGAzNDWGVF7KL6dhs2T4uEamuIA4Kbqpvj0X5XLjQ4pfm0pTFMcAKxTm7jZtsLlG6iiyaY4k+L9WrRokYwxvR4fP368XnjhBRcqAoCPrrk9Pik+ypJJ8fCuXTLt7Yr5/QpMmOB2OQCQ0WxrigcTTfHARPIBAFKlPnHhqiDHrxE59l6+STbFq2iKA4AbnOk/txsdkhTZ3SDT2SllZ8tfUS793e2KAABS11YbFUXuXmdyJsWzXVo+3TN7igNAuklOio+wY1K8c+NGSVKovFw+v70X3QAgE9i2p3ggcdLCfuIAkDp1LfbvJy5J4ZpqSZKfSXEAcIUzKW7DnuLhnYn9xCsr5QvYcb0LANBtUrzQ3RuoIomhC38ZTXEAyChNyeXT7ZgU70g0xTsrKlyuBABg3aR4Q/ykJchKIgCQMvWt8QtXtmRBX2KhkKK74zdOBaqqXK4GADKTMyleZcGkeOiDRFN8/DiXKwEAOIwx3SbF3buBKtbZqVhrqyTJX+LOnuI0xQHAJU3J5dPtuHO2c+Om+H8raYoDgNsaEpPiZbZMiifu5A2yfDoApEx9YlK8vND9yb/+RGprJUm+3FxlFxe7WwwAZKD2UETN++NDF1ZMin8Q3088OG68y5UAABwtHRG1h6KSpAoXzy2c/cR9gYCyCgtdqYGmOAC4pDHRFC+2pimemBRn2UMAcFUkGtOeffGMsGU6MNkUZ1IcAFLGmRS3efn0cHViP/GKCvl8PperAYDM40yJF+T4VZjr/vWl5PLp45gUBwBb1CWWTi/ODygvmO1aHc7S6dmlpa6dO9AUBwCXNCXu5C3Od3/59Fh7u0I7dkiSQiyfDgCu2rsvJGOkLJ80eoQFGdHRoUBzsyQpMJE9xQEgVepb7Z8UD9ckmuJV3FgLAG6oaYo3OipdXA63u5AzKc7y6QBgDecGKjenxCUpkpgUd2vpdImmOAC4JrmnuAWT4p1btkjGKLu0VNGCArfLAYCMtjuxdProETnKznJ/6i68M35hK2vkSJbGBYAUciY6xhRaPCleUy1J8rPaFAC4orp5vySpstj9/cSlrnOHwHiWTwcAW9RZsJ+41K0pXlrqWg00xQHABbGY6banuPtTgB2JpdOD06a5XAkAYHdiMtCWpdPDOxJLII4fz9K4AJBC9ZblQV8iNfE9xQOVVS5XAgCZKTkpbsGqIrGODkXq6yWxfDoA2MSWSfFocvl0JsUBIKO0dkYUM/G/F+VZMCm+cZMkKWfaVJcrAQBY1xT/IL69RoD9xAEgpepbPLR8OpPiAOCKmuSkuPtZEd61S5KUVVDAClMAYJHaFksmxXczKQ4AGak5sXR6XiBbuYFsl6uROpkUBwBrOMunlxVY0hTfkWiKswQiAKRMeyiits6IJGmMJTdJ9YU9xQHAXdWJ6b+qIveXTw99wApTAGCj2sQNVG5PikcSk+L+EpriAJBRGpNLp7s/JW6MUccmZ1KcpjgAuK2hNZ4R1kyKO8unT2RSHABSIRoz+vM78WXJc/xZyrPgJtq+GGOYFAcAl9U0WTQp/kF8P/EgS6cDgFVqbNtTvIymOABkFKcpXmTBfuKRujrFmpslv1/ByZPdLgcAMp4zKV5a4H5GSCyfDgCp9NQ7NTrxe8/qmofeliR1RmI66bbn9NQ7NS5X1lusuVmmvV2S5K+ocLkaAMhMtYlGR6UFk+LhnV2T4gAAe9S12JEVUacpXsKe4gCQUZr3x5dPt2FS3Fk6PWfyJPmCdjRgACCT7W6Nn6zYMCkeC4UUqYlPKwbG0xQHgOH01Ds1+vK9byQnORy1zR368r1vWNcYd6bEs0tKlJXr/oQiAGSSaMzo2ffq1JrYaqO80P1zh5AzKT6eSXEAsEVHOKrGxFauri+fnmiKZ7N8OgBkjmjM6I3tjYm/xxSNGVfr6djoLJ0+3dU6AADxjNixJz51t7u10/WMCO/cJcViigWDyi4Z7WotAJDOojGjmx7boL5+6zuP3fTYBtdzoTuWTgcAdzirilx0z7rkY4t+9BfXb54KO3uKj2NSHABsEI2ZZDYEs7M0Ise9bZliHR2K7dsnieXTASBjOCcu96zdLkl6ZVujTvzes66euCQnxafTFAcANzkZUZ2YEPzPx99zPSNC29+P/7e0RD6fz7U6ACDdvbptb68J8e6M4nsBvrptb+qKOoRwtdMUZ+l0AEgVW1cVMcYotDM+KR4YN9aVGgAAXZxrTFf/Pr4tUyjq7rZMkYY9kiRfMKisggJXapBoigNAyth64tK5Kd4Uz50+zZXXBwDYmxHhHfH9xMMuLm0FAJmgvrX/hviHeV4qhGuqJUl+JsUBICVsXVXERKNqfWa1zP79kiR/BTdLAYCbbLzGFG3YLUnyl5a6OnRBUxwAUsDGExcTjartpZfUuWWrJCk4ZUrKXhsA0MXGjJDiObHv1dfif8/KkolGU/r6AJBJxowc2P5+A31eKkSSy6dXuVwJAGQGG1cVaXn6aW2Zv0C7rroq+dg/T/uMWp5+OmU1AAC62HiNyUSjalu7Nv5Gbq6r15doigNACth24uKctHxw8X9IJh6A288/X22rV6fk9QEAXWzLCKkrJ9rWrJEkFb79tt4/dTEXtwBgmBw3ebQqi3LV38yET1JlUa6Omzw6lWX1y0Sj6ti0WZIUa2vjxikASAHbVhVpefpp7frK1YrU1vZ4PFJXp11fuZprTADgAtuuMTnXlxp+fIckKfzPf2rL/AWuXV+iKQ4AKWDTiUv/Jy31qr32qyp4551hrwEA0MWmjJD6z4lofb12feVqGuMAMAyys3xafsYMSerVGHfeXn7GDGVnubfUoKPgnXf0/qmLFdqyRZLUcOedrl7YAoBMYdOqIiYaVd13b0kOWvR8Z/yxhu/dJsViw14LAKCLTdeYbLx5iqY4AKSALScuAzlpKXv0MSY9ACCFbMkIaWA5UffdW8gJABgGi2dW6q4LjlVFUc/f9xVFubrrgmO1eKb7e3e3rV6tyt/dq2hdXY/HnQtbNMYBYPjYtKpI+7rXezU5ejBGkdpa5W3bNuy1AAC62HKNydabp/wpfTUAyFDOiUttc0ef+3n4FL/YNdwnLgM5aQk0N2v/G28oePzxw1oLACDOloyQBn5xq33d6xox57hhrwcAMs3imZVaOKNCr27bq/rWDo0ZGf/9b8OEuIlGtfvW7/XzTiP5fKr77i0aOX++fNnZqS0OADKAs6rIl+99Qz6px7lDqlcViezePaDn+Vtbh7kSAEB3tlxjsvXmKSbFASAFbFkOcaAnLdEBPg8A8NHZkhHSwHNioM8DAAxedpZPcw8v0ZKPjdXcw0usaIhL8Qtb0bq6ficUu984BQAYHrasKuIvKxvQ8yIjRw5zJQCA7my5xmTrzVNMigNAijgnLjc9tkE1zV17dlQU5Wr5GTNScuIy0JOW7AE+DwAwNGzICGngOTHQ5wEA0gc3TgGAHWxYVSR/9iz5KyoUqavre2lcn0/+8nLtnzw5ZTUBAOJsuMZk681TNMUBIIXcPnEZyElLuLBQeccem5J6AABd3M4IaeAXt/Jnz0pZTQAAO3DjFADYw1lVxC2+7GyV33iDdn3lasnn63nu4Iufv5Re/3UpFHKnQADIcG5fY7L15imWTweAFHNzOUTnpCX+xgGvm3h797+ewR6AAOASt5fMHUhOlN94AzkBABkof/YsZZeX97k3oaT4ha2KCm6cAoAMUbhokcb++Hb5y8t7PO4vL9fYH9+uggULXKoMACDZ34covf7rUlZq29Q0xQEgwxzspKXihz9Q28yZLlUGALDBoS5uFS5a5FJlAAA3+bKzVfaN6xNvcOMUACB+7jBlzWpNuOceVf3Xf2nCPfdoyprVnDMAAKy8eYrl0wEgAxUuWqSR8+erfd3riuzeLX9ZmfJnz1IkFpOeeMLt8gAALuueE521NVq3das+fdllCubmul0aAMBFBQsWqOYLF2j8088oWleXfNxfXq7yG2+gCQIAGciXna0Rc45zuwwAgIX660P4srMVDodTXg9NcQDIUH2etMRi7hQDALCOkxPBcFj7n3iCyT8AgCSpbeZMTfrqVxV+e32vC1sAAAAA0J1NN0/RFAcAAAAAAMCA2XRhCwAAAAAGgj3FAQAAAAAAAAAAAABpy+qm+IoVK+Tz+Xr8OeKII5Lv7+jo0OWXX66SkhIVFBRo6dKlquu2pxUAAAAAAAAAAAAAILNZ3RSXpKOOOko1NTXJPy+++GLyfddcc40ee+wxPfzww3rhhRdUXV2tM88808VqAQAAAAAAAAAAAAA2sX5Pcb/fr4qKil6PNzc369e//rXuv/9+zZs3T5J0991368gjj9TLL7+sT37yk6kuFQAAAAAAAAAAAABgGeub4ps3b1ZVVZVyc3M1d+5c3XLLLZowYYJef/11hcNhLViwIPncI444QhMmTNDatWsP2hTv7OxUZ2dn8u2WlhZJUjgcVjgcHr4vZog4NXqh1gN5tXav1i1Ruxu8WreU2tptOj4Hy4Xu//UK6k49r9ZO3alF3QN/LRt4/ZzBwfddanm1bsm7tVN3aqW6btuOT3/ZINlX60B49ftQona3eLV2r9YtUfvBPq8t0uG8wcvfZ5K36/dy7RL1u436e3+uQ/EZY8xHfrVh8uSTT6qtrU3Tp09XTU2NbrrpJu3atUvvvPOOHnvsMf37v/97j8CRpOOOO06nnHKKvve97/X7eVesWKGbbrqp1+P333+/8vPzh/zrAAD01t7ervPOO0/Nzc0qLCx0tRZyAQDsQDYAALqzKRcksgEAbEA2AAAONNBssLopfqCmpiZNnDhRP/zhD5WXl/ehm+IH3r3V3NysCRMmaNu2bRo5cuSw1T9UwuGwnnvuOZ1yyikKBAJulzMoXq3dq3VL1O4Gr9Ytpbb21tZWTZ48WU1NTSoqKhrW1zqU/nJh06ZNWrdunef+Lb36PejVuiXv1k7dqUXdh+aFbPDKOYOD77vU8mrdkndrp+7USnXdNuWC1H82/OpXv9Lpp5/uqX9LybvfhxK1u8WrtXu1bona++KVbPDSeYOXv88kb9fv5dol6ncb9XcZaDZYv3x6d8XFxZo2bZq2bNmihQsXKhQKqampScXFxcnn1NXV9bkHeXc5OTnKyclJvu0saTJ58uRhqRsA0L/W1lbXT2L6y4Vp06a5VRIAZDSbs4FzBgBIPRtyQeo/G/7jP/7DrZIAIGPZng2cNwBA6h0qGzzVFG9ra9PWrVv1hS98QbNmzVIgENCaNWu0dOlSSdLGjRu1Y8cOzZ07d1Cft6qqSh988IFGjhwpn883HKUPqZaWFo0fP14ffPCBFUvEDIZXa/dq3RK1u8GrdUuprd0Yo9bWVlVVVQ3r63wYTi4YYzRhwgTP/Vt69XvQq3VL3q2dulOLug/NC9nglXMGB993qeXVuiXv1k7dqZXqum3OBSmeDRs2bNCMGTM8928peff7UKJ2t3i1dq/WLVF7X7yQDV47b/Dy95nk7fq9XLtE/W6j/i4DzQarm+LXXXedzjjjDE2cOFHV1dVavny5srOzde6556qoqEgXX3yxrr32Wo0ePVqFhYW68sorNXfuXH3yk58c1OtkZWVp3Lhxw/RVDJ/CwkJPfqNL3q3dq3VL1O4Gr9Ytpa52G+7o7YuTC87dvV79t6Tu1PNq7dSdWtR9cLZng1fxfZdaXq1b8m7t1J1aqazb1lyQ4tkwduxYSd79t5So3S3UnnperVui9gPZng1ePW/w8veZ5O36vVy7RP1uo/64gWSD1U3xnTt36txzz9WePXtUVlamE088US+//LLKysokST/60Y+UlZWlpUuXqrOzU6eeeqp+9rOfuVw1AAAAAAAAAAAAAMAWVjfFH3zwwYO+Pzc3V3feeafuvPPOFFUEAAAAAAAAAAAAAPCSLLcLwODl5ORo+fLlysnJcbuUQfNq7V6tW6J2N3i1bsnbtQ8Hrx4P6k49r9ZO3alF3XCDV//9qDv1vFo7daeWV+seTl4+JtTuDmpPPa/WLVE7UsPr/1Zert/LtUvU7zbqHzyfMcak7NUAAAAAAAAAAAAAAEghJsUBAAAAAAAAAAAAAGmLpjgAAAAAAAAAAAAAIG3RFAcAAAAAAAAAAAAApC2a4pa66667dPTRR6uwsFCFhYWaO3eunnzyyeT7P/3pT8vn8/X4c+mll7pYcf9uvfVW+Xw+XX311cnHOjo6dPnll6ukpEQFBQVaunSp6urq3CuyD33VbetxX7FiRa+6jjjiiOT7bT7eh6rd1mMuSbt27dIFF1ygkpIS5eXl6V/+5V+0bt265PuNMfr2t7+tyspK5eXlacGCBdq8ebOLFXc5VO1f/OIXex33xYsXu1jx8DnU71ubf37SISu8mhGSd3LCqxlBPriDfPCmdMgDybuZ4JU8kMgEN5AJ6YFzBjuQE6nh1ayQvJ0XEpmB4ZVOeSB5NxMk7+WCRDa4zcv5INmVEf5h+az4yMaNG6dbb71VU6dOlTFG99xzj5YsWaI333xTRx11lCTpkksu0cqVK5Mfk5+f71a5/Xrttdf03//93zr66KN7PH7NNdfo8ccf18MPP6yioiJdccUVOvPMM/XSSy+5VGlP/dUt2XvcjzrqKK1evTr5tt/f9eNt+/E+WO2Snce8sbFRJ5xwgk455RQ9+eSTKisr0+bNmzVq1Kjkc2677TbdcccduueeezR58mQtW7ZMp556qjZs2KDc3Fyra5ekxYsX6+67706+nZOTk+pSU+JQv29t/vnxelZ4NSMk7+WEVzOCfLCvdilz8sFLvJ4HknczwWt5IJEJqUQmpA/OGdxHTqSWV7NC8mZeSGQGhl+65IHk3UyQvJsLEtngFi/ng2RhRhh4xqhRo8yvfvUrY4wxJ598svnKV77ibkGH0NraaqZOnWqeeeaZHvU2NTWZQCBgHn744eRz33vvPSPJrF271qVqu/RXtzH2Hvfly5ebY445ps/32X68D1a7MfYe8+uvv96ceOKJ/b4/FouZiooK8/3vfz/5WFNTk8nJyTEPPPBAKkrs16FqN8aYCy+80CxZsiQ1BVnI+X1r+89PX7ySFV7NCGO8lxNezQjyIfXIh/TilTwwxruZ4LU8MIZMSDUyIb1xzpA65ERqeTUrjPFuXhhDZsAdXssDY7ybCcZ4NxeMIRvc5OV8MMa+jGD5dA+IRqN68MEHtW/fPs2dOzf5+H333afS0lLNnDlTN9xwg9rb212ssrfLL79cp59+uhYsWNDj8ddff13hcLjH40cccYQmTJigtWvXprrMXvqr22Hrcd+8ebOqqqp02GGH6fzzz9eOHTsk2X+8pf5rd9h4zB999FHNnj1bZ511lsaMGaOPf/zj+uUvf5l8/7Zt21RbW9vjuBcVFWnOnDmuH/dD1e54/vnnNWbMGE2fPl1f/vKXtWfPHheqTa0Df9964efH4bWs8GpGSN7MCa9mBPmQWuRDevBaHkjezQQv5oFEJqQSmZCeOGdIPXIi9byaFZI380IiM5BaXs0DybuZIHk7FySywS1ezgfJvoxg+XSL/f3vf9fcuXPV0dGhgoIC/fGPf9SMGTMkSeedd54mTpyoqqoqrV+/Xtdff702btyoRx55xOWq4x588EG98cYbeu2113q9r7a2VsFgUMXFxT0eLy8vV21tbYoq7NvB6pbsPe5z5szRqlWrNH36dNXU1Oimm27SSSedpHfeecfq4y0dvPaRI0dae8z/+c9/6q677tK1116rG2+8Ua+99pquuuoqBYNBXXjhhcljW15e3uPjbDjuh6pdii9XcuaZZ2ry5MnaunWrbrzxRp122mlau3atsrOzXa1/OPT3+/att96y+udH8mZWeDUjJG/mhFczgnxIPfLB27yYB5J3M8GLeSCRCalGJqQXzhncQU6knlezQvJuXkhkBlLDy3kgeTcTJG/ngkQ2uMnL+SBZmBEpmUfHh9LZ2Wk2b95s1q1bZ77xjW+Y0tJS8+677/b53DVr1hhJZsuWLSmusrcdO3aYMWPGmLfffjv5WPclKO677z4TDAZ7fdwnPvEJ8/Wvfz1VZfZyqLr7YtNx766xsdEUFhaaX/3qV9Ye7/50r70vthzzQCBg5s6d2+OxK6+80nzyk580xhjz0ksvGUmmurq6x3POOussc/bZZ6eszr4cqva+bN261Ugyq1evHu7yXNHf71sv/Px4LSu8mhHGpE9OeDUjyIfhRz54m9fywBjvZkK65IExZMJwIxPSC+cMqUdO2MGrWWGMd/LCGDIDqeHVPDDGu5lgTPrlgjFkQyp5OR+MsS8jWD7dYsFgUFOmTNGsWbN0yy236JhjjtGPf/zjPp87Z84cSdKWLVtSWWKfXn/9ddXX1+vYY4+V3++X3+/XCy+8oDvuuEN+v1/l5eUKhUJqamrq8XF1dXWqqKhwp2gduu5oNNrrY2w67t0VFxdr2rRp2rJliyoqKqw83v3pXntfbDnmlZWVyTspHUceeWRy6RXn2NbV1fV4jg3H/VC19+Wwww5TaWmp68d9uPT3+9YLPz9eywqvZoSUPjnh1YwgH4Yf+eBtXssDybuZkC55IJEJw41MSC+cM6QeOWEHr2aF5J28kMgMpIZX80DybiZI6ZcLEtmQSl7OB8m+jKAp7iGxWEydnZ19vu+tt96SFP8Gc9v8+fP197//XW+99Vbyz+zZs3X++ecn/x4IBLRmzZrkx2zcuFE7duzosYeJbXX3tUyDTce9u7a2Nm3dulWVlZWaNWuWlce7P91r74stx/yEE07Qxo0bezy2adMmTZw4UZI0efJkVVRU9DjuLS0teuWVV1w/7oeqvS87d+7Unj17XD/uqeL8vvXaz49kf1Z4NSOk9MkJr2YE+TD8yIf0YnseSN7NhHTJA4lMGG5kQnrjnGH4kRN28GpWSN7JC4nMgDu8kgeSdzNBSr9ckMiGVPJyPkgWZsSQz55jSHzjG98wL7zwgtm2bZtZv369+cY3vmF8Pp95+umnzZYtW8zKlSvNunXrzLZt28yf/vQnc9hhh5lPfepTbpfdrwOXA7n00kvNhAkTzLPPPmvWrVtn5s6d22sJBRt0r9vm4/7Vr37VPP/882bbtm3mpZdeMgsWLDClpaWmvr7eGGP38T5Y7TYf81dffdX4/X5z8803m82bN5v77rvP5Ofnm3vvvTf5nFtvvdUUFxebP/3pT2b9+vVmyZIlZvLkyWb//v0uVn7o2ltbW811111n1q5da7Zt22ZWr15tjj32WDN16lTT0dHhau3D4WC/b42x++cnXbLCqxlhjDdywqsZQT6kHvngXemSB8Z4NxO8kAfGkAmpRiakD84Z7EFODD+vZoUx3s0LY8gMDL90ywNjvJsJxngrF4whG9zk5Xwwxr6MoCluqYsuushMnDjRBINBU1ZWZubPn5882dqxY4f51Kc+ZUaPHm1ycnLMlClTzNe+9jXT3NzsctX9OzCg9u/fby677DIzatQok5+fbz7/+c+bmpoa9wrsR/e6bT7u55xzjqmsrDTBYNCMHTvWnHPOOT32vLD5eB+sdpuPuTHGPPbYY2bmzJkmJyfHHHHEEeYXv/hFj/fHYjGzbNkyU15ebnJycsz8+fPNxo0bXaq2p4PV3t7ebhYtWmTKyspMIBAwEydONJdccompra11seLhc7Dft8bY/fOTLlnh1Ywwxhs54dWMIB/cQT54U7rkgTHezQQv5IExZIIbyIT0wDmDPciJ4efVrDDG23lhDJmB4ZVueWCMdzPBGG/lgjFkg9u8nA/G2JURPmOMGfr5cwAAAAAAAAAAAAAA3Mee4gAAAAAAAAAAAACAtEVTHAAAAAAAAAAAAACQtmiKAwAAAAAAAAAAAADSFk1xAAAAAAAAAAAAAEDaoikOAAAAAAAAAAAAAEhbNMUBAAAAAAAAAAAAAGmLpjgAAAAAAAAAAAAAIG3RFAcAAAAAAAAAAAAApC2a4gAAAAAAAAAAAACAtEVTHLDEr3/9ay1atMjtMgZk1apVKi4uHtbX+PnPf64zzjhjWF8DAGxGLvRELgAA2XAgsgEAyIYDkQ0AQDYciGyAg6Y40prP5zvonxUrVuj999/v8VhJSYkWLVqkN998s8fnevfdd3X22WerrKxMOTk5mjZtmr797W+rvb1dkvT8888f8vWef/75Puvs6OjQsmXLtHz58uE+JJKkyZMna/Xq1SkJnA/roosu0htvvKG//vWvbpcCII2QC30jFwBkMrKhb2QDgExGNvSNbACQyciGvpEN8BKa4khrNTU1yT+33367CgsLezx23XXXJZ+7evVq1dTU6M9//rPa2tp02mmnqampSZL08ssva86cOQqFQnr88ce1adMm3XzzzVq1apUWLlyoUCik448/vsfnPvvss7V48eIejx1//PF91vmHP/xBhYWFOuGEE4b9mKxfv16NjY06+eSTh/21PopgMKjzzjtPd9xxh9ulAEgj5EJv5AKATEc29EY2AMh0ZENvZAOATEc29EY2wHMMkCHuvvtuU1RU1Ovxbdu2GUnmzTffTD720ksvGUnmqaeeMrFYzMyYMcPMnj3bRKPRHh/71ltvGZ/PZ2699dZen/fCCy80S5YsGVBtp59+urnuuuv6/Pibb77ZjBkzxhQVFZmbbrrJhMNhc91115lRo0aZsWPHmt/85jfJj+ns7DSXX365qaioMDk5OWbChAnmu9/9bo/Pu3LlSnPOOeeY5557zkjq8Wf58uXGGGP27t1rvvCFL5ji4mKTl5dnFi9ebDZt2tTvsayvrzezZs0yn/vc50xHR4eJRqPmu9/9rpk0aZLJzc01Rx99tHn44YeTz3dee/Xq1WbWrFkmLy/PzJ071/zjH//oUesLL7xggsGgaW9vH9BxBIDBIBfiyAUA6EI2xJENANCFbIgjGwCgC9kQRzbAa5gUB/qQl5cnSQqFQnrrrbe0YcMGXXvttcrK6vkjc8wxx2jBggV64IEHPtLrvfjii5o9e3avx5999llVV1frL3/5i374wx9q+fLl+uxnP6tRo0bplVde0aWXXqovfelL2rlzpyTpjjvu0KOPPqqHHnpIGzdu1H333adJkyb1+JyPPvqolixZouOPP77XHW3O3Wxf/OIXtW7dOj366KNau3atjDH6zGc+o3A43KvGDz74QCeddJJmzpypP/zhD8rJydEtt9yi3/72t/r5z3+ud999V9dcc40uuOACvfDCCz0+9pvf/KZ+8IMfaN26dfL7/brooot6vH/27NmKRCJ65ZVXPsrhBYCPjFwgFwDgQGQD2QAAByIbyAYAOBDZQDbAIq625IEUGujdW42Njebzn/+8KSgoMLW1tebBBx/sdXdXd1dddZXJy8vr9fhA795qbGw0ksxf/vKXXh8/ceLEHneMTZ8+3Zx00knJtyORiBkxYoR54IEHjDHGXHnllWbevHkmFov1+Vo7d+40wWDQNDY2GmP6PiabNm0yksxLL72UfKyhocHk5eWZhx56qMfH/eMf/zDjx483V111VfI1Ozo6TH5+vvnb3/7W4/NefPHF5txzzzXG9Lx7y/H4448bSWb//v09Pm7UqFFm1apVfR88APgIyAVyAQAORDaQDQBwILKBbACAA5ENZAO8yZ+KxjvgBccff7yysrK0b98+HXbYYfr973+v8vLy5PuNMcPyuvv375ck5ebm9nrfUUcd1eOOsfLycs2cOTP5dnZ2tkpKSlRfXy8pftfVwoULNX36dC1evFif/exntWjRouTzH330UZ144okqLi7ut5733ntPfr9fc+bMST5WUlKi6dOn67333utR90knnaTzzjtPt99+e/LxLVu2qL29XQsXLuzxeUOhkD7+8Y/3eOzoo49O/r2yslKSVF9frwkTJiQfz8vLU3t7e7/1AsBwIRfiyAUA6EI2xJENANCFbIgjGwCgC9kQRzbANjTFgYTf//73mjFjhkpKSnr8Ip82bZqk+C/wA3/ROo87z/kwSkpK5PP51NjY2Ot9gUCgx9s+n6/Px2KxmCTp2GOP1bZt2/Tkk09q9erVOvvss7VgwQL94Q9/kBQPqn/913/90LV2l5OTowULFuj//u//9LWvfU1jx46VJLW1tUmSHn/88eRj3T+mv6/P5/NJUvJrcezdu1dlZWVDUjMADAa5MDjkAoBMQDYMDtkAIBOQDYNDNgDIBGTD4JANSBX2FAcSxo8fr8MPP7zXnU0f+9jHdMQRR+hHP/pRr1+ib7/9tlavXq1zzz33Q79uMBjUjBkztGHDhg/9OborLCzUOeeco1/+8pf6/e9/r//5n//R3r171dbWpueee05Llizp8drRaLTHxx955JG99tbYs2ePNm7cqBkzZiQfy8rK0u9+9zvNmjVLp5xyiqqrqyVJM2bMUE5Ojnbs2KEpU6b0+DN+/PhBfS1bt25VR0dHn/+DAADDjVyIIxcAoAvZEEc2AEAXsiGObACALmRDHNkA29AUBw7B5/Pp17/+tTZs2KClS5fq1Vdf1Y4dO/Twww/rjDPO0Ny5c3X11Vd/pNc49dRT9eKLL37kWn/4wx/qgQce0D/+8Q9t2rRJDz/8sCoqKlRcXKynnnpK06ZN06RJk5LPnzRpktra2rRmzRo1NDSovb1dU6dO1ZIlS3TJJZfoxRdf1Ntvv60LLrhAY8eO7RFyUnxJlfvuu0/HHHOM5s2bp9raWo0cOVLXXXedrrnmGt1zzz3aunWr3njjDf3kJz/RPffcM6iv569//asOO+wwHX744R/52ADAUCEXyAUAOBDZQDYAwIHIBrIBAA5ENpANcBdNcWAAjj/+eL388svKzs7WaaedpilTpuiGG27QhRdeqGeeeabXUh2DdfHFF+uJJ55Qc3PzR/o8I0eO1G233abZs2frE5/4hN5//3098cQTysrK0p/+9Kdey5kcf/zxuvTSS3XOOeeorKxMt912myTp7rvv1qxZs/TZz35Wc+fOlTFGTzzxRK/lVCTJ7/frgQce0FFHHaV58+apvr5e3/nOd7Rs2TLdcsstOvLII7V48WI9/vjjmjx58qC+ngceeECXXHLJhz8gADBMyAVyAQAORDaQDQBwILKBbACAA5ENZAPc4zPGGLeLACCdddZZOvbYY3XDDTcM+eeORCIqLy/Xk08+qeOOO27IP/9wePfddzVv3jxt2rRJRUVFbpcDAClHLvRELgAA2XAgsgEAyIYDkQ0AQDYciGyAg0lxwBLf//73VVBQMCyfe+/evbrmmmv0iU98Ylg+/3CoqanRb3/7W0IKQMYiF3oiFwCAbDgQ2QAAZMOByAYAIBsORDbAwaQ4AAAAAAAAAAAAACBtMSkOAAAAAAAAAAAAAEhbNMUBAAAAAAAAAAAAAGmLpjgAAAAAAAAAAAAAIG3RFAcAAAAAAAAAAAAApC2a4gAAAAAAAAAAAACAtEVTHAAAAAAAAAAAAACQtmiKAwAAAAAAAAAAAADSFk1xAAAAAAAAAAAAAEDaoikOAAAAAAAAAAAAAEhb/z8c3Pk+fvR86QAAAABJRU5ErkJggg==",
+      "text/plain": [
+       "<Figure size 2000x500 with 5 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Create a list to store the throughput and tpot data\n",
+    "throughput_tpot_data = []\n",
+    "\n",
+    "# Iterate over the models, batch sizes, and arrival rates to calculate throughput and tpot\n",
+    "for ssm in small_model_names:\n",
+    "    for batch_size in batch_sizes:\n",
+    "        for arrival_rate in arrival_rates:\n",
+    "            model_name = ssm.replace(\"/\", \"-\")\n",
+    "            filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "            if os.path.exists(filepath):\n",
+    "                throughput = get_throughput(filepath)\n",
+    "                tpot = get_tpot(filepath)\n",
+    "                throughput_tpot_data.append({\n",
+    "                    'Model': model_name,\n",
+    "                    'Batch Size': batch_size,\n",
+    "                    'Arrival Rate': arrival_rate,\n",
+    "                    'Throughput': throughput,\n",
+    "                    'TPOT': tpot\n",
+    "                })\n",
+    "\n",
+    "# add incremental decoding entry\n",
+    "for batch_size in batch_sizes:\n",
+    "    for arrival_rate in arrival_rates:\n",
+    "        model_name = ssm.replace(\"/\", \"-\")\n",
+    "        filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "        if os.path.exists(filepath):\n",
+    "            throughput = get_throughput(filepath)\n",
+    "            tpot = get_tpot(filepath)\n",
+    "            throughput_tpot_data.append({\n",
+    "                'Model': \"Incr Dec (baseline)\",\n",
+    "                'Batch Size': batch_size,\n",
+    "                'Arrival Rate': arrival_rate,\n",
+    "                'Throughput': throughput,\n",
+    "                'TPOT': tpot\n",
+    "            })\n",
+    "\n",
+    "# Convert the list to a DataFrame\n",
+    "throughput_tpot_df = pd.DataFrame(throughput_tpot_data)\n",
+    "\n",
+    "# Plot the data\n",
+    "fig, axes = plt.subplots(nrows=1, ncols=len(arrival_rates), figsize=(20, 5), sharey=True)\n",
+    "\n",
+    "for i, arrival_rate in enumerate(arrival_rates):\n",
+    "    ax = axes[i]\n",
+    "    for model_name in throughput_tpot_df['Model'].unique():\n",
+    "        model_data = throughput_tpot_df[(throughput_tpot_df['Model'] == model_name) & (throughput_tpot_df['Arrival Rate'] == arrival_rate)]\n",
+    "        ax.plot(model_data['TPOT'], model_data['Throughput'], marker='o', label=model_name)\n",
+    "        ax.set_title(f'Arrival Rate: {arrival_rate} {\"requests/sec\" if arrival_rate != \"offline\" else \"\"}')\n",
+    "        ax.set_xlabel('TPOT (ms/token)')\n",
+    "        ax.set_ylabel('Output Throughput (tokens/sec)')\n",
+    "        ax.grid(True)\n",
+    "    if i == 0:\n",
+    "        ax.legend(title='Model')\n",
+    "\n",
+    "plt.suptitle('Throughput vs TPOT for Different Arrival Rates\\nLLM: LLAMA-3.1-70B-Instruct\\nBatch Sizes: 4, 8')\n",
+    "plt.tight_layout(rect=[0, 0, 1, 0.96])\n",
+    "\n",
+    "# Save the plot as a PDF\n",
+    "plt.savefig('/usr/FlexFlow/wildchat/throughput_vs_tpot.pdf')\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return ttft.mean()[1] / 1000\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Model  Batch Size Arrival Rate        TTFT\n",
+      "0  Zhuominc-Llama-3-330M           4      offline  236.037453\n",
+      "1  Zhuominc-Llama-3-330M           4            1  239.494513\n",
+      "2  Zhuominc-Llama-3-330M           4            2  236.035863\n",
+      "3  Zhuominc-Llama-3-330M           4            4  237.153932\n",
+      "4  Zhuominc-Llama-3-330M           4            8  237.309231\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAABVwAAALvCAYAAACZeQ7oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADqJ0lEQVR4nOzdeVyN6f8/8NeptC9atJFKoiSh7EP2KCR7tsq+CxmMqGQdu5mxLxlK9u0jS4OIjGXsNJgohpQ1khZ1//7o1/11nJZTIqbX8/s4j++c676W93Wfuz68Xee6JIIgCCAiIiIiIiIiIiKiz6ZQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUsKEKxEREREREREREVEpYcKViIiIiIiIiIiIqJQw4UpERCJvb29YWFh80TEkEgkCAwO/6BhlYeHChahWrRoUFRVRt27dsg5HRsuWLdGyZUupsqSkJPTo0QP6+vqQSCRYtmwZAODevXto3749dHR0IJFIsG/fvq8e739NfvdfXhYWFvD29i7VeL60z5lvWbOwsECnTp2++DiBgYGQSCRffBwiIiIi+vqYcCWib45EIinVV1RUFOLj4wu83rhxY/EvvkW9vpUEwo8//giJRILevXuXdShfxKefl4KCAvT09NCxY0ecO3euxP2uXLkSISEhpRfo/3fs2DH8+OOPaNasGTZt2oS5c+eW+hgf8/b2lro/mpqaqFatGnr06IHdu3cjJydHrn4mTJiAo0ePYtq0adiyZQs6dOgAAPDy8sKNGzcwZ84cbNmyBU5OTl9yOp9l7ty5cieEP36uZs+enW+dfv36ifeUvjwLCwupZ1lVVRXW1taYPHkyXr58WaI+Y2JiEBgYiNevX5dusHJKTU1FQEAAateuDQ0NDejr66Nu3boYP348njx5UiYxEREREdHXpVTWARARfWrLli1S73///XdERkbKlGdnZ0NRUbHIera2tnj//j0AwNPTE66urlLXK1WqBBMTE1SvXl0sS01NxciRI+Hh4YFu3bqJ5UZGRp83uVIgCAK2bdsGCwsLHDx4EG/fvoWWllap9L1u3Tq5k3VfQ97nlZ2djbt372LlypVo1aoVLl68CHt7+2L3t3LlShgYGJT6asETJ05AQUEBGzZsgLKycqn2XRAVFRWsX78eAPD+/XskJCTg4MGD6NGjB1q2bIn9+/dDW1tbrH/s2LF843Z3d4efn59Y9v79e5w7dw7Tp0/HmDFjvvxEPtPcuXPRo0cPdO3aVe42qqqq2LZtG/z9/aXK3717h/3790NVVbWUo6TC1K1bF5MmTQIApKen46+//sKyZctw6tQpXLhwodj9xcTEICgoCN7e3qhYsWIpR1u4rKwstGjRAn///Te8vLwwduxYpKam4tatWwgLC4OHhwdMTU0BAP7+/pg6depXjY+IiIiIvg4mXInom9O/f3+p93/++SciIyNlyj9VWL34+HgAQP369Qvsp06dOuJ/P3/+HCNHjkSdOnWKHPdri4qKwr///osTJ07AxcUFe/bsgZeXV5Ht0tPToaysDAUF2S83vHv3DhoaGqhQocKXCLnEPv28mjdvjo4dO2LVqlVYuXJlGUYmLTk5GWpqaqWWbBUEAenp6VBTUyuwjpKSksyzOXv2bMyfPx/Tpk3D0KFDsX37dvFafrElJyfLJKSePXsGAKWaqCrs2SsLrq6u2LNnD65duwYHBwexfP/+/cjMzESHDh1w4sSJMoywfKlcubLUszxkyBBoampi0aJFuHfvHqytrcswuuLZt28frly5gtDQUPTt21fqWnp6OjIzM8X3SkpKUFLiH8WJiIiI/ou+jb/5EBF953bt2gWJRIJTp07JXFuzZg0kEglu3rwJAHj69Cl8fHxQpUoVqKiowMTEBO7u7mJSuCihoaGoVasWWrVqhbZt2yI0NFSmTlRUFCQSCcLDw+Hv74/KlStDXV0db968gbe3NzQ1NREXFwdXV1doaWmhX79+AKT3cM3KyoKenh58fHxk+n/z5g1UVVXFlZGZmZmYOXMmHB0doaOjAw0NDTRv3hwnT56Ua07yat68OQAgLi5OqnzTpk1o3bo1DA0NoaKiglq1amHVqlVSdSwsLHDr1i2cOnUq3y0iXr9+DV9fX5iZmUFFRQXVq1fHggULilzxK5FIsGnTJrx7907sN2/bgg8fPiA4OBhWVlZQUVGBhYUFfvrpJ2RkZMjE1qlTJxw9ehROTk5QU1PDmjVrSnSPpk6divbt22Pnzp24e/euWP7xnpohISGQSCQQBAG//fabGHdgYCDMzc0BAJMnT4ZEIpHa0/fx48cYNGgQjIyMoKKiAjs7O2zcuFFq/MKePQA4f/48OnToAB0dHairq8PZ2Rlnz56V6iNvi49//vlHXKWoo6MDHx8fpKWlSd37d+/eYfPmzeIc5Fm93KRJE1haWiIsLEyqPDQ0FB06dICenl6+7VauXAk7OzuoqKjA1NQUo0ePzvdr62vXroWVlRXU1NTQsGFDREdH59tfRkYGAgICUL16daioqMDMzAw//vijzPPxqaysLAQFBcHa2hqqqqrQ19fHDz/8gMjIyELbvXz5En5+frC3t4empia0tbXRsWNHXLt2Tape3me4Y8cOzJkzB1WqVIGqqiratGmDf/75p8TzLQ5jY2MAkEpIXr9+Hd7e3qhWrRpUVVVhbGyMQYMG4cWLF2KdwMBATJ48GQBgaWkpPhcf/37dunUrGjZsCHV1dejq6qJFixb5rgA/c+YMGjZsCFVVVVSrVg2///57kXHn/W5q1qyZzDVVVVWpVeef7uH66VYhH78+3vda3ucmMjISP/zwAypWrAhNTU3UrFkTP/30U5FzICIiIqLPx39WJ6JyJS0tDc+fP5cq09HR+eyVnW5ubtDU1MSOHTvg7OwsdW379u2ws7ND7dq1AQDdu3fHrVu3MHbsWFhYWCA5ORmRkZF4+PBhkQdWZWRkYPfu3eLXbz09PeHj44OnT5+KCYqPBQcHQ1lZGX5+fsjIyBBXOX748AEuLi744YcfsGjRIqirq8u0rVChAjw8PLBnzx6sWbNGaoXkvn37kJGRgT59+gDITcCuX78enp6eGDp0KN6+fYsNGzbAxcUFFy5cKLVDpPKSJrq6ulLlq1atgp2dHbp06QIlJSUcPHgQo0aNQk5ODkaPHg0AWLZsGcaOHQtNTU1Mnz4dwP9tEZGWlgZnZ2c8fvwYw4cPR9WqVRETE4Np06YhMTFRPEwqP1u2bMHatWtx4cIF8Sv+TZs2BZC7Um/z5s3o0aMHJk2ahPPnz2PevHmIjY3F3r17pfq5c+cOPD09MXz4cAwdOhQ1a9Ys8X0aMGAAjh07hsjISNSoUUPmeosWLbBlyxYMGDAA7dq1w8CBAwHkrvKuWLEiJkyYIG7nkLeXaVJSEho3bgyJRIIxY8agUqVKOHz4MAYPHow3b97A19dXaoz8nr0TJ06gY8eOcHR0REBAABQUFMRkeXR0NBo2bCjVR69evWBpaYl58+bh8uXLWL9+PQwNDbFgwQLx3g8ZMgQNGzbEsGHDAABWVlZy3SNPT09s3boV8+fPh0QiwfPnz3Hs2DFs2bIFR44ckakfGBiIoKAgtG3bFiNHjsSdO3ewatUqXLx4EWfPnhV/h2zYsAHDhw9H06ZN4evri/v376NLly7Q09ODmZmZ2F9OTg66dOmCM2fOYNiwYbC1tcWNGzewdOlS3L17t9B9aQMDAzFv3jxx7m/evMGlS5dw+fJltGvXrsB29+/fx759+9CzZ09YWloiKSkJa9asgbOzM27fvi1+1T3P/PnzoaCgAD8/P6SkpODnn39Gv379cP78ebGOvPMtTFZWlvh7OT09HVeuXMGSJUvQokULWFpaivUiIyNx//59+Pj4wNjYGLdu3cLatWtx69Yt/Pnnn5BIJOjWrRvu3r2Lbdu2YenSpTAwMACQu3UMAAQFBSEwMBBNmzbFrFmzoKysjPPnz+PEiRNo3769ONY///yDHj16YPDgwfDy8sLGjRvh7e0NR0dH2NnZFTiXvH+w+P333+Hv71+sQ7GGDx+Otm3bSpUdOXIEoaGhMDQ0BCD/c3Pr1i106tQJderUwaxZs6CiooJ//vlH5h83iIiIiOgLEYiIvnGjR48W5Pl1VVi9Bw8eCADyfZ08eVKm/rNnzwQAQkBAgNxxenp6CoaGhsKHDx/EssTEREFBQUGYNWuWIAiC8OrVKwGAsHDhQrn7/diuXbsEAMK9e/cEQRCEN2/eCKqqqsLSpUul6p08eVIAIFSrVk1IS0uTuubl5SUAEKZOnSrTv5eXl2Bubi6+P3r0qABAOHjwoFQ9V1dXoVq1auL7Dx8+CBkZGVJ1Xr16JRgZGQmDBg2SKpfnvuZ9XkFBQcKzZ8+Ep0+fCtHR0UKDBg0EAMLOnTul6n86R0EQBBcXF6kYBUEQ7OzsBGdnZ5m6wcHBgoaGhnD37l2p8qlTpwqKiorCw4cPC43Xy8tL0NDQkCq7evWqAEAYMmSIVLmfn58AQDhx4oRYZm5uLgAQjhw5Uug4hY33sStXrggAhAkTJohlzs7OMnMHIIwePVqqLO/ef/qMDh48WDAxMRGeP38uVd6nTx9BR0dH/AwKevZycnIEa2trwcXFRcjJyRHL09LSBEtLS6Fdu3ZiWUBAgABA5tnx8PAQ9PX1pco0NDQELy+vAu9FQXO7efOmAECIjo4WBEEQfvvtN0FTU1N49+6dzP1NTk4WlJWVhfbt2wvZ2dli+a+//ioAEDZu3CgIgiBkZmYKhoaGQt26daV+HtauXSsAkLr/W7ZsERQUFMTx86xevVoAIJw9e1YsMzc3l5qjg4OD4ObmJtecP5aeni4Vf949UVFREX9HCcL/fYa2trZS81i+fLkAQLhx40ax51uQvGf/01ezZs1knrX8fs63bdsmABBOnz4tli1cuFAAIDx48ECq7r179wQFBQXBw8ND5j58/EzmxfRxn8nJyYKKioowadKkQueTlpYm1KxZUwAgmJubC97e3sKGDRuEpKQkmbp5z3lB7t27J+jo6Ajt2rUT/3dF3udm6dKlAgDh2bNnhcZLRERERF8GtxQgonJl2LBhiIyMlHp9vIfj5+jduzeSk5MRFRUllu3atQs5OTno3bs3AIj7fEZFReHVq1fFHiM0NBROTk7iAV9aWlpwc3PLd1sBIPe0+YL2AR05cmSR47Vu3RoGBgZSe4G+evUKkZGR4pwAQFFRUVwBm5OTg5cvX+LDhw9wcnLC5cuX5Z7fpwICAlCpUiUYGxujefPmiI2NxeLFi9GjRw+peh/PMSUlBc+fP4ezszPu37+PlJSUIsfZuXMnmjdvDl1dXTx//lx8tW3bFtnZ2Th9+nSxY4+IiAAATJw4Uao8b3XyoUOHpMotLS3h4uJS7HHyk7cq9e3bt6XSnyAI2L17Nzp37gxBEKTukYuLC1JSUmQ+50+fvatXr+LevXvo27cvXrx4IbZ/9+4d2rRpg9OnT8ts3zBixAip982bN8eLFy/E7Qk+h52dHerUqYNt27YBAMLCwuDu7p7vau8//vgDmZmZ8PX1ldqHdujQodDW1hY/y0uXLiE5ORkjRoyQWhHu7e0NHR0dqT537twJW1tb2NjYSN3P1q1bA0Ch23FUrFgRt27dwr1794o1ZxUVFTH+7OxsvHjxQvyqeX4/pz4+PlLzyNvS4/79+8Web2EaNWok/j7+3//+hzlz5uDWrVvo0qWLeOAhIP1znp6ejufPn6Nx48YAINfvmX379iEnJwczZ86U2U/405WotWrVEucL5K6QrVmzpjj3gqipqeH8+fPitgYhISEYPHgwTExMMHbs2CK3i8jz7t07eHh4QFdXF9u2bRMPiJT3ucnbg3n//v3f1EGIREREROUFtxQgonLF2tpa5iubpSVvX8rt27ejTZs2AHK3E6hbt674tW4VFRUsWLAAkyZNgpGRERo3boxOnTph4MCB+W4J8LHXr18jIiICY8aMkdpHsVmzZti9ezfu3r0r8/Xxj7+O+zElJSVUqVKlyDkpKSmhe/fuCAsLQ0ZGBlRUVLBnzx5kZWVJJVwBYPPmzVi8eDH+/vtvZGVlFRmDPIYNG4aePXsiPT0dJ06cwIoVK5CdnS1T7+zZswgICMC5c+ek9vgEchOwRSV/7t27h+vXr4tfO/5UcnJysWNPSEiAgoKCmBzPY2xsjIoVKyIhIUGq/HPu06dSU1MB5CbkS8OzZ8/w+vVrrF27FmvXrs23zqf36NP55CUHCzvgLSUlRWq7iKpVq0pdz7v26tUrqb0wS6pv375YvHgxJkyYgJiYmAL3t8z7rD7d5kFZWRnVqlUTr+f9/08PeapQoQKqVasmVXbv3j3ExsaW6JmbNWsW3N3dUaNGDdSuXRsdOnTAgAEDpA7+y09OTg6WL1+OlStX4sGDB1I/S/r6+jL1C7v/QPHmWxgDAwOp38tubm6oWbMmevTogfXr12Ps2LEAcvegDQoKQnh4uMz9kecfVuLi4qCgoIBatWoVWffTuQO585fnH8p0dHTw888/4+eff0ZCQgKOHz+ORYsW4ddff4WOjg5mz55dZB9Dhw5FXFwcYmJipD4beZ+b3r17Y/369RgyZAimTp2KNm3aoFu3bujRo8c3c3gdERER0X8ZE65ERKVERUUFXbt2xd69e7Fy5UokJSXh7NmzmDt3rlQ9X19fdO7cGfv27cPRo0cxY8YMzJs3DydOnEC9evUK7H/nzp3IyMjA4sWLsXjxYpnroaGhCAoKkioraHXrxyvditKnTx+sWbMGhw8fRteuXbFjxw7Y2NhIrQzeunUrvL290bVrV0yePBmGhoZQVFTEvHnzZA64Ko6PE+SdOnWCoqIipk6dilatWsHJyQlAbhKlTZs2sLGxwZIlS2BmZgZlZWVERERg6dKlcq3uysnJQbt27fDjjz/mez2/fVDlJe8ejgV9ViWRd0Dbp8neksq7h/379y8wYfppsu/T+eT1sXDhwgL39M1bmZsnb1XfpwRBKDJmeXh6emLatGkYOnQo9PX1pfbw/NJycnJgb2+PJUuW5Hu9sP1PW7Rogbi4OOzfvx/Hjh3D+vXrsXTpUqxevRpDhgwpsN3cuXMxY8YMDBo0CMHBwdDT04OCggJ8fX3z/Tn50ve/MHn/aHX69Gkx4dqrVy/ExMRg8uTJqFu3LjQ1NZGTk4MOHTqU+irO0pq7ubk5Bg0aBA8PD1SrVg2hoaFFJlyXL1+Obdu2YevWrTI/K/I+N2pqajh9+jROnjyJQ4cO4ciRI9i+fTtat26NY8eOFTg/IiIiIiodTLgSEZWi3r17Y/PmzTh+/DhiY2MhCILMSlAg92CfSZMmYdKkSbh37x7q1q2LxYsXY+vWrQX2HRoaitq1ayMgIEDm2po1axAWFiaTcC0NLVq0gImJCbZv344ffvgBJ06cEA+dyrNr1y5Uq1YNe/bskUow5hfr55g+fTrWrVsHf39/8WCjgwcPIiMjAwcOHJBalZbfV7ILSn5aWVkhNTW1VFc/m5ubIycnB/fu3YOtra1YnpSUhNevX4uH63wJW7ZsgUQiKfQApeKoVKkStLS0kJ2dXeJ7lHeYlba2dqne5+IcSvSpqlWrolmzZoiKisLIkSOhpJT/H4vyPqs7d+5IrdzMzMzEgwcPxPnk1bt37574FW8g91CoBw8eSP0jhZWVFa5du4Y2bdqUaA56enrw8fGBj48PUlNT0aJFCwQGBhaacN21axdatWqFDRs2SJW/fv1aPFyqOIoz3+L68OEDgP9brf3q1SscP34cQUFBmDlzplgvv20VCvs5z8nJwe3bt0vtID956erqwsrKSvzHkIJER0fDz88Pvr6+6Nevn8z14jw3CgoKaNOmDdq0aYMlS5Zg7ty5mD59Ok6ePPnFvulBRERERLn4nSIiolLUtm1b6OnpYfv27di+fTsaNmwo9dXqtLQ0pKenS7WxsrKClpZWoXv7PXr0CKdPn0avXr3Qo0cPmZePjw/++ecfqdPDS4uCggJ69OiBgwcPYsuWLfjw4YNMEjlvtdTHq7/Onz+Pc+fOlWosFStWxPDhw3H06FFcvXq1wLFTUlKwadMmmfYaGhp4/fq1THmvXr1w7tw5HD16VOba69evxeRPcbi6ugIAli1bJlWetzLNzc2t2H3KY/78+Th27Bh69+4t81XvklJUVET37t2xe/fufBNGz549K7IPR0dHWFlZYdGiRWISrbh95Kegz1Res2fPRkBAgLiKMj9t27aFsrIyVqxYIfWcbdiwASkpKeJn6eTkhEqVKmH16tXIzMwU64WEhMjE2KtXLzx+/Bjr1q2TGe/9+/d49+5dgfG8ePFC6r2mpiaqV69e5P6gioqKMis0d+7cicePHxfariDFmW9xHTx4EADEpG1+P+eA7M8XkPtMAJCJoWvXrlBQUMCsWbNkVsSW1qrda9eu4fnz5zLlCQkJuH37tsy2FB9LTExEr1698MMPP2DhwoX51pH3uXn58qXM9bwks7z7yBIRERFRyXGFKxFRKapQoQK6deuG8PBwvHv3DosWLZK6fvfuXbRp0wa9evVCrVq1oKSkhL179yIpKQl9+vQpsN+wsDAIgoAuXbrke93V1RVKSkoIDQ1Fo0aNSnVOQO7K3V9++QUBAQGwt7eXWrEJ5H7df8+ePfDw8ICbmxsePHiA1atXo1atWvkm1z7H+PHjsWzZMsyfPx/h4eFo3749lJWV0blzZwwfPhypqalYt24dDA0NkZiYKNXW0dERq1atwuzZs1G9enUYGhqidevWmDx5Mg4cOIBOnTrB29sbjo6OePfuHW7cuIFdu3YhPj6+2CsAHRwc4OXlhbVr1+L169dwdnbGhQsXsHnzZnTt2hWtWrX6rPvw4cMHcUV0eno6EhIScODAAVy/fh2tWrUqcK/Vkpo/fz5OnjyJRo0aYejQoahVqxZevnyJy5cv448//sg3wfMxBQUFrF+/Hh07doSdnR18fHxQuXJlPH78GCdPnoS2traYZCsOR0dH/PHHH1iyZAlMTU1haWlZrJ8BZ2dnODs7F1qnUqVKmDZtGoKCgtChQwd06dIFd+7cwcqVK9GgQQP0798fQO7P/+zZszF8+HC0bt0avXv3xoMHD7Bp0yaZPU0HDBiAHTt2YMSIETh58iSaNWuG7Oxs/P3339ixYweOHj0qbpvxqVq1aqFly5ZwdHSEnp4eLl26hF27dmHMmDGFzqNTp06YNWsWfHx80LRpU9y4cQOhoaHF2m/1Y8WZb2EeP34sPsuZmZm4du0a1qxZAwMDAzERrq2tjRYtWuDnn39GVlYWKleujGPHjuHBgwcy/Tk6OgLIXRHfp08fVKhQAZ07d0b16tUxffp0BAcHo3nz5ujWrRtUVFRw8eJFmJqaYt68eSW6Dx+LjIxEQEAAunTpgsaNG0NTUxP379/Hxo0bkZGRgcDAwALbjhs3Ds+ePcOPP/6I8PBwqWt16tRBnTp15H5uZs2ahdOnT8PNzQ3m5uZITk7GypUrUaVKFfzwww+fPU8iIiIiKoJARPSNGz16tCDPr6vC6j148EAAICxcuFCuMZ89eyYAEAICAooTqiAIghAZGSkAECQSifDo0SOpa8+fPxdGjx4t2NjYCBoaGoKOjo7QqFEjYceOHYX2aW9vL1StWrXQOi1bthQMDQ2FrKws4eTJkwIAYefOnTL1vLy8BA0NjXz78PLyEszNzWXKc3JyBDMzMwGAMHv27Hyvz507VzA3NxdUVFSEevXqCf/73//y7U+e+1rU5+Xt7S0oKioK//zzjyAIgnDgwAGhTp06gqqqqmBhYSEsWLBA2LhxowBAePDggdju6dOngpubm6ClpSUAEJydncVrb9++FaZNmyZUr15dUFZWFgwMDISmTZsKixYtEjIzMwuNt6B7mpWVJQQFBQmWlpZChQoVBDMzM2HatGlCenq6VD1zc3PBzc2t0DE+HQ+A+FJXVxcsLCyE7t27C7t27RKys7Nl2jg7O0vNVxByP4vRo0dLlRV275OSkoTRo0cLZmZmQoUKFQRjY2OhTZs2wtq1a8U6hT17giAIV65cEbp16ybo6+sLKioqgrm5udCrVy/h+PHjYp2AgAABgPDs2TOptps2bZL5TP/++2+hRYsWgpqamgBA8PLyKui2yf17oKDP89dffxVsbGyEChUqCEZGRsLIkSOFV69eydRbuXKlYGlpKaioqAhOTk7C6dOn873/mZmZwoIFCwQ7OztBRUVF0NXVFRwdHYWgoCAhJSVFrGdubi41r9mzZwsNGzYUKlasKKipqQk2NjbCnDlzinxO09PThUmTJgkmJiaCmpqa0KxZM+HcuXMysRX0Gebdv02bNpVovvkxNzeXepYVFBQEQ0NDwdPTU/z5zvPvv/8KHh4eQsWKFQUdHR2hZ8+ewpMnT/L9nRIcHCxUrlxZUFBQkHlmNm7cKNSrV0+8587OzkJkZKRUTPn9PMozp/v37wszZ84UGjduLBgaGgpKSkpCpUqVBDc3N+HEiRNSdfOe84/7//hefPz6eH7yPDfHjx8X3N3dBVNTU0FZWVkwNTUVPD09hbt37xYaPxERERGVDokgfIWTD4iIiIiIiIiIiIjKAe7hSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiL6KkJAQSCQSXLp0qcA68fHxkEgkWLRoUaF9WVhYQCKRoG3btvleX7duHSQSSZHjFSYwMBASiQTPnz8vsE5UVBQkEgl27dold7+9evWCRCLBlClTCu1TIpFg69at+dZp1qwZJBIJateune/17OxsmJqaQiKR4PDhw3LHBgATJkxA/fr1oaenB3V1ddja2iIwMBCpqalytV+1ahV69uyJqlWrQiKRwNvbu1jj5322+b2sra1l6m/YsAG2trZQVVWFtbU1fvnlF5k63t7eUv0oKSnBzMwMffr0we3bt+WKS57n93Pcvn0bgYGBiI+P/yL9fy8xEBERERH9FyiVdQBEREQloaqqipMnT+Lp06cwNjaWuhYaGgpVVVWkp6eXUXT5e/PmDQ4ePAgLCwts27YN8+fPh0QiybeuqqoqwsLC0L9/f6ny+Ph4xMTEQFVVtcBxTpw4gcTERFhYWCA0NBQdO3aUO8aLFy+iefPm8PHxgaqqKq5cuYL58+fjjz/+wOnTp6GgUPi/1S5YsABv375Fw4YNkZiYKPe4eZYtWyaT3E1ISIC/vz/at28vVb5mzRqMGDEC3bt3x8SJExEdHY1x48YhLS1NJqGtoqKC9evXAwA+fPiAuLg4rF69GkeOHMHt27dhampa7FhL0+3btxEUFISWLVvCwsKi3MZARERERPRfwIQrERF9l5o1a4aLFy9i+/btGD9+vFj+77//Ijo6Gh4eHti9e3cZRihr9+7dyM7OxsaNG9G6dWucPn0azs7O+dZ1dXXFgQMH8Pz5cxgYGIjlYWFhMDIygrW1NV69epVv261bt6J+/frw8vLCTz/9hHfv3kFDQ0OuGM+cOSNTZmVlBT8/P1y4cAGNGzcutP2pU6fE1a2amppyjfmxrl27ypTNnj0bANCvXz+x7P3795g+fTrc3NzEFcZDhw5FTk4OgoODMWzYMOjq6or1lZSUZJLXjRs3RqdOnXDo0CEMHTq02LGWFUEQkJ6eDjU1tbIOhYiIiIiI8sEtBYiI6LukqqqKbt26ISwsTKp827Zt0NXVhYuLi0ybrKws/P333yVaeVkaQkND0a5dO7Rq1Qq2trYIDQ0tsK67uztUVFSwc+dOqfKwsDD06tULioqK+bZ7//499u7diz59+qBXr154//499u/f/1lx5612fP36dZF1zc3NC1y1W1JhYWGwtLRE06ZNxbKTJ0/ixYsXGDVqlFTd0aNH4927dzh06FCR/eatjFZSKtm/P3t7e0NTUxOPHz9G165doampiUqVKsHPzw/Z2dlSdcPDw+Ho6AgtLS1oa2vD3t4ey5cvB5C7XUHPnj0BAK1atRK3PoiKigKQe/87deqEo0ePwsnJCWpqalizZo24BUdISIhMbBKJBIGBgVJljx8/xuDBg2FqagoVFRVYWlpi5MiRyMzMLDIGIiIiIiKSHxOuRET03erbty8uXLiAuLg4sSwsLAw9evRAhQoVZOo/fvwYtra2mDZt2tcMEwDw5MkTnDx5Ep6engAAT09P7Nq1C5mZmfnWV1dXh7u7O7Zt2yaWXbt2Dbdu3ULfvn0LHOfAgQNITU1Fnz59YGxsjJYtWxaa2M3Phw8f8Pz5czx58gTHjh2Dv78/tLS00LBhw2L1UxquXLmC2NhYmTlfuXIFAODk5CRV7ujoCAUFBfH6x54/f47nz58jKSkJ586dw4QJE6Cvr49OnTqVOL7s7Gy4uLhAX18fixYtgrOzMxYvXoy1a9eKdSIjI+Hp6QldXV0sWLAA8+fPR8uWLXH27FkAQIsWLTBu3DgAwE8//YQtW7Zgy5YtsLW1Ffu4c+cOPD090a5dOyxfvhx169YtVpxPnjxBw4YNER4ejt69e2PFihUYMGAATp06hbS0NLliICIiIiIi+XBLASIi+m61bt0axsbG2LZtG/z9/REbG4urV69i+fLluH//flmHJ2Xbtm1QUVGBu7s7AKBPnz6YOXMmIiIi8v0aPZCbUO7cuTMePXoEMzMzhIaGolq1aoV+rX/r1q1o2rQpzMzMxHFGjRqFZ8+eoVKlSnLFeunSJTRp0kR8X7NmTRw4cAB6enpyzrb05CWLP95OAAASExOhqKgIQ0NDqXJlZWXo6+vjyZMnUuXv3r2TmX/lypVx7Ngxue9LftLT09G7d2/MmDEDADBixAjUr18fGzZswMiRIwEAhw4dgra2No4ePZrvyuRq1aqhefPmWLFiBdq1a4eWLVvK1Pnnn39w5MgRqZXbxTncatq0aXj69CnOnz8vlaSeNWsWBEFAxYoVi4yBiIiIiIjkwxWuRET03VJUVESvXr3EVaChoaEwMzND8+bN861vYWEBQRDy/Qr2lxYaGgo3NzdoaWkBAKytreHo6Fjo6tP27dtDT08P4eHhEAQB4eHh4grZ/Lx48QJHjx6VqtO9e3dIJBLs2LFD7lhr1aqFyMhI7Nu3Dz/++CM0NDRkDrL6GnJychAeHo569erJrLR8//49lJWV822nqqqK9+/fy5RFRkYiMjISR48exZo1a6CpqQlXV1fcvXv3s+IcMWKE1PvmzZtLJfwrVqyId+/eITIyssRjWFpa5rtNhjxycnKwb98+dO7cWWZFMIBS3wKCiIiIiKi84wpXIiL6rvXt2xcrVqzAtWvXEBYWhj59+nxzCaTY2FhcuXIFAwcOxD///COWt2zZEr/99hvevHkDbW1tmXYVKlRAz549ERYWhoYNG+LRo0eFbiewfft2ZGVloV69elLjNGrUCKGhoRg9ejQA4OXLl1JbGaipqUFHR0d8r62tjbZt2wLI3Us2LCwM7u7uuHz5MhwcHEp+I/6/9+/fIyUlRaosbz/Vj506dQqPHz/GhAkTZK6pqakVuB1DfgdKKSoqinPK4+rqCmtra0ybNk080OzZs2dSdfT09ApM7AK5idxPV8jq6upKHWg2atQo7NixAx07dkTlypXRvn179OrVCx06dCiw309ZWlrKXfdTz549w5s3b1C7du0S90FERERERPLjClciIvquNWrUCFZWVvD19cWDBw8KTUiWla1btwIAJkyYAGtra/G1ePFipKenY/fu3QW27du3L65evYrAwEA4ODigVq1aBdbNWy3brFkzqXHOnDmDc+fOiasuu3XrBhMTE/E1fvz4QuPv1q0bgNyDn0rD9u3bpcY3MTEpcD4KCgr5ruo1MTFBdnY2kpOTpcozMzPx4sULmJqaFhlHlSpVULNmTZw+fRoA8OjRI5m4YmJiCu2joMPLPmZoaIirV6/iwIED6NKlC06ePImOHTvCy8uryLZ5Pk0gAwWvTP30wC4iIiIiIvq6uMKViIi+e56enpg9ezZsbW2LfZjQlyYIAsLCwtCqVSuMGjVK5npwcDBCQ0Ph4+OTb/sffvgBVatWRVRUFBYsWFDgOA8ePEBMTAzGjBkDZ2dnqWs5OTkYMGAAwsLC4O/vj8WLF0utwCwqOZmRkYGcnByZVakl5eLiUuTX6zMyMrB79260bNky3/jyPudLly7B1dVVLL906RJycnLkfg4+fPggbpdgbGwsE1dprOgFcveW7dy5Mzp37oycnByMGjUKa9aswYwZM1C9evUSrcrW1dUFALx+/VqqPCEhQep9pUqVoK2tjZs3bxba37e2MpyIiIiI6HvFhCsREX33hgwZAkVFRTRq1KjQellZWYiLi4OOjk6BqypL29mzZxEfH49Zs2ahR48eMtfv3r2LGTNm4MmTJ/kmFiUSCVasWIErV65gwIABBY6Tt7r1xx9/FA/M+tj69esRGhoKf39/ODo65tvH69evoaGhgQoVKsi0BSC1/2daWhoePnwIAwMDGBgYFBhXfgpb1ZonIiICr1+/ljksK0/r1q2hp6eHVatWSSVcV61aBXV1dbi5uRUZx927d3Hnzh3xfqiqqspsO1AaXrx4AX19ffG9goIC6tSpAyA3sQwAGhoaAGSTp4XR1taGgYEBTp8+DV9fX7F85cqVUvUUFBTQtWtXbN26FZcuXZLZx1UQBEgkkhLFQEREREREsphwJSKir2rjxo04cuSITPnHX2s/fvw40tPTZep07do1330ozc3NERgYWOTYjx8/hq2tLby8vOQ+OGvJkiVQV1eXKlNQUMBPP/0kvt+9ezf+/vtvmbZeXl4IDQ2FoqJigQnALl26YPr06QgPD8fEiRPzrePu7g53d/dC4wwNDUXdunXzTbbmjTN27FhcvnwZ9evXz7dOVFQUxo0bhx49esDa2hqZmZmIjo7Gnj174OTkhP79+4t1L1y4gFatWiEgIEDq3h88eBDXrl0DkJvgvn79OmbPni3GkJdoLEpoaChUVFTQvXv3fK+rqakhODgYo0ePRs+ePeHi4oLo6Ghs3boVc+bMgZ6enlT9Dx8+iFs75OTkID4+HqtXr0ZOTg4CAgLkiqmkhgwZgpcvX6J169aoUqUKEhIS8Msvv6Bu3briYWB169aFoqIiFixYgJSUFKioqKB169YwNDQssu/58+djyJAhcHJywunTp/M9BGzu3Lk4duwYnJ2dMWzYMNja2iIxMRE7d+7EmTNnULFixRLHQERERERE0phwJSKir2rVqlX5lnt7e4v/feTIkXyTshYWFl/94J958+bJlCkqKkolXAva29TZ2Rk7d+5E06ZNZRKAeWrXrg1LS0ts3bq1wIRrUS5fvoy///4bM2bMKLBO586dMXbsWGzdurXAhKu9vT1atWqF/fv3IzExEYIgwMrKCjNnzsTkyZMLPTwqz+7du7F582bx/ZUrV3DlyhUAuXumypNwffPmDQ4dOgQ3Nzepw7w+NWrUKFSoUAGLFy/GgQMHYGZmhqVLl+a7J21GRobUCmFtbW00aNAAW7ZsQZs2bYqM6XP0798fa9euxcqVK/H69WsYGxujd+/eCAwMhIJC7nb6xsbGWL16NebNm4fBgwcjOzsbJ0+eLDLZOXPmTDx79gy7du0SD+Y6fPiwTLvKlSvj/PnzmDFjBkJDQ/HmzRtUrlwZHTt2FP9BoaQxEBERERGRNIkgCEJZB0FERERERERERET0X6BQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKiVJZB/AtyMnJwZMnT6ClpQWJRFLW4RAREREREdF/nCAIePv2LUxNTaGgwLVQRET/JUy4Anjy5AnMzMzKOgwiIiIiIiIqZx49eoQqVaqUdRhERFSKmHAFoKWlBSD3f+i0tbXLOJryISsrC8eOHUP79u1RoUKFsg6H6Ivgc07lAZ9zKg/4nFN5wOf863vz5g3MzMzEv48SEdF/BxOugLiNgLa2NhOuX0lWVhbU1dWhra3NP9DRfxafcyoP+JxTecDnnMoDPudlh9vaERH993CjGCIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolHAPVyIiIiIiIiIqVdnZ2cjKyirrMIiISo2ysjIUFORbu8qEKxERERERERGVCkEQ8PTpU7x+/bqsQyEiKlUKCgqwtLSEsrJykXWZcCUiIiIiIiKiUpGXbDU0NIS6ujokEklZh0RE9NlycnLw5MkTJCYmomrVqkX+bmPClYiIiIiIiIg+W3Z2tphs1dfXL+twiIhKVaVKlfDkyRN8+PABFSpUKLQuD80iIiIiIiIios+Wt2erurp6GUdCRFT68rYSyM7OLrIuE65EREREREREVGq4jQAR/RcV53cbE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrEREREREREVE5FBUVBYlEgtevX8vdxsLCAsuWLftiMRWkJLF+Td7e3ujatWup99uyZUv4+vqWer/0ZTHhSkRERERERET0jfH29oZEIsGIESNkro0ePRoSiQTe3t5fP7BvmIWFBSQSCSQSCRQVFWFqaorBgwfj1atXxernayU5s7OzMX/+fNjY2EBNTQ16enpo1KgR1q9fL9bZs2cPgoODv3gsVLqYcCUiIiIiIiIi+gaZmZkhPDwc79+/F8vS09MRFhaGqlWrlmFk365Zs2YhMTERDx8+RGhoKE6fPo1x48aVdVj5CgoKwtKlSxEcHIzbt2/j5MmTGDZsmNQqXj09PWhpaZVdkFQiTLgSEREREREREX2D6tevDzMzM+zZs0cs27NnD6pWrYp69epJ1c3IyMC4ceNgaGgIVVVV/PDDD7h48aJUnYiICNSoUQNqampo1aoV4uPjZcY8c+YMmjdvDjU1NZiZmWHcuHF49+6d3DFfvHgR7dq1g4GBAXR0dODs7IzLly9L1ZFIJFi/fj08PDygrq4Oa2trHDhwoNix5kdLSwvGxsaoXLkyWrVqBS8vL6nxX7x4AU9PT1SuXBnq6uqwt7fHtm3bxOve3t44deoUli9fLq6WzRv71q1b6NSpE7S1taGlpYXmzZsjLi5OavxFixbBxMQE+vr6GD16NLKysgqM9cCBAxg1ahR69uwJS0tLODg4YPDgwfDz8xPrfLzaNm9bhU9fH6903r9/P+rXrw9VVVVUq1YNQUFB+PDhg1z3jkoPE65ERERERERERN+oQYMGYdOmTeL7jRs3wsfHR6bejz/+iN27d2Pz5s24fPkyqlevDhcXF7x8+RIA8OjRI3Tr1g2dO3fG1atXMWTIEEydOlWqj7i4OHTo0AHdu3fH9evXsX37dpw5cwZjxoyRO963b9/Cy8sLZ86cwZ9//glra2u4urri7du3UvWCgoLQq1cvXL9+Ha6urujXr1+xYpXH48ePcfDgQTRq1EgsS09Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuAAAWL58OZo0aYKhQ4ciMTERiYmJMDMzw+PHj9GiRQuoqKjgxIkT+OuvvzBo0CCpZObJkycRFxeHkydPYvPmzQgJCUFISEiB8RkbG+PEiRN49uyZXPNp2rSpGFNiYiJOnDgBVVVVtGjRAgAQHR2NgQMHYvz48bh9+zbWrFmDkJAQzJkzp9j3jj6TQEJKSooAQEhJSSnrUMqNzMxMYd++fUJmZmZZh0L0xfA5p/KAzzmVB3zOqTzgc/71/Rf/Hvr+/Xvh9u3bwvv378s6lP8ELy8vwd3dXUhOThZUVFSE+Ph4IT4+XlBVVRWePXsmuLu7C15eXoIgCEJqaqpQoUIFITQ0VGyfmZkpmJqaCj///LMgCIIwbdo0oVatWlJjTJkyRQAgvHr1ShAEQRg8eLAwbNgwqTrR0dGCgoKC+Lmam5sLS5culXse2dnZgpaWlnDw4EGxDIDg7+8vvk9NTRUACIcPH5Y71vyYm5sLysrKgoaGhqCqqioAEBo1alRoG0EQBDc3N2HSpEnie2dnZ2H8+PFSdaZNmyZYWloW+HvSy8tLMDc3Fz58+CCW9ezZU+jdu3eB4966dUuwtbUVFBQUBHt7e2H48OFCRESEVJ38YhEEQXj+/LlQrVo1YdSoUWJZmzZthLlz50rV27Jli2BiYlJgDCS/4vyO4wpXIiIiIiIiIqJvVKVKleDm5oaQkBBs2rQJbm5uMDAwkKoTFxeHrKwsNGvWTCyrUKECGjZsiNjYWABAbGys1EpPAGjSpInU+2vXriEkJASampriy8XFBTk5OXjw4IFc8SYlJWHo0KGwtraGjo4OtLW1kZqaiocPH0rVq1OnjvjfGhoa0NbWRnJystyxFmTy5Mm4evUqrl+/juPHjwMA3NzckJ2dDSD3oKrg4GDY29tDT08PmpqaOHr0qEx8n7p69SqaN2+OChUqFFjHzs4OioqK4nsTExNxTvmpVasWbt68iT///BODBg1CcnIyOnfujCFDhhQaS1ZWFrp37w5zc3MsX75cLL927RpmzZol9fnlrdRNS0srtE8qXUplHQARERERERERERVs0KBB4tf6f/vtty82TmpqKoYPH57vIVPyHtLl5eWFFy9eYPny5TA3N4eKigqaNGmCzMxMqXqfJi4lEglycnJKHvz/Z2BggOrVqwMArK2tsWzZMjRp0gQnT55E27ZtsXDhQixfvhzLli2Dvb09NDQ04OvrKxPfp9TU1IocuyRzUlBQQIMGDdCgQQP4+vpi69atGDBgAKZPnw5LS8t824wcORKPHj3ChQsXoKT0f6m91NRUBAUFoVu3bjJtVFVVi4yfSg8TrkRERERERERE37AOHTogMzMTEokELi4uMtetrKygrKyMs2fPwtzcHEDuKsiLFy+KBy7Z2trKHEz1559/Sr2vX78+bt++LSYsS+Ls2bNYuXIlXF1dAeTux/r8+fNi9SFPrPLKW3H6/v17MT53d3f0798fAJCTk4O7d++iVq1aYhtlZWVxRWyeOnXqYPPmzcjKyip0levnyoujoIPKlixZgh07diAmJgb6+vpS1+rXr487d+581udHpYNbChARERERERERfcMUFRURGxuL27dvS31lPY+GhgZGjhyJyZMn48iRI7h9+zaGDh2KtLQ0DB48GAAwYsQI3Lt3D5MnT8adO3cQFhYmc6DTlClTEBMTgzFjxuDq1au4d+8e9u/fX6xDs6ytrbFlyxbExsbi/Pnz6Nevn1yrQz8mT6wFefv2LZ4+fYrExERcuHABkydPRqVKldC0aVMxvsjISMTExCA2NhbDhw9HUlKSVB8WFhY4f/484uPj8fz5c+Tk5GDMmDF48+YN+vTpg0uXLuHevXvYsmUL7ty5U6y5faxHjx5YunQpzp8/j4SEBERFRWH06NGoUaMGbGxsZOr/8ccf+PHHH7Fw4UIYGBjg6dOnePr0KVJSUgAAM2fOxO+//46goCDcunULsbGxCA8Ph7+/f4ljpJJhwpWIiIiIiIiI6Bunra0NbW3tAq/Pnz8f3bt3x4ABA1C/fn38888/OHr0KHR1dQHkbgmwe/du7Nu3Dw4ODli9ejXmzp0r1UedOnVw6tQp3L17F82bN0e9evUwc+ZMmJqayh3nhg0b8OrVK9SvXx8DBgzAuHHjYGhoWKy5yhNrQWbOnAkTExOYmpqiU6dO0NDQwLFjx8TVoP7+/qhfvz5cXFzQsmVLGBsbo2vXrlJ9+Pn5QVFREbVq1UKlSpXw8OFD6Ovr48SJE0hNTYWzszMcHR2xbt26z1rt6uLigoMHD6Jz586oUaMGvLy8YGNjg2PHjkltFZDnzJkzyM7OxogRI2BiYiK+xo8fL/b3v//9D8eOHUODBg3QuHFjLF26VFz1TF+PRBAEoayDKGtv3ryBjo4OUlJSCv3lRaUnKysLERERcHV1/aJL8YnKEp9zKg/4nFN5wOecygM+51/ff/Hvoenp6Xjw4AEsLS25XyQR/ecU53ccV7gSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUiJ75BnRt+jeseLVt27/ZeIgIiIiIiIiIiIqBFe4EhEREREREREREZUSJlyJiIiIiIiIiIiISgkTrkRERERERERERESlhAlXIiIiIiIiIiIiolLCQ7OIiIiIiIiI6Ita/mr5VxtrvO74ErV78eIFbG1tceHCBVhYWJRuUN+hFi1aYMSIEejbty8AQCKRYO/evejatWuZxWRhYQFfX1/4+vp+sZgaN26MyZMno3v37qXWJwBkZmaiRo0a2LVrF5ycnEq1b/r2cIUrEREREREREZV7c+bMgbu7u5hsjY+Ph0QiwdWrV796LN7e3pBIJJBIJKhQoQKMjIzQrl07bNy4ETk5OV98/AMHDiApKQl9+vT54mN9jsTERHTs2LFU+/T398fUqVM/6z7Pnz8fEolETAwDgLKyMvz8/DBlypRSiJK+dUy4EhEREREREVG5lpaWhg0bNmDw4MFfddzMzMwCr3Xo0AGJiYmIj4/H4cOH0apVK4wfPx6dOnXChw8fvmhcK1asgI+PDxQUvu20kbGxMVRUVEq1z44dO+Lt27c4fPhwidpfvHgRa9asQZ06dWSu9evXD2fOnMGtW7c+N0z6xn3bPzlERERERERERF9YREQEVFRU0Lhx4wLrREVFQSKR4Pjx43BycoK6ujqaNm2KO3fuSNU7ePAgGjRoAFVVVRgYGMDDw0O8ZmFhgeDgYAwcOBDa2toYNmxYgeOpqKjA2NgYlStXRv369fHTTz9h//79OHz4MEJCQsR6r1+/xpAhQ1CpUiVoa2ujdevWuHbtmtwxferZs2c4ceIEOnfuLHMtb0WpmpoaqlWrhl27dkldnzJlCmrUqAF1dXVUq1YNM2bMQFZWlnj92rVraNWqFbS0tKCtrQ1HR0dcunRJvH7mzBk0b94campqMDMzw7hx4/Du3bsCY5VIJNi3bx+A/1uRvGfPHrRq1Qrq6upwcHDAuXPnpNoUNYaioiJcXV0RHh5e4LgFSU1NRb9+/bBu3Tro6urKXNfV1UWzZs1K1Dd9X5hwJSIiIiIiIqJyLTo6Go6OjnLVnT59OhYvXoxLly5BSUkJgwYNEq8dOnQIHh4ecHV1xZUrV3D8+HE0bNhQqv2iRYvg4OCAK1euYMaMGcWKs3Xr1nBwcMCePXvEsp49eyI5ORmHDx/GX3/9hfr166NNmzZ4+fKl3DF97MyZM1BXV4etra3MtRkzZqB79+64du0a+vXrhz59+iA2Nla8rqWlhZCQENy+fRvLly/HunXrsHTpUvF6v379UKVKFVy8eBF//fUXpk6digoVKgAA4uLi0KFDB3Tv3h3Xr1/H9u3bcebMGYwZM6ZY92j69Onw8/PD1atXUaNGDXh6eoorguUdo2HDhoiOji7WuAAwevRouLm5oW3btgXWKWnf9H3hoVlEREREREREVK4lJCTA1NRUrrpz5syBs7MzAGDq1Klwc3NDeno6VFVVMWfOHPTp0wdBQUFifQcHB6n2rVu3xqRJk0ocq42NDa5fvw4gNzl64cIFJCcni1+tX7RoEfbt24ddu3Zh2LBhcsX0sYSEBBgZGeW7nUDPnj0xZMgQAEBwcDAiIyPxyy+/YOXKlQBy9z/NY2FhAT8/P4SHh+PHH38EADx8+BCTJ0+GjY0NAMDa2lqsP2/ePPTr10/c99Ta2horVqyAs7MzVq1aBVVVVbnuj5+fH9zc3AAAQUFBsLOzwz///AMbGxu5xzA1NcWjR4+Qk5Mj97YK4eHhuHz5Mi5evFhoPVNTUyQkJMjVJ32/uMKViIiIiIiIiMq19+/fy53Q+3hvThMTEwBAcnIyAODq1ato06ZNoe0/94R6QRAgkUgA5H5FPzU1Ffr6+tDU1BRfDx48QFxcnNwxfaywe9GkSROZ9x+vcN2+fTuaNWsGY2NjaGpqwt/fHw8fPhSvT5w4EUOGDEHbtm0xf/58Mca8uYSEhEjNw8XFBTk5OXjw4IHc8Rf2+cg7hpqaGnJycpCRkSHXmI8ePcL48eMRGhpa5HOkpqaGtLQ0uedD3yeucCUiIiIiIiKics3AwACvXr2Sq27eV+ABiInPvBPt1dTUimyvoaFRggj/T2xsLCwtLQHk7hlqYmKCqKgomXoVK1aUO6aPFedefOzcuXPo168fgoKC4OLiAh0dHYSHh2Px4sVincDAQPTt2xeHDh3C4cOHERAQgPDwcHh4eCA1NRXDhw/HuHHjZPquWrWq3HEU9vnIO8bLly+hoaEh973766+/kJycjPr164tl2dnZOH36NH799VdkZGRAUVFR7LtSpUpyz4e+T0y4EhEREREREVG5Vq9ePWzduvWz+6lTpw6OHz8OHx+fUohK1okTJ3Djxg1MmDABAFC/fn08ffoUSkpKsLCwKJWY6tWrh6dPn+LVq1cyBz/9+eefGDhwoNT7evXqAQBiYmJgbm6O6dOni9fz++p8jRo1UKNGDUyYMAGenp7YtGkTPDw8UL9+fdy+fRvVq1eXK86SkHeMmzdvivOSR5s2bXDjxg2pMh8fH9jY2GDKlClisrUkfdP3iVsKEBEREREREVG55uLiglu3bpVoZefHAgICsG3bNgQEBCA2NhY3btzAggULStRXRkYGnj59isePH+Py5cuYO3cu3N3d0alTJzHp2bZtWzRp0gRdu3bFsWPHEB8fj5iYGEyfPh2XLl0qUUz16tWDgYEBzp49K3Nt586d2LhxI+7evYuAgABcuHBBPHDK2toaDx8+RHh4OOLi4rBixQrs3btXbPv+/XuMGTMGUVFRSEhIwNmzZ3Hx4kXxcK4pU6YgJiYGY8aMwdWrV3Hv3j3s37+/2IdmFUbeMaKjo9G+fXu5+9XS0kLt2rWlXhoaGtDX10ft2rU/q2/6PnGFKxERERERERF9UeN1x5d1CIWyt7dH/fr1sWPHDgwfPrzE/bRs2RI7d+5EcHAw5s+fD21tbbRo0aJEfR05cgQmJiZQUlKCrq4uHBwcsGLFCnh5eYkHOUkkEkRERGD69Onw8fHBs2fPYGxsjBYtWsDIyKhEMSkqKsLHxwehoaHo1KmT1LWgoCCEh4dj1KhRMDExwbZt21CrVi0AQJcuXTBhwgSMGTMGGRkZcHNzw4wZMxAYGCj2++LFCwwcOBBJSUkwMDBAt27dxMO86tSpg1OnTmH69Olo3rw5BEGAlZUVevfuXaL7lx95xnj8+DFiYmKkVjzHx8fD0tISJ0+eRMuWLUs8/rlz55CSkoIePXp8zjToOyARBEEo6yDK2ps3b6Cjo4OUlBRoa2uXdTjlQlZWFiIiIuDq6iq1v0qB7h0r3gDW/NciKnvFfs6JvkN8zqk84HNO5QGf86/vv/j30PT0dDx48ACWlpZyH0D1LTl06BAmT56Mmzdvyn0y/X/V06dPYWdnh8uXL8Pc3Lysw/mqpkyZglevXmHt2rVi2cmTJ9GtWzfcv39fZpuF4ujduzccHBzw008/lUao9JUV53ccV7gSERERERERUbnn5uaGe/fu4fHjxzAzMyvrcMqUsbExNmzYgIcPH5a7hKuhoSEmTpwoVRYREYGffvrps5KtmZmZsLe3F/ffpf82JlyJiIiIiIiIiAD4+vqWdQjfjK5du5Z1CGVi0qRJMmULFy787H6VlZXh7+//2f3Q96F8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolCiVdQBERERERERE9N82/8rzrzbW1HoGJWr34sUL2Nra4sKFC7CwsCjdoL5DLVq0wIgRI9C3b18AgEQiwd69e9G1a9cyi8nCwgK+vr7w9fX9YjE1btwYkydPRvfu3Uutz6/RN31buMKViIiIiIiIiMq9OXPmwN3dXUy2xsfHQyKR4OrVq189Fm9vb0gkEkgkElSoUAFGRkZo164dNm7ciJycnC8+/oEDB5CUlIQ+ffp88bE+R2JiIjp27Fiqffr7+2Pq1KnFvs/Z2dmYMWMGLC0toaamBisrKwQHB0MQhM/um74/TLgSERERERERUbmWlpaGDRs2YPDgwV913MzMzAKvdejQAYmJiYiPj8fhw4fRqlUrjB8/Hp06dcKHDx++aFwrVqyAj48PFBS+7bSRsbExVFRUSrXPjh074u3btzh8+HCx2i1YsACrVq3Cr7/+itjYWCxYsAA///wzfvnll8/um74/3/ZPDhERERERERHRFxYREQEVFRU0bty4wDpRUVGQSCQ4fvw4nJycoK6ujqZNm+LOnTtS9Q4ePIgGDRpAVVUVBgYG8PDwEK9ZWFggODgYAwcOhLa2NoYNG1bgeCoqKjA2NkblypVRv359/PTTT9i/fz8OHz6MkJAQsd7r168xZMgQVKpUCdra2mjdujWuXbsmd0yfevbsGU6cOIHOnTvLXMtbUaqmpoZq1aph165dUtenTJmCGjVqQF1dHdWqVcOMGTOQlZUlXr927RpatWoFLS0taGtrw9HREZcuXRKvnzlzBs2bN4eamhrMzMwwbtw4vHv3rsBYJRIJ9u3bB+D/ViTv2bMHrVq1grq6OhwcHHDu3DmpNkWNoaioCFdXV4SHhxc4bn5iYmLg7u4ONzc3WFhYoEePHmjfvj0uXLjw2X3T94cJVyIiIvp67h0r3ouIiIjoK4iOjoajo6NcdadPn47Fixfj0qVLUFJSwqBBg8Rrhw4dgoeHB1xdXXHlyhUcP34cDRs2lGq/aNEiODg44MqVK5gxY0ax4mzdujUcHBywZ88esaxnz55ITk7G4cOH8ddff6F+/fpo06YNXr58KXdMHztz5gzU1dVha2src23GjBno3r07rl27hn79+qFPnz6IjY0Vr2tpaSEkJAS3b9/G8uXLsW7dOixdulS83q9fP1SpUgUXL17EX3/9halTp6JChQoAgLi4OHTo0AHdu3fH9evXsX37dpw5cwZjxowp1j2aPn06/Pz8cPXqVdSoUQOenp7iimB5x2jYsCGio6OLNW7Tpk1x/Phx3L17F0BucvnMmTMyWx6UpG/6/vDQLCIiIiIiIiIq1xISEmBqaipX3Tlz5sDZ2RkAMHXqVLi5uSE9PR2qqqqYM2cO+vTpg6CgILG+g4ODVPvWrVtj0qRJJY7VxsYG169fB5CbHL1w4QKSk5PFr9YvWrQI+/btw65duzBs2DC5YvpYQkICjIyM8t1OoGfPnhgyZAgAIDg4GJGRkfjll1+wcuVKALl7lOaxsLCAn58fwsPD8eOPPwIAHj58iMmTJ8PGxgYAYG1tLdafN28e+vXrJx6IZW1tjRUrVsDZ2RmrVq2CqqqqXPfHz88Pbm5uAICgoCDY2dnhn3/+gY2NjdxjmJqa4tGjR8jJyZF7W4WpU6fizZs3sLGxgaKiIrKzszFnzhz069dPql5J+qbvDz9ZIiIiIiIiIirX3r9/L3dCr06dOuJ/m5iYAACSk5MBAFevXkWbNm0Kbe/k5FTCKHMJggCJRAIgdxVlamoq9PX1oampKb4ePHiAuLg4uWP6WGH3okmTJjLvP17hun37djRr1gzGxsbQ1NSEv78/Hj58KF6fOHEihgwZgrZt22L+/PlijHlzCQkJkZqHi4sLcnJy8ODBA7njL+zzkXcMNTU15OTkICMjQ+5xd+zYgdDQUISFheHy5cvYvHkzFi1ahM2bN0vVK0nf9P3hClciIiIiIiIiKtcMDAzw6tUruermfQUegJj4zDt1Xk1Nrcj2GhoaJYjw/8TGxsLS0hIAkJqaChMTE0RFRcnUq1ixotwxfaw49+Jj586dQ79+/RAUFAQXFxfo6OggPDwcixcvFusEBgaib9++OHToEA4fPoyAgACEh4fDw8MDqampGD58OMaNGyfTd9WqVeWOo7DPR94xXr58CQ0NjWLdu8mTJ2Pq1Kno06cPAMDe3h4JCQmYN28evLy8Pqtv+v4w4UpEREREVJqKu/+wdfsvEwcREcmtXr162Lp162f3U6dOHRw/fhw+Pj6lEJWsEydO4MaNG5gwYQIAoH79+nj69CmUlJRgYWFRKjHVq1cPT58+xatXr6Crqyt17c8//8TAgQOl3terVw9A7qFR5ubmmD59ung9ISFBpv8aNWqgRo0amDBhAjw9PbFp0yZ4eHigfv36uH37NqpXry5XnCUh7xg3b94U5yWvtLQ0mS0CFBUVxWTv5/RN3x9uKUBERERERERE5ZqLiwtu3bpVopWdHwsICMC2bdsQEBCA2NhY3LhxAwsWLChRXxkZGXj69CkeP36My5cvY+7cuXB3d0enTp3EpGfbtm3RpEkTdO3aFceOHUN8fDxiYmIwffp0XLp0qUQx1atXDwYGBjh79qzMtZ07d2Ljxo24e/cuAgICcOHCBfHAKWtrazx8+BDh4eGIi4vDihUrsHfvXrHt+/fvMWbMGERFRSEhIQFnz57FxYsXxcO5pkyZgpiYGIwZMwZXr17FvXv3sH///mIfmlUYeceIjo5G+/bF+wfRzp07Y86cOTh06BDi4+Oxd+9eLFmyBB4eHp/dN31/uMKViIiIiIiIiL6oqfUMyjqEQtnb26N+/frYsWMHhg8fXuJ+WrZsiZ07dyI4OBjz58+HtrY2WrRoUaK+jhw5AhMTEygpKUFXVxcODg5YsWIFvLy8xJWUEokEERERmD59Onx8fPDs2TMYGxujRYsWMDIyKlFMioqK8PHxQWhoKDp16iR1LSgoCOHh4Rg1ahRMTEywbds21KpVCwDQpUsXTJgwAWPGjEFGRgbc3NwwY8YMBAYGiv2+ePECAwcORFJSEgwMDNCtWzfxMK86derg1KlTmD59Opo3bw5BEGBlZYXevXuX6P7lR54xHj9+jJiYGKkVz/Hx8bC0tMTJkyfRsmXLfPv+5ZdfMGPGDIwaNQrJyckwNTXF8OHDMXPmzEL7pv8miSAIQlkHUdbevHkDHR0dpKSkQFtbu6zDKReysrIQEREBV1dXqf1VCsSv5tF3qNjPOdF3iL/PqTzgc07lAf/c8vX9F/8emp6ejgcPHsDS0lLuA6i+JYcOHcLkyZNx8+bNcn96/NOnT2FnZ4fLly/D3Ny8rMP5qqZMmYJXr15h7dq1YtnJkyfRrVs33L9/X2abhc/tm74fxfkdxxWuRERERERERFTuubm54d69e3j8+DHMzMzKOpwyZWxsjA0bNuDhw4flLuFqaGiIiRMnSpVFRETgp59++qxka0F9038TE65ERERERERERAB8fX3LOoRvRteuXcs6hDIxadIkmbKFCxd+sb7pv6l8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSolTWARAR0f9371jx6lu3/zJxEBEREREREVGJcYUrEREREREREZV7L168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJQcWKFctsfCq+1atXo3PnzmUdxneNK1yJiIiIiIiI6ItKCQr6amPpBASUqN2cOXPg7u4OCwsLALmJ1FOnThVYPyoqCs7OziUa62tp2rQpEhMToaOj80XHkUgk2Lt3L7p27fpFx/kSAgMDER4ejkePHkFZWRmOjo6YM2cOGjVqVGCbFy9eoF+/frh+/bqYqHd3d8fcuXOhra0NADhz5gymTJmCv//+G2lpaTA3N8fw4cMxYcIEqb5+++03LFy4EE+fPoWDgwN++eUXNGzYULxuYWGBhIQEbNu2DX369JFqa2dnh9u3b2PTpk3w9vYu9txfvHgBBwcHPH78GK9evRIT44MGDUJwcDCio6PRvHnzYvdLXOFKREREREREROVcWloaNmzYgMGDB4tle/bsQWJiotQrISEBtWvXhpOTU6EJuW+FsrIyjI2NIZFIyjqUb1aNGjXw66+/4saNGzhz5gwsLCzQvn17PHv2rMA2CgoKcHd3x4EDB3D37l2EhITgjz/+wIgRI8Q6GhoaGDNmDE6fPo3Y2Fj4+/vD398fa9euFets374dEydOREBAAC5fvgwHBwe4uLggOTlZajwzMzNs2rRJquzPP//E06dPoaGhUeK5Dx48GHXq1JEpV1ZWRt++fbFixYoS913eMeFKREREREREROVaREQEVFRU0LhxY7FMT08PxsbGUq/g4GA8f/4ce/fuhaqqqlg3JycHP/74o9gmMDBQvBYfHw+JRIKrV6+KZa9fv4ZEIkFUVJRYdurUKTRs2BAqKiowMTHB1KlT8eHDB/F6y5YtMXbsWPj6+kJXVxdGRkZYt24d3r17Bx8fH2hpaaF69eo4fPiw2ObTLQXyvt5/9OhR2NraQlNTEx06dEBiYqLU/di4cSPs7OzEWMaMGfOZd/j/xMXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23bthUZT9++fdG2bVtUq1YNdnZ2WLJkCd68eYPr168X2EZXVxcjR46Ek5MTzM3N0aZNG4waNQrR0dFinXr16sHT0xN2dnawsLBA//794eLiIlVnyZIlGDp0KHx8fFCrVi2sXr0a6urq2Lhxo9R4/fr1w6lTp/Do0SOxbOPGjejXrx+UlEr25fVVq1bh9evX8PPzy/d6586dceDAAbx//75E/Zd3TLgSERERERERUbkWHR0NR0fHQuusXLkSv//+O3bv3o0qVapIXdu8eTM0NDRw/vx5/Pzzz5g1axYiIyPlHv/x48dwdXVFgwYNcO3aNaxatQobNmzA7NmzZcYxMDDAhQsXMHbsWIwcORI9e/ZE06ZNcfnyZbRv3x4DBgxAWlpagWOlpaVh0aJF2LJlC06fPo2HDx9KJd1WrVqF0aNHY9iwYbhx4wYOHDiA6tWryz2XoqSmpsLV1RXHjx/HlStX0KFDB3Tu3BkPHz6Uqrd06VI0a9YMV65cgZubGwYMGICBAweif//+uHz5MqysrDBw4EAIggAASE9Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuCB3bJmZmVi7di10dHTg4OAgd7snT55gz549hW4xceXKFcTExIh1MjMz8ddff6Ft27ZiHQUFBbRt2xbnzp2TamtkZAQXFxds3rwZQO5nuH37dgwaNEjuGD92+/ZtzJo1C7///jsUFPJPDTo5OeHDhw84f/58icYo75hwJSIiIiIiIqJyLSEhAaampgVeP336NHx9ffHbb7+hadOmMtfr1KmDgIAAWFtbY+DAgXBycsLx48flHn/lypUwMzPDr7/+ChsbG3Tt2hVBQUFYvHgxcnJyxHoODg7w9/eHtbU1pk2bBlVVVRgYGGDo0KGwtrbGzJkz8eLFi0JXZ2ZlZWH16tVwcnJC/fr1MWbMGKlYZ8+ejUmTJmH8+PGoUaMGGjRoUKqHgjk4OGD48OGoXbs2rK2tERwcDCsrKxw4cECqnqurK4YPHy7O682bN2jQoAF69uyJGjVqYMqUKYiNjUVSUhIAoHLlyvDz80PdunVRrVo1jB07Fh06dMCOHTuKjOl///sfNDU1oaqqiqVLlyIyMhIGBgZFtvP09IS6ujoqV64MbW1trF+/XqZOlSpVoKKiAicnJ4wePRpDhgwBADx//hzZ2dkwMjKSqm9kZISnT5/K9DNo0CCEhIRAEATs2rULVlZWqFu3bpExfiojIwOenp5YuHAhqlatWmA9dXV16OjoICEhodhjEBOuRERERERERFTOvX//XmqLgI89fPgQPXr0wLBhw8Rk2ac+3QfTxMREZh/OwsTGxqJJkyZSe602a9YMqamp+Pfff/MdR1FREfr6+rC3txfL8pJ3hY2trq4OKyurfGNNTk7GkydP0KZNm3zbjhgxApqamuKrJFJTU+Hn5wdbW1tUrFgRmpqaiI2NlVnh+vFc8+ZV2Fyzs7MRHBwMe3t76OnpQVNTE0ePHhX7DQ0NlYr946/2t2rVClevXkVMTAw6dOiAXr16if127NhRbGNnZycV49KlS3H58mXs378fcXFxmDhxosx8o6OjcenSJaxevRrLli2Ta5uD/Li5uSE1NRWnT5/Gxo0bS7y6ddq0abC1tUX//v2LrKumplboamkqWMk2eiAiIiL6SHxYPBShWGS9arILQoiIiIjKnIGBAV69eiVT/v79e3h4eMDOzg7Lli0rsH2FChWk3kskEnFlat5XtvO++g7krjItifzG+bgsL2H78apYefrIi01NTa3Q8WfNmlXgnp/y8vPzQ2RkJBYtWoTq1atDTU0NPXr0QGZmZoFx5s2rsLkuXLgQy5cvx7Jly2Bvbw8NDQ34+vqK/Xbp0kXqoLPKlSuL/62hoYHq1aujevXqaNy4MaytrbFhwwZMmzYN69evF/cx/fTe5e3ta2NjAz09PTRv3hwzZsyAiYmJWMfS0hJAbrI4KSkJgYGB8PT0hIGBARQVFcUVunmSkpJgbGwsc9+UlJQwYMAABAQE4Pz589i7d2+h97kgJ06cwI0bN7Br1y4A//dcGhgYYPr06QgKChLrvnz5EpUqVSrROOUdE65EREREREREVK7Vq1cPW7dulSkfMmQIXr58iaNHj5b4cKK8hFViYiLq1asHAFIHaAGAra0tdu/eDUEQxETi2bNnoaWlJbNf7JekpaUFCwsLHD9+HK1atZK5bmhoCENDw88a4+zZs/D29oaHhweA3BWv8fHxn9VnXr/u7u7iys2cnBzcvXsXtWrVApA7Ny0tLbn6ysnJQUZGBgDpxGxRbQCI7YrqV1lZGY6Ojjh+/Di6du0qXj9+/HiBh5QNGjQIixYtQu/evaGrqytXXJ/avXu31EFYFy9exKBBgxAdHS218jkuLg7p6eniM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRMLPNToSwkMDMSIESNgaGiIjh074u3btzh79izGjh1baLsHDx7IJJKtra1l6llbW2PPnj3o3LkzJBIJZsyYUeiKXHlZW1tj165diImJga6uLpYsWYKkpCQx4Zqfd+/eYc6cOejSpQtMTEzw/Plz/Pbbb3j8+DF69uxZYLuIiAgkJSWhQYMG0NTUxK1btzB58mQ0a9YMFhYWAIDffvsNVatWhY2NDYDcfYAXLVqEcePGif1MnDgRXl5ecHJyQsOGDbFs2TK8e/cOPj4++Y5ra2uL58+fQ11dvQR3KNfHSVUgdy/ZvL4rVqwolkdHR6NatWoy9Uk+TLgSERERERERUblmb2+P+vXrY8eOHRg+fDiA3IOssrKy0KFDh3zbbNq0Cd7e3nL1v3HjRgwePBiOjo6oWbMmfv75Z7Rv3168XrlyZURERGDy5MlwcHCAnp4eBg8eLJOY/Rq8vLyQnp6OpUuXws/PDwYGBujRo0eR7Qrav/RTS5YswaBBg9C0aVMYGBhgypQpePPmzWfH7e/vj/v378PFxQXq6uoYNmwYunbtipSUlALbKCoq4u+//8bmzZvx/Plz6Ovro0GDBoiOjpbZr/VjampqWLduHSZMmICMjAyYmZmhW7dumDp1qlgnJycH06ZNw4MHD6CkpAQrKyssWLBAfL4AoHfv3nj27BlmzpyJp0+fom7dujhy5IjMQVof09fXL/Q+eHt7Iz4+HlFRUYXWK8q2bdswdOjQz+qjPJMIH28iUk69efMGOjo6SElJgba2dlmHUy5kZWUhIiICrq6uMnug5OveseINYN2+6DpEXxifcyoP8p7zWq9rybmH6z/FG4DPOX0D+PucygM+51/ff/Hvoenp6Xjw4AEsLS0LPIDqW3bo0CFMnjwZN2/e/OqrSolKi7OzM1q1aoXAwMAS93Hr1i20bt0ad+/ehY6OTukF950rzu84rnAlIiIiIiIionLPzc0N9+7dw+PHj2FmZlbW4RAVW0pKCuLi4nDo0KHP6icxMRG///47k62fgQlXIiIiIiIiIiIAvr6+ZR0CUYnp6Ojg33///ex+2rZtWwrRlG/fzBr5+fPnQyKRSP1yS09Px+jRo6Gvrw9NTU10794dSUlJUu0ePnwINzc3qKurw9DQEJMnT8aHDx++cvRERERERERERERE30jC9eLFi1izZg3q1KkjVT5hwgQcPHgQO3fuxKlTp/DkyRN069ZNvJ6dnQ03NzdkZmYiJiYGmzdvRkhICGbOnPm1p0BERERERERERERU9lsKpKamol+/fli3bh1mz54tlqekpGDDhg0ICwtD69atAeSeAGhra4s///wTjRs3xrFjx3D79m388ccfMDIyQt26dREcHIwpU6YgMDAQysrKZTUtIiJRfFi8nIcJfYVgiIiIiArBP7cQERF9vjJPuI4ePRpubm5o27atVML1r7/+QlZWltS+ETY2NqhatSrOnTuHxo0b49y5c7C3t4eRkZFYx8XFBSNHjsStW7dQr169fMfMyMhARkaG+P7NmzcAck/mzMrKKu0pUj7y7rPc9ztbKO4AxYyIqPTlPd/ZyJavPp9z+g7xOafygH9uofKAv8+/Pv7dk4jov6tME67h4eG4fPkyLl68KHPt6dOnUFZWRsWKFaXKjYyM8PTpU7HOx8nWvOt51woyb948BAUFyZQfO3YM6urqxZ0GfYbIyMgv0/GdiC/TL1EJ3Kl4R656t+Wr9lHHfM7p28HnnMoD/rmFygP+Pv960tLSyjoEIiL6Qsos4fro0SOMHz8ekZGRUFVV/apjT5s2DRMnThTfv3nzBmZmZmjfvj20tbW/aizlVVZWFiIjI9GuXTtUqFCh6AZxJ4o3gFXrkgVGVIrynvOar2vK9dU8i0b3izcAn3P6BvA5p/KAf26h8oC/z7++vG9aEhHRf0+ZJVz/+usvJCcno379+mJZdnY2Tp8+jV9//RVHjx5FZmYmXr9+LbXKNSkpCcbGxgAAY2NjXLhwQarfpKQk8VpBVFRUoKKiIlNeoUIF+f4QTaXm8c7Hcu4RJSlex/wc6Rui+P//rygVFPmc0/eLzzmVB3L/WZHPOX3H+Pv86+HfPYmI/rsUymrgNm3a4MaNG7h69ar4cnJyQr9+/cT/rlChAo4fPy62uXPnDh4+fIgmTZoAAJo0aYIbN24gOTlZrBMZGQltbW3UqlXrq8+JiIiIiIiIiL5PL168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJkdkukr5tR44cQd26dZGTk1PWoXy3yizhqqWlhdq1a0u9NDQ0oK+vj9q1a0NHRweDBw/GxIkTcfLkSfz111/w8fFBkyZN0LhxYwBA+/btUatWLQwYMADXrl3D0aNH4e/vj9GjR+e7gpWIiIiIiIiIysDfkq/3KqE5c+bA3d0dFhYWAHITqRKJpMDXqVOnSunmfDlNmzZFYmIidHR0vug4EokE+/bt+6JjfCmBgYGwsbGBhoYGdHV10bZtW5w/f77QNi9evECHDh1gamoKFRUVmJmZYcyYMVJbhZw5cwbNmjWDvr4+1NTUYGNjg6VLl8r09dtvv8HCwgKqqqpo1KiRzDe5LSwsIJFIEB4eLtPWzs4OEokEISEhxZrzxYsX0aZNG1SsWBG6urpwcXHBtWvXxOsdOnRAhQoVEBoaWqx+6f+UWcJVHkuXLkWnTp3QvXt3tGjRAsbGxtizZ494XVFREf/73/+gqKiIJk2aoH///hg4cCBmzZpVhlETERERERER0fckLS0NGzZswODBg8WyPXv2IDExUeqVkJCA2rVrw8nJCY0aNSrDiOWjrKwMY2NjSCQlT0T/19WoUQO//vorbty4gTNnzsDCwgLt27fHs2fPCmyjoKAAd3d3HDhwAHfv3kVISAj++OMPjBgxQqyjoaGBMWPG4PTp04iNjYW/vz/8/f2xdu1asc727dsxceJEBAQE4PLly3BwcICLi4vUN7kBwMzMDJs2bZIq+/PPP/H06VNoaGgUa76pqano0KEDqlativPnz+PMmTPQ0tKCi4sLsrKyxHre3t5YsWJFsfqm//NNJVyjoqKwbNky8b2qqip+++03vHz5Eu/evcOePXtk9mY1NzdHREQE0tLS8OzZMyxatAhKSmW2NS0RERERERERfWciIiKgoqIifqMWAPT09GBsbCz1Cg4OxvPnz7F3716pA8BzcnLw448/im0CAwPFa/Hx8ZBIJLh69apY9vr1a0gkEkRFRYllp06dQsOGDaGiogITExNMnToVHz58EK+3bNkSY8eOha+vL3R1dWFkZIR169bh3bt38PHxgZaWFqpXr47Dhw+LbT7dUiDv6/1Hjx6Fra0tNDU10aFDByQmJkrdj40bN8LOzk6MZcyYMZ95h/9PXFwc3N3dYWRkBE1NTTRo0AB//PGHVB0LCwvMnj0bAwcOhKamJszNzXHgwAE8e/YM7u7u0NTURJ06dXDp0iWxzYsXL+Dp6YnKlStDXV0d9vb22LZtW5Hx9O3bF23btkW1atVgZ2eHJUuW4M2bN7h+/XqBbXR1dTFy5Eg4OTnB3Nwcbdq0wahRoxAdHS3WqVevHjw9PWFnZwcLCwv0798fLi4uUnWWLFmCoUOHwsfHB7Vq1cLq1auhrq6OjRs3So3Xr18/nDp1Co8ePRLLNm7ciH79+hU7B/b333/j5cuXmDVrFmrWrAk7OzsEBAQgKSkJCQkJYr3OnTvj0qVLiIuLK1b/lOubSrgSEREREX2r4sPicX/z/SJfRET0/YmOjoajo2OhdVauXInff/8du3fvRpUqVaSubd68GRoaGjh//jx+/vlnzJo1C5GRkXKP//jxY7i6uqJBgwa4du0aVq1ahQ0bNmD27Nky4xgYGODChQsYO3YsRo4ciZ49e6Jp06a4fPky2rdvjwEDBiAtLa3AsdLS0rBo0SJs2bIFp0+fxsOHD+Hn5ydeX7VqFUaPHo1hw4bhxo0bOHDgAKpXry73XIqSmpoKV1dXHD9+HFeuXEGHDh3QuXNnPHz4UKre0qVL0axZM1y5cgVubm4YMGAABg4ciP79++Py5cuwsrLCwIEDIQgCACA9PR2Ojo44dOgQbt68iWHDhmHAgAEyX9EvTGZmJtauXQsdHR04ODjI3e7JkyfYs2cPnJ2dC6xz5coVxMTEiHUyMzPx119/oW3btmIdBQUFtG3bFufOnZNqa2RkBBcXF2zevBlA7me4fft2DBo0SO4Y89SsWRP6+vrYsGEDMjMz8f79e2zYsAG2trbidhoAULVqVRgZGUkliEl+TLgSERERERERUbmWkJAAU1PTAq+fPn0avr6++O2339C0aVOZ63Xq1EFAQACsra0xcOBAODk5SR0CXpSVK1fCzMwMv/76K2xsbNC1a1cEBQVh8eLFUgcXOTg4wN/fH9bW1pg2bRpUVVVhYGCAoUOHwtraGjNnzsSLFy8KXZ2ZlZWF1atXw8nJCfXr18eYMWOkYp09ezYmTZqE8ePHo0aNGmjQoEGpHgrm4OCA4cOHo3bt2rC2tkZwcDCsrKxw4MABqXqurq4YPny4OK83b96gQYMG6NmzJ2rUqIEpU6YgNjYWSUlJAIDKlSvDz88PdevWRbVq1TB27Fh06NABO3bsKDKm//3vf9DU1ISqqiqWLl2KyMhIGBgYFNnO09MT6urqqFy5MrS1tbF+/XqZOlWqVIGKigqcnJwwevRoDBkyBADw/PlzZGdnw8jISKq+kZERnj59KtPPoEGDEBISAkEQsGvXLlhZWaFu3bpFxvgpLS0tREVFYevWrVBTU4OmpiaOHDmCw4cPy6yWNTU1lVr1SvJjwpWIiIiIiIiIyrX3799LbRHwsYcPH6JHjx4YNmyYmCz7VJ06daTem5iYyOzDWZjY2Fg0adJEaq/VZs2aITU1Ff/++2++4ygqKkJfXx/29vZiWV7yrrCx1dXVYWVllW+sycnJePLkCdq0aZNv2xEjRkBTU1N8lURqair8/Pxga2uLihUrQlNTE7GxsTIrXD+ea968CptrdnY2goODYW9vDz09PWhqauLo0aNiv6GhoVKxf7xys1WrVrh69SpiYmLQoUMH9OrVS+y3Y8eOYhs7OzupGJcuXYrLly9j//79iIuLw8SJE2XmGx0djUuXLmH16tVYtmyZXNsc5MfNzQ2pqak4ffo0Nm7cWKLVrUDusz548GA0a9YMf/75J86ePYvatWvDzc0N79+/l6qrpqZW6GppKhg3OyUiIiIiIiKics3AwACvXr2SKX///j08PDxgZ2cndebMpypUqCD1XiKRiCtTFRRy17rlffUdgNThRMWR3zgfl+UlbD9eFStPH3mxqampFTr+rFmzpLYfKAk/Pz9ERkZi0aJFqF69OtTU1NCjRw9kZmYWGGfevAqb68KFC7F8+XIsW7YM9vb20NDQgK+vr9hvly5dpA46q1y5svjfGhoaqF69OqpXr47GjRvD2toaGzZswLRp07B+/XoxEfnpvcvb29fGxgZ6enpo3rw5ZsyYARMTE7GOpaUlgNxkcVJSEgIDA+Hp6QkDAwMoKiqKK3TzJCUlyZxfBABKSkoYMGAAAgICcP78eezdu7fQ+1yQsLAwxMfH49y5c+KzGRYWBl1dXezfvx99+vQR6758+RKVKlUq0TjlHROuRERERERERFSu1atXD1u3bpUpHzJkCF6+fImjR4+W+IDuvIRVYmIi6tWrBwBSB2gBgK2tLXbv3g1BEMRE4tmzZ6GlpSWzX+yXpKWlBQsLCxw/fhytWrWSuW5oaAhDQ8PPGuPs2bPw9vaGh4cHgNwVr/Hx8Z/VZ16/7u7u6N+/P4DcROzdu3dRq1YtALlz09LSkquvnJwcZGRkAJBOzBbVBoDYrqh+lZWV4ejoiOPHj6Nr167i9ePHjxd4SNmgQYOwaNEi9O7dG7q6unLF9am0tDQoKChIrabOe/9xoj49PR1xcXHiM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRPFVYhfS2BgIEaMGAFDQ0N07NgRb9++xdmzZzF27NhC2z148EAmkWxtbS1Tz9raGnv27EHnzp0hkUgwY8aMQlfkysva2hq7du1CTEwMdHV1sWTJEiQlJYkJ1/y8e/cOc+bMQZcuXWBiYoLnz5/jt99+w+PHj9GzZ88C20VERCApKQkNGjSApqYmbt26hcmTJ6NZs2biwVO//fYbqlatChsbGwC5+wAvWrQI48aNE/uZOHEivLy84OTkhIYNG2LZsmV49+4dfHx88h3X1tYWz58/h7q6egnuUK527dph8uTJGD16NMaOHYucnBzMnz8fSkpKUkn2P//8EyoqKmjSpEmJxyrPmHAlIiIiIiIionLN3t4e9evXx44dOzB8+HAAuQdZZWVloUOHDvm22bRpE7y9veXqf+PGjRg8eDAcHR1Rs2ZN/Pzzz2jfvr14vXLlyoiIiMDkyZPh4OAAPT09DB48WCYx+zV4eXkhPT0dS5cuhZ+fHwwMDNCjR48i2xW0f+mnlixZgkGDBqFp06YwMDDAlClT8ObNm8+O29/fH/fv34eLiwvU1dUxbNgwdO3aFSkpKQW2UVRUxN9//43Nmzfj+fPn0NfXR4MGDRAdHS2zX+vH1NTUsG7dOkyYMAEZGRkwMzNDt27dMHXqVLFOTk4Opk2bhgcPHkBJSQlWVlZYsGCB+HwBQO/evfHs2TPMnDkTT58+Rd26dXHkyBGZg7Q+pq+vX+h98Pb2Rnx8PKKiovK9bmNjg4MHDyIoKAhNmjSBgoIC6tWrhyNHjkhthbBt2zb069fvs5K75ZlE+HgTkXLqzZs30NHRQUpKCrS1tcs6nHIhKysLERERqPW6FhShWGT9ak3/Kd4A1u2LrkP0hfE5p/KAzzmVB3zOqTzgc/71/Rf/Hpqeno4HDx7A0tKywAOovmWHDh3C5MmTcfPmza++qpSotDg7O6NVq1YIDAwscR/Pnz9HzZo1cenSJXEPWire7ziucCUiIiIiIiKics/NzQ337t3D48ePYWZmVtbhEBVbSkoK4uLicOjQoc/qJz4+HitXrmSy9TMw4UpEREREREREBMDX17esQyAqMR0dHfz777+f3Y+TkxOcnJxKIaLyi2vkiYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolDDhSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERFRuffixQsYGhoiPj6+rEORW1RUFCQSCV6/fg0ACAkJQcWKFcs0ppKKj4+HRCLB1atXyzoUKqcaN26M3bt3l0pfSqXSCxERERERERFRAe5vvv/VxqrmVa1E7ebMmQN3d3dYWFiUbkCfiIqKQqtWrfDq1avvNjlaUi1btkTdunWxbNmysg6l2Pbs2YO5c+fin3/+QVZWFqytrTFp0iQMGDCgwDaJiYmYNGkSLl26hH/++Qfjxo2Ta+7Z2dkIDAzE1q1b8fTpU5iamsLb2xv+/v6QSCQAcu/lqVOnxDaGhoZo0aIFFi1aBHNz8wL7Lu3nLzAwEPv27ftqiXILCwv4+vrC19e3xH2Eh4fD09MT7u7u2Ldvn1ju7++PCRMmwMPDAwoKn7dGlStciYiIiIiIiKhcS0tLw4YNGzB48OCyDoW+UXp6epg+fTrOnTuH69evw8fHBz4+Pjh69GiBbTIyMlCpUiX4+/vDwcFB7rEWLFiAVatW4ddff0VsbCwWLFiAn3/+Gb/88otUvaFDhyIxMRFPnjzB/v378ejRI/Tv37/Ec/ySsrKyyjoEALkrqf38/NC8eXOZax07dsTbt29x+PDhzx6HCVciIiIiIiIiKtciIiKgoqKCxo0bi2V5X9c/evQo6tWrBzU1NbRu3RrJyck4fPgwbG1toa2tjb59+yItLU1sl5OTg3nz5sHS0hJqampwcHDArl27AOQme1q1agUA0NXVhUQigbe3NwDgyJEj+OGHH1CxYkXo6+ujU6dOiIuL+6x5xcXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23btn1WrJ/Kzs7G4MGDxftYs2ZNLF++XKqOt7c3unbtirlz58LIyAgVK1bErFmz8OHDB0yePBl6enqoUqUKNm3aJNVuypQpqFGjBtTV1VGtWjXMmDGjyIRgy5Yt4eHhAVtbW1hZWWH8+PGoU6cOzpw5U2AbCwsLLF++HAMHDoSOjo7cc4+JiYG7uzvc3NxgYWGBHj16oH379rhw4YJUPXV1dRgbG8PExASNGzfGmDFjcPnyZbnHAf5vG4qjR4/C1tYWmpqa6NChAxITE8U6UVFRaNiwITQ0NFCxYkU0a9YMCQkJCAkJQVBQEK5duwaJRAKJRIKQkBAAgEQiwapVq9ClSxdoaGhgzpw5+W55sW/fPnHVbp6DBw+iQYMGUFVVhYGBATw8PADkfgYJCQmYMGGCOF5xZGdno1+/fggKCkK1arIr4RUVFeHq6orw8PBi9ZsfJlyJiIiIiIiIqFyLjo6Go6NjvtcCAwPx66+/IiYmBo8ePUKvXr2wbNkyhIWF4dChQzh27JjUysN58+bh999/x+rVq3Hr1i1MmDAB/fv3x6lTp2BmZibuEXnnzh0kJiaKScR3795h4sSJuHTpEo4fPw4FBQV4eHggJyenxPNKTU2Fq6srjh8/jitXrqBDhw7o3LkzHj58KFVv6dKlaNasGa5cuQI3NzcMGDAAAwcORP/+/XH58mVYWVlh4MCBEAQBAJCeng5HR0ccOnQIN2/exLBhwzBgwACZhODnyMnJQZUqVbBz507cvn0bM2fOxE8//YQdO3ZI1Ttx4gSePHmC06dPY8mSJQgICECnTp2gq6uL8+fPY8SIERg+fDj+/fdfsY2WlhZCQkJw+/ZtLF++HOvWrcPSpUvljk0QBBw/fhx37txBixYtSm3OeZo2bYrjx4/j7t27AIBr167hzJkz6NixY4FtXr58iR07dqBRo0bFHi8tLQ2LFi3Cli1bcPr0aTx8+BB+fn4AgA8fPqBr165wdnbG9evXce7cOQwbNgwSiQS9e/fGpEmTYGdnh8TERCQmJqJ3795iv4GBgfDw8MCNGzcwaNAguWI5dOgQPDw84OrqiitXruD48eNo2LAhgNxtHapUqYJZs2aJ4xXHrFmzYGhoWOhK9oYNGyI6OrpY/eaHe7gSERERERERUbmWkJAAU1PTfK/Nnj0bzZo1AwAMHjwY06ZNQ1xcnLhCrkePHjh58iSmTJmCjIwMzJ07F3/88QeaNGkCAKhWrRrOnDmDNWvWwNnZGXp6egBy99z8eLVf9+7dpcbduHEjKlWqhNu3b6N27dolmpeDg4PUV9mDg4Oxd+9eHDhwAGPGjBHLXV1dMXz4cADAzJkzsWrVKjRo0AA9e/YEkLsitEmTJkhKSoKxsTEqV64sJuQAYOzYsTh69Ch27NghJsc+V4UKFRAUFCS+t7S0xLlz57Bjxw706tVLLNfT08OKFSugoKCAmjVr4ueff0ZaWhp++uknAMC0adMwf/58nDlzBn369AGQu1dnHgsLC/j5+SE8PBw//vhjoTGlpKSgcuXKyMjIgKKiIlauXIl27dqVynw/NnXqVLx58wY2NjZQVFREdnY25syZg379+knVW7lyJdavXw9BEJCWloYaNWoUusVBQbKysrB69WpYWVkBAMaMGYNZs2YBAN68eYOUlBR06tRJvG5rayu21dTUhJKSEoyNjWX67du3L3x8fIoVy5w5c9CnTx+pzz7vGdbT04OioiK0tLTyHa8wZ86cwYYNG4rca9bU1BSPHj1CTk7OZ+3jyhWuRERERERERFSuvX//Hqqqqvleq1OnjvjfRkZG4tfQPy5LTk4GAPzzzz9IS0tDu3btoKmpKb5+//33IrcHuHfvHjw9PVGtWjVoa2uLh3flrUbt2LGj2J+dnZ1c80pNTYWfnx9sbW1RsWJFaGpqIjY2VmaF66dzBAB7e3uZsrx5ZmdnIzg4GPb29tDT04OmpiaOHj0q9hsaGio1/5KuGPztt9/g6OiISpUqQVNTE2vXrpWJ3c7OTioxZmRkJBW7oqIi9PX1xdgBYPv27WjWrBmMjY2hqakJf39/sd+HDx9KxT537lyxnZaWFq5evYqLFy9izpw5mDhxIqKioko0NyB3ZfXHY4WGhgIAduzYgdDQUISFheHy5cvYvHkzFi1ahM2bN0u179evH65evSqugK1evTrat2+Pt2/fivcmr+/CVseqq6uLyVQAMDExEe+Xnp4evL294eLigs6dO2P58uVyryx1cnIq1v0AgKtXr6JNmzbFbleYt2/fYsCAAVi3bh0MDAwKraumpoacnBxkZGR81phc4UpERERERERE5ZqBgQFevXqV77UKFSqI/y2RSKTe55Xlfe0/NTUVQO7XoitXrixVT0VFpdAYOnfuDHNzc6xbtw6mpqbIyclB7dq1kZmZCQBYv3493r9/LxNTYfz8/BAZGYlFixahevXqUFNTQ48ePcQ+C5pjQWV581y4cCGWL1+OZcuWwd7eHhoaGvD19RX77dKli9RX2z+9F/IIDw+Hn58fFi9ejCZNmkBLSwsLFy7E+fPnC4w9L9bCPqNz586J+3i6uLhAR0cH4eHhWLx4MYDcFY4fr4LMW5EMAAoKCqhevToAoG7duoiNjcW8efPQsmXLYs8PyE1IfjxWXmJ78uTJmDp1qrgi197eHgkJCZg3bx68vLzE+jo6OmI81atXx4YNG2BiYoLt27djyJAhiIiIEPemVVNTKzCO/O5X3vYRALBp0yaMGzcOR44cwfbt2+Hv74/IyEipPY/zo6GhIfVeQUFBql9A9jCtwuIsqbi4OMTHx6Nz585iWd7zoKSkhDt37ogJ55cvX0JDQ+Oz42DClYiIiIiIiIjKtXr16mHr1q2f3U+tWrWgoqKChw8fwtnZOd86ysrKAHJXieZ58eIF7ty5g3Xr1omnp396GFNJkpZnz56Ft7e3eOhQamoq4uPji91Pfv26u7ujf//+AHKTV3fv3kWtWrUA5K4E1dLS+uwxmjZtilGjRolln3uIGJB7IJW5uTmmT58uliUkJIj/raSkJCYxi/K5KyHV1NTyHSstLU3m6+yKiopF7uerqKgIAGJi3tzcvMSxfapevXqoV68epk2bhiZNmiAsLAyNGzeGsrKy1LNcmEqVKuHt27d49+6dmIz99Cv+derUwfHjxwvciqA44+WxsbHBjRs3pMr8/f3x9u1bLF++HGZmZmL5zZs3Ua9evWL1nx8mXImIiIiIiIioXHNxccG0adPw6tUr6OrqlrgfLS0t+Pn5YcKECcjJycEPP/yAlJQUnD17Ftra2vDy8oK5uTkkEgn+97//wdXVFWpqatDV1YW+vj7Wrl0LExMTPHz4EFOnTv3seVlbW2PPnj3o3LkzJBIJZsyY8VmHcH3c765duxATEwNdXV0sWbIESUlJYsK1MM+ePZNJspmYmOQ7xu+//46jR4/C0tISW7ZswcWLF2FpafnZsT98+BDh4eFo0KABDh06hL179xbZbt68eXBycoKVlRUyMjIQERGBLVu2YNWqVWKdadOm4fHjx/j999/Fsry5pqaminNXVlYu9F517twZc+bMQdWqVWFnZ4crV65gyZIlMgdPpaWl4enTpwCApKQkBAcHQ1VVFe3bty/OLSnUgwcPsHbtWnTp0gWmpqa4c+cO7t27h4EDBwLI3QP3wYMHuHr1KqpUqQItLa0CV3M3atQI6urq+OmnnzBu3DicP38eISEhUnUCAgLQpk0bWFlZoU+fPvjw4QMiIiIwZcoUcbzTp0+jT58+UFFRKXKLAABQVVWV2Qc5b//kT8ujo6NL5f4x4UpEREREREREX1Q1r2pFVypD9vb2qF+/Pnbs2CEeHlVSwcHBqFSpEubNm4f79++jYsWKqF+/vniIU+XKlREUFISpU6fCx8cHAwcOREhICMLDwzFu3DjUrl0bNWvWxIoVK0r8VfU8eUm6pk2bwsDAAFOmTMGbN28+q08gd3Xg/fv34eLiAnV1dQwbNgxdu3ZFSkpKkW3DwsIQFhYmVRYcHCyuls0zfPhwXLlyBb1794ZEIoGnpydGjRqFw4cPf1bsXbp0wYQJEzBmzBhkZGTAzc0NM2bMQGBgYKHt3r17h1GjRuHff/+FmpoabGxssHXrVvTu3Vusk5iYKLPH7MerJf/66y+EhYXB3Ny80JXGv/zyC2bMmIFRo0YhOTkZpqamGD58OGbOnClVb926dVi3bh0AQFdXF3Xq1EFERARq1qwp590omrq6Ov7++29s3rwZL168gImJCUaPHi3+nHTv3h179uxBq1at8Pr1a2zatAne3t759qWnp4etW7di8uTJWLduHdq0aYPAwEAMGzZMrNOyZUvs3LkTwcHBmD9/PrS1tdGiRQvx+qxZszB8+HAx8Z23RYFEIil0bHk8fvwYMTExpbLaXSJ8unlCOfTmzRvo6OggJSUF2traZR1OuZCVlYWIiAjUel0LilAssn61pv8UbwDr0vvXHKKS4nNO5QGfcyoP+JxTecDn/Ov7L/49ND09HQ8ePIClpWWBB1B9yw4dOoTJkyfj5s2bn3U6ORF9XQ8ePECNGjVw+/ZtWFtbl7ifKVOm4NWrV1i7dm2+14vzO44rXImIiIiIiIio3HNzc8O9e/fw+PFjqT0diejbFhERgWHDhn1WshUADA0NMXHixFKJiQlXIiIiIiIiIiIAvr6+ZR0CERXT6NGjS6WfSZMmlUo/AMA18kRERERERERERESlhAlXIiIiIiIiIiIiolLChCsRERERERERERFRKWHClYiIiIiIiIiIiKiUMOFKREREREREREREVEqYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIqNx78eIFDA0NER8fX9ahyC0qKgoSiQSvX78GAISEhKBixYplGlNJxcfHQyKR4OrVq2UdCpVTffr0weLFi0ulLyZciYiIiIiIiOjLunfs671KaM6cOXB3d4eFhUXpzTsfnyZJy5OWLVvC19e3rMMokT179sDJyQkVK1aEhoYG6tatiy1bthTaJjExEX379kWNGjWgoKAg99yzs7MxY8YMWFpaQk1NDVZWVggODoYgCGKdli1bQiKRiC8jIyP07NkTCQkJhfZd2s9fYGAg6tatWyp9ycPCwgLLli0rUdtly5ahZs2aUFNTg5mZGSZMmID09HTxur+/P+bMmYOUlJTPjpMJVyIiIiIiIiIq19LS0rBhwwYMHjy4rEOhb5Senh6mT5+Oc+fO4fr16/Dx8YGPjw+OHj1aYJuMjAxUqlQJ/v7+cHBwkHusBQsWYNWqVfj1118RGxuLBQsW4Oeff8b/a+++43s6//+PP9+JiJAlZIiRWDVjKxEtSimamq29qqVFrcZq7f1pzVZLqQoqDdpq1aymNlW7VT5aM1oxPlYQIuv3h5/z9a4g7+REJHnce3O7eZ9zneu8rncvVU/Xuc7HH39s1e7NN99UVFSUzp49q++//15nzpxRx44dUz3G9BQXF5eh9w8LC9PQoUM1atQoHTlyRPPnz9fSpUv13nvvGW3Kly+v4sWL68svv0zz/QhcAQAAAABAtrZmzRo5OjqqZs2axrF7KwHXr1+vypUry8nJSS+88IIuXLigtWvXqkyZMnJ1dVX79u0VExNjXJeYmKhJkyYZqxMrVqyor7/+WtLdx+br1asnScqbN68sFou6du0qSVq3bp1q164td3d35cuXTy+//LKOHz+epnEdP35czZo1k7e3t5ydnVW9enX99NNPVm38/f01fvx4de7cWc7OzvLz89PKlSt18eJFNWvWTM7OzqpQoYL27NljXHPp0iW1a9dOBQsWVO7cuRUQEKCvvvoqTbX+W0JCgrp37258j6VKldLMmTOt2nTt2lXNmzfXxIkT5e3tLXd3d40dO1bx8fEaNGiQPDw8VKhQIS1YsMDquiFDhuiZZ55R7ty5VaxYMY0YMeKxgWDdunXVokULlSlTRsWLF1e/fv1UoUIFbdu27aHX+Pv7a+bMmercubPc3NxSPPYdO3aoWbNmatq0qfz9/dW6dWs1bNhQv/76q1W73Llzy8fHRwUKFFDNmjXVp08f7du3L8X3kf5vG4r169erTJkycnZ21ksvvaSoqCijzaZNm/Tss88qT548cnd3V1BQkE6fPq3Q0FCNGTNGBw8eNFbahoaGSpIsFotmz56tV155RXny5NGECROS3fLiu+++k8VisTr2ww8/qHr16sqVK5fy58+vFi1aSLr77+D06dMaMGCAcb+U2rFjh4KCgtS+fXv5+/urYcOGateu3QPfaXBwsMLDw234BpNH4AoAAAAAALK1rVu3qmrVqsmeGz16tGbNmqUdO3bozJkzeu211zRjxgyFhYVp9erV+vHHH61WHk6aNEmLFi3SnDlz9Mcff2jAgAHq2LGjNm/erMKFC+ubb76RJB09elRRUVFGiHjz5k0NHDhQe/bsUUREhOzs7NSiRQslJiamelw3btxQkyZNFBERof379+ull15ScHCwIiMjrdpNnz5dQUFB2r9/v5o2bapOnTqpc+fO6tixo/bt26fixYurc+fOxiPtt2/fVtWqVbV69WodOnRIPXr0UKdOnR4Ir9IiMTFRhQoV0vLly3X48GGNHDlS7733npYtW2bV7ueff9bZs2e1ZcsWTZs2TaNGjdLLL7+svHnzateuXXrrrbfUs2dP/f3338Y1Li4uCg0N1eHDhzVz5kzNmzdP06dPT3FtSUlJioiI0NGjR/X888+bNuZ7atWqpYiICP3555+SpIMHD2rbtm1q3LjxQ6+5fPmyli1bpho1ath8v5iYGE2ZMkWLFy/Wli1bFBkZqZCQEElSfHy8mjdvrjp16ui3337Tzp071aNHD1ksFrVp00bvvvuuypUrp6ioKEVFRalNmzZGv6NHj1aLFi30+++/6/XXX09RLatXr1aLFi3UpEkT7d+/XxEREXr22Wcl3d3WoVChQho7dqxxv5SqVauW9u7da8zREydOaM2aNWrSpIlVu2effVa//vqrYmNjU9x3cnKk6WoAAAAAAIBM7vTp0/L19U323Pjx4xUUFCRJ6t69u4YNG6bjx4+rWLFikqTWrVtr48aNGjJkiGJjYzVx4kT99NNPCgwMlCQVK1ZM27Zt02effaY6derIw8NDkuTl5WW12q9Vq1ZW9/3iiy/k6empw4cPq3z58qkaV8WKFa0eZR83bpxWrFihlStXqk+fPsbxJk2aqGfPnpKkkSNHavbs2apevbpeffVVSXdXhAYGBur8+fPy8fFRwYIFjUBOkt555x2tX79ey5YtM8KxtHJwcNCYMWOMz0WLFtXOnTu1bNkyvfbaa8ZxDw8PffTRR7Kzs1OpUqX0wQcfKCYmxnhUfNiwYZo8ebK2bdumtm3bSrq7V+c9/v7+CgkJUXh4uAYPHvzImq5du6aCBQsqNjZW9vb2+vTTT/Xiiy+aMt77DR06VNHR0SpdurTs7e2VkJCgCRMmqEOHDlbtPv30U33++edKSkpSTEyMnnnmmUducfAwcXFxmjNnjooXLy5J6tOnj8aOHStJio6O1rVr1/Tyyy8b58uUKWNc6+zsrBw5csjHx+eBftu3b69u3brZVMuECRPUtm1bq3/39+awh4eH7O3t5eLikuz9HqV9+/b63//+p9q1ayspKUnx8fF66623rLYUkCRfX1/duXNH586dk5+fn033uB8rXAEAAAAAQLZ269Yt5cqVK9lzFSpUMH7u7e1tPIZ+/7ELFy5Iko4dO6aYmBi9+OKLcnZ2Nn4sWrTosdsD/PXXX2rXrp2KFSsmV1dX4+Vd91ajNm7c2OivXLlyKRrXjRs3FBISojJlysjd3V3Ozs46cuTIAytc/z1GSQoICHjg2L1xJiQkaNy4cQoICJCHh4ecnZ21fv16o98lS5ZYjX/r1q0pqvffPvnkE1WtWlWenp5ydnbW3LlzH6i9XLlysrP7v3jL29vbqnZ7e3vly5fPqF2Sli5dqqCgIPn4+MjZ2VnDhw83+o2MjLSqfeLEicZ1Li4uOnDggHbv3q0JEyZo4MCB2rRpU6rGJt1dWX3/vZYsWSJJWrZsmZYsWaKwsDDt27dPCxcu1JQpU7Rw4UKr6zt06KADBw4YK2BLlCihhg0b6vr168Z3c6/vR62OzZ07txGmSlKBAgWM78vDw0Ndu3ZVo0aNFBwcrJkzZ6Z4ZWm1atVs+j4k6cCBA6pfv77N1z3Opk2bNHHiRH366afat2+fvv32W61evVrjxo2zaufk5CRJVtuEpAYrXAEAAAAAQLaWP39+XblyJdlzDg4Oxs8tFovV53vH7j32f+PGDUl3H4suWLCgVTtHR8dH1hAcHCw/Pz/NmzdPvr6+SkxMVPny5XXnzh1J0ueff65bt249UNOjhISEaMOGDZoyZYpKlCghJycntW7d2ujzYWN82LF74/zwww81c+ZMzZgxQwEBAcqTJ4/69+9v9PvKK69YPdr+7+8iJcLDwxUSEqKpU6cqMDBQLi4u+vDDD7Vr166H1n6v1kf9O9q5c6c6dOigMWPGqFGjRnJzc1N4eLimTp0q6e4KxwMHDhjX3luRLEl2dnYqUaKEJKlSpUo6cuSIJk2apLp169o8PuluIHn/ve4F24MGDdLQoUONFbkBAQE6ffq0Jk2apC5duhjt3dzcjHpKlCih+fPnq0CBAlq6dKneeOMNrVmzxtib9l6QmJzkvq9720dI0oIFC9S3b1+tW7dOS5cu1fDhw7VhwwarPY+TkydPHqvPdnZ2Vv1KD75M61F1psWIESPUqVMnvfHGG5Lufqc3b95Ujx499P777xuh/eXLlyVJnp6eabofgSsAAAAAAMjWKleubMqbycuWLStHR0dFRkaqTp06ybbJmTOnpLurRO+5dOmSjh49qnnz5um5556TpAdexpSa0HL79u3q2rWr8dKhGzdu6NSpUzb3k1y/zZo1U8eOHSXdDWL//PNPlS1bVtLdlaAuLi5pvketWrXUq1cv41haXyIm3X15kp+fn95//33j2OnTp42f58iRwwgxHycxMTFNe306OTkle6+YmBirVbvS3ZW6j9vP197eXpKMYD4tj8T/W+XKlVW5cmUNGzZMgYGBCgsLU82aNZUzZ06rufwonp6eun79um7evGmEsfcHztLd1dYREREP3YrAlvvd72HfqSSrEPjQoUMqVKiQ8ufPb/M97kfgCgAAAAAAsrVGjRpp2LBhunLlivLmzZvqflxcXBQSEqIBAwYoMTFRtWvX1rVr17R9+3a5urqqS5cu8vPzk8Vi0apVq9SkSRM5OTkpb968ypcvn+bOnasCBQooMjJSQ4cOTfO4SpYsqW+//VbBwcGyWCwaMWJEml7CdX+/X3/9tXbs2KG8efNq2rRpOn/+vBG4PsrFixcfCNkKFCiQ7D0WLVqk9evXq2jRolq8eLF2796tokWLprn2yMhIhYeHq3r16lq9erVWrFjx2OsmTZqkatWqqXjx4oqNjdWaNWu0ePFizZ4922gzbNgw/fPPP1q0aJFx7N5Yb9y4YYw9Z86cj/yugoODNWHCBBUpUkTlypXT/v37NW3atAdePBUTE6Nz585Jks6fP69x48YpV65catiwoS1fySOdPHlSc+fO1SuvvCJfX18dPXpUf/31lzp37izp7h64J0+e1IEDB1SoUCG5uLg8dDV3jRo1lDt3br333nvq27evdu3apdDQUKs2o0aNUv369VW8eHG1bdtW8fHxWrNmjYYMGWLcb8uWLWrbtq0cHR1THIwGBwdr2rRpqly5smrUqKFjx45pxIgRCg4ONoJX6e42D2Z8fwSuAAAAAAAgfZU0LwBKDwEBAapSpYqWLVtmvDwqtcaNGydPT09NmjRJJ06ckLu7u6pUqWK8nKdgwYIaM2aMhg4dqm7duqlz584KDQ1VeHi4+vbtq/Lly6tUqVL66KOPUv2o+j33QrpatWopf/78GjJkiKKjo9PUp3T3pVMnTpxQo0aNlDt3bvXo0UPNmzfXtWvXHnttWFiYwsLCrI6NGzfOWC17T8+ePbV//361adNGFotF7dq1U69evbR27do01f7KK69owIAB6tOnj2JjY9W0aVONGDFCo0ePfuR1N2/eVK9evfT333/LyclJpUuX1pdffqk2bdoYbaKioh7YY7Zy5crGz/fu3auwsDD5+fk9cqXxxx9/rBEjRqhXr166cOGCfH191bNnT40cOdKq3bx58zRv3jxJUt68eVWhQgWtWbNGpUqVSuG38Xi5c+fWf//7Xy1cuFCXLl1SgQIF1Lt3b+PXSatWrfTtt9+qXr16unr1qhYsWKCuXbsm25eHh4e+/PJLDRo0SPPmzVP9+vU1evRo9ejRw2hTt25dLV++XOPGjdPkyZPl6uqq559/3jg/duxY9ezZ0wi+761OtVgsj7z38OHDZbFYNHz4cP3zzz/y9PQ0gu17bt++re+++07r1q1L47cmWZL+vXlCNhQdHS03Nzddu3ZNrq6uGV1OthAXF6c1a9ao7NWyspf9Y9sXq3XMths85b+ZI3tgniM7YJ4jO2CeIztgnj95WfHPobdv39bJkydVtGjRh76A6mm2evVqDRo0SIcOHXrg0WMAT6+TJ0/qmWee0eHDh1WyZMlU9zN79mytWLFCP/74Y7LnbflvHCtcAQAAAABAtte0aVP99ddf+ueff1S4cOGMLgdACq1Zs0Y9evRIU9gq3X152Mcff2xKTQSuAAAAAAAAkvr375/RJQCwUe/evU3p54033jClH0lijTwAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAACQ7V26dEleXl46depURpeSYps2bZLFYtHVq1clSaGhoXJ3d8/QmlLr1KlTslgsOnDgQEaXgmzozp078vf31549e0zpL4cpvQAAAAAAADxE49/2PrF7ra1QNVXXTZgwQc2aNZO/v7+5Bf3Lpk2bVK9ePV25ciXThqOpVbduXVWqVEkzZszI6FJs9u2332rixIk6duyY4uLiVLJkSb377rvq1KnTI6+ZPXu2Dhw4oNjYWJUrV06jR49Wo0aNHnuvOXPmaO/evbp8+bL279+vSpUqWbXx9/fX6dOnJUl2dnby9vZW48aNNWXKFOXNm/ehfYeGhqp///5GSJ9WXbt21dWrV/Xdd9+Z0t/jWCwWrVixQs2bN091H5MnT9awYcPUr18/Yy7mzJlTISEhGjJkiCIiItJcJytcAQAAAABAthYTE6P58+ere/fuGV0KnlIeHh56//33tXPnTv3222/q1q2bunXrpvXr1z/0mi1btujFF1/UmjVrtHfvXtWrV0/BwcHav3//I+918+ZN1a5dW//5z38e2W7s2LGKiopSZGSklixZoi1btqhv376pGl96i4uLy+gSJEm7d+/WZ599pgoVKjxwrkOHDtq2bZv++OOPNN+HwBUAAAAAAGRra9askaOjo2rWrGkcu/e4/vr161W5cmU5OTnphRde0IULF7R27VqVKVNGrq6uat++vWJiYozrEhMTNWnSJBUtWlROTk6qWLGivv76a0l3H5uvV6+eJClv3ryyWCzq2rWrJGndunWqXbu23N3dlS9fPr388ss6fvx4msZ1/PhxNWvWTN7e3nJ2dlb16tX1008/WbXx9/fX+PHj1blzZzk7O8vPz08rV67UxYsX1axZMzk7O6tChQpWj1pfunRJ7dq1U8GCBZU7d24FBAToq6++SlOt/5aQkKDu3bsb32OpUqU0c+ZMqzZdu3ZV8+bNNXHiRHl7e8vd3V1jx45VfHy8Bg0aJA8PDxUqVEgLFiywum7IkCF65plnlDt3bhUrVkwjRox4bCBYt25dtWjRQmXKlFHx4sXVr18/VahQQdu2bXvoNTNmzNDgwYNVvXp1lSxZUhMnTlTJkiX1ww8/PPJenTp10siRI9WgQYNHtnNxcZGPj48KFiyoevXqqUuXLtq3b98jr/m30aNHq1KlSlq8eLH8/f3l5uamtm3b6vr160abr7/+WgEBAXJyclK+fPnUoEED3bx5U6NHj9bChQv1/fffy2KxyGKxaNOmTcb2EEuXLlWdOnWUK1cuLVmyxLjXv7+jf68q/+KLL1SuXDk5OjqqQIEC6tOnjyQZ7Vq0aCGLxWLzavQbN26oQ4cOmjdvXrKrgPPmzaugoCCFh4fb1G9yCFwBAAAAAEC2tnXrVlWtmvxWBKNHj9asWbO0Y8cOnTlzRq+99ppmzJihsLAwrV69Wj/++KM+/vhjo/2kSZO0aNEizZkzR3/88YcGDBigjh07avPmzSpcuLC++eYbSdLRo0cVFRVlhIg3b97UwIEDtWfPHkVERMjOzk4tWrRQYmJiqsd148YNNWnSRBEREdq/f79eeuklBQcHKzIy0qrd9OnTFRQUpP3796tp06bq1KmTOnfurI4dO2rfvn0qXry4OnfurKSkJEnS7du3VbVqVa1evVqHDh1Sjx491KlTJ/3666+prvXfEhMTVahQIS1fvlyHDx/WyJEj9d5772nZsmVW7X7++WedPXtWW7Zs0bRp0zRq1Ci9/PLLyps3r3bt2qW33npLPXv21N9//21c4+LiotDQUB0+fFgzZ87UvHnzNH369BTXlpSUpIiICB09elTPP/+8TWO6fv26PDw8UnxNSv3zzz/64YcfVKNGDZuvPX78uL777jutWrVKq1at0ubNmzV58mRJUlRUlNq1a6fXX39dR44c0aZNm9SyZUslJSUpJCREr732ml566SVFRUUpKipKtWrVMvodOnSo+vXrpyNHjjx2G4V7Zs+erd69e6tHjx76/ffftXLlSpUoUULS3dWpkrRgwQJFRUUZn1Oqd+/eatq06SOD7GeffVZbt261qd/ksIcrAAAAAADI1k6fPi1fX99kz40fP15BQUGSpO7du2vYsGE6fvy4ihUrJklq3bq1Nm7cqCFDhig2NlYTJ07UTz/9pMDAQElSsWLFtG3bNn322WeqU6eOEbZ5eXlZ7eHaqlUrq/t+8cUX8vT01OHDh1W+fPlUjatixYqqWLGi8XncuHFasWKFVq5caawalKQmTZqoZ8+ekqSRI0dq9uzZql69ul599VVJd1eEBgYG6vz588aKypCQEOP6d955R+vXr9eyZcv07LPPpqrWf3NwcNCYMWOMz0WLFtXOnTu1bNkyvfbaa8ZxDw8PffTRR7Kzs1OpUqX0wQcfKCYmRu+9954kadiwYZo8ebK2bdumtm3bSpKGDx9uXO/v76+QkBCFh4dr8ODBj6zp2rVrKliwoGJjY2Vvb69PP/1UL774YorHNGXKFN24ccOq/rQYMmSIhg8froSEBN2+fVs1atTQtGnTbO4nMTFRoaGhcnFxkXR3hW1ERIQmTJigqKgoxcfHq2XLlvLz85MkBQQEGNc6OTkpNjZWPj4+D/Tbv39/tWzZ0qZaxo8fr3fffVf9+vUzjlWvXl2S5OnpKUlyd3dP9n6PEh4ern379j02pPX19TX2xk0LVrgCAAAAAIBs7datW8qVK1ey5+7f69Hb29t4DP3+YxcuXJAkHTt2TDExMXrxxRfl7Oxs/Fi0aNFjtwf466+/1K5dOxUrVkyurq7G49L3VqM2btzY6K9cuXIpGteNGzcUEhKiMmXKyN3dXc7Ozjpy5MgDK1z/PUbJOlS7d+zeOBMSEjRu3DgFBATIw8NDzs7OWr9+vdHvkiVLrMaf2hWDn3zyiapWrSpPT085Oztr7ty5D9Rerlw52dn9X7zl7e1tVbu9vb3y5ctn1C5JS5cuVVBQkHx8fOTs7Kzhw4cb/UZGRlrVPnHiROM6FxcXHThwQLt379aECRM0cOBAbdq0KUVjCQsL05gxY7Rs2TJ5eXlJSvv3NGjQIB04cEC//fab8aKnpk2bKiEhQZKs+n7rrbce2o+/v78RtkpSgQIFjO+rYsWKql+/vgICAvTqq69q3rx5unLlSorqq1atmk3juXDhgs6ePav69evbdN3jnDlzRv369dOSJUse+uv8HicnJ6stQlKLFa4AAAAAACBby58//0NDJAcHB+PnFovF6vO9Y/ce+79x44YkafXq1SpYsKBVO0dHx0fWEBwcLD8/P82bN0++vr5KTExU+fLldefOHUnS559/rlu3bj1Q06OEhIRow4YNmjJlikqUKCEnJye1bt3a6PNhY3zYsXvj/PDDDzVz5kzNmDFDAQEBypMnj/r372/0+8orr1g92v7v7yIlwsPDFRISoqlTpyowMFAuLi768MMPtWvXrofWfq/WR/072rlzpzp06KAxY8aoUaNGcnNzU3h4uKZOnSrp7grHAwcOGNfe//i/nZ2d8Xh7pUqVdOTIEU2aNEl169Z97FjeeOMNLV++3Opx9rR+T/nz5zfqKVmypGbMmKHAwEBt3LhRDRo0sBqHq6vrQ/t51Pdlb2+vDRs2aMeOHcb2Ge+//7527dqlokWLPrK+PHnyWH22s7MztqW45/69c52cnB7ZX2rt3btXFy5cUJUqVYxjCQkJ2rJli2bNmmWsWJaky5cvGytp04LAFQAAAAAAZGuVK1fWl19+meZ+ypYtK0dHR0VGRqpOnTrJtsmZM6ckGasQpbsvoTp69KjmzZun5557TpIeeBlTakLL7du3q2vXrmrRooWku4HwqVOnbO4nuX6bNWumjh07SrobxP75558qW7aspLsrQe9fMZnae9SqVUu9evUyjqX1JWKStGPHDvn5+en99983jt3/CHmOHDmMEPNxEhMTFRsb+8g2X331lV5//XWFh4eradOmVufM+J7udy80vBfMp3Qcj2OxWBQUFKSgoCCNHDlSfn5+WrFihQYOHKicOXNazeVH8fT01Llz55SUlGSE+PeHwi4uLvL391dERITxcrl/c3BwSPH97qlfv75+//13q2PdunVT6dKlNWTIEON7k6RDhw6pcuXKNvWfHAJXAAAAAACQrTVq1EjDhg3TlStXkn17eUq5uLgoJCREAwYMUGJiomrXrq1r165p+/btcnV1VZcuXeTn5yeLxaJVq1apSZMmcnJyUt68eZUvXz7NnTtXBQoUUGRkpIYOHZrmcZUsWVLffvutgoODZbFYNGLEiDS9hOv+fr/++mvt2LFDefPm1bRp03T+/HkjcH2UixcvWoVs0t1H2JO7x6JFi7R+/XoVLVpUixcv1u7dux+7qjIltUdGRio8PFzVq1fX6tWrtWLFisdeN2nSJFWrVk3FixdXbGys1qxZo8WLF2v27NlGm2HDhumff/7RokWLJN3dRqBLly6aOXOmatSooXPnzkm6u5LTzc3tofe6fPmyIiMjdfbsWUl3X7AmST4+PlZ7l16/ft0IMM+cOaPBgwfL09PT6sVVabVr1y5FRESoYcOG8vLy0q5du3Tx4kWVKVNG0t3tCNavX6+jR48qX758jxxX3bp1dfHiRX3wwQdq3bq11q1bp7Vr11qtvh09erTeeusteXl5qXHjxrp+/bq2b9+ud955x7hfRESEgoKC5OjomKJfry4uLg/sg5wnTx7ly5fvgeNbt27VuHHjUvz9PAyBKwAAAAAASFdrK1TN6BIeKSAgQFWqVNGyZcuMl0el1rhx4+Tp6alJkybpxIkTcnd3V5UqVYyXOBUsWFBjxozR0KFD1a1bN3Xu3FmhoaEKDw9X3759Vb58eZUqVUofffTRYx9Vf5xp06bp9ddfV61atZQ/f34NGTJE0dHRaepTuvvSqRMnTqhRo0bKnTu3evTooebNm+vatWuPvTYsLExhYWFWx8aNG2eslr2nZ8+e2r9/v9q0aSOLxaJ27dqpV69eWrt2bZpqf+WVVzRgwAD16dNHsbGxatq0qUaMGKHRo0c/8rqbN2+qV69e+vvvv+Xk5KTSpUvryy+/VJs2bYw2UVFRVnvMzp07V/Hx8erdu7d69+5tHO/SpYtCQ0Mfeq+VK1eqW7duxud7L/saNWqUVZ0jR47UyJEjJd1dPVq9enX9+OOPypcvX0q+ihRxdXXVli1bNGPGDEVHR8vPz09Tp05V48aNJUlvvvmmNm3apGrVqunGjRvauHGjsf/wv5UpU0affvqpJk6cqHHjxqlVq1YKCQnR3LlzjTZdunTR7du3NX36dIWEhCh//vxq3bq1cX7q1KkaOHCg5s2bp4IFC+rUqVM6deqUihYtqo0bN6bp18zOnTt17do1q/ulliXp35snZEPR0dFyc3PTtWvXHrmnBcwTFxenNWvWqOzVsrKX/WPbF6t1zLYblGyYysoA8zDPkR0wz5EdMM+RHTDPn7ys+OfQ27dv6+TJkypatOhjX0zzNFq9erUGDRqkQ4cOWb2ECcDTbePGjWrZsqVOnDiRphXqbdq0UcWKFY2/HPk3W/4bxwpXAAAAAACQ7TVt2lR//fWX/vnnHxUuXDijywGQQmvWrNF7772XprD1zp07CggI0IABA0ypicAVAAAAAABAUv/+/TO6BAA2+vDDD9PcR86cOTV8+HATqrmLNfIAAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAABAtnfp0iV5eXnp1KlTGV1Kim3atEkWi0VXr16VJIWGhsrd3T1Da0qtU6dOyWKx6MCBAxldCrKpmjVr6ptvvjGlrxym9AIAAAAAAPAQcWPefWL3chg1NVXXTZgwQc2aNZO/v7+5Bf3Lpk2bVK9ePV25ciXThqOpVbduXVWqVEkzZszI6FJs9u2332rixIk6duyY4uLiVLJkSb377rvq1KnTI6+ZPXu2Dhw4oNjYWJUrV06jR49Wo0aNHnuvOXPmaO/evbp8+bL279+vSpUqWbXx9/fX6dOnJUl2dnby9vZW48aNNWXKFOXNm/ehfYeGhqp///5GSJ9WXbt21dWrV/Xdd9+Z0t/jWCwWrVixQs2bN7fpuoSEBI0ePVpffvmlzp07J19fX3Xt2lXDhw+XxWKRJA0fPlwDBgxQixYtZGeXtjWqrHAFAAAAAADZWkxMjObPn6/u3btndCl4Snl4eOj999/Xzp079dtvv6lbt27q1q2b1q9f/9BrtmzZohdffFFr1qzR3r17Va9ePQUHB2v//v2PvNfNmzdVu3Zt/ec//3lku7FjxyoqKkqRkZFasmSJtmzZor59+6ZqfOktLi4uQ+//n//8R7Nnz9asWbN05MgR/ec//9EHH3ygjz/+2GjTuHFjXb9+XWvXrk3z/QhcAQAAAABAtrZmzRo5OjqqZs2axrF7j+uvX79elStXlpOTk1544QVduHBBa9euVZkyZeTq6qr27dsrJibGuC4xMVGTJk1S0aJF5eTkpIoVK+rrr7+WdPex+Xr16kmS8ubNK4vFoq5du0qS1q1bp9q1a8vd3V358uXTyy+/rOPHj6dpXMePH1ezZs3k7e0tZ2dnVa9eXT/99JNVG39/f40fP16dO3eWs7Oz/Pz8tHLlSl28eFHNmjWTs7OzKlSooD179hjXXLp0Se3atVPBggWVO3duBQQE6KuvvkpTrf+WkJCg7t27G99jqVKlNHPmTKs2Xbt2VfPmzTVx4kR5e3vL3d1dY8eOVXx8vAYNGiQPDw8VKlRICxYssLpuyJAheuaZZ5Q7d24VK1ZMI0aMeGwgWLduXbVo0UJlypRR8eLF1a9fP1WoUEHbtm176DUzZszQ4MGDVb16dZUsWVITJ05UyZIl9cMPPzzyXp06ddLIkSPVoEGDR7ZzcXGRj4+PChYsqHr16qlLly7at2/fI6/5t9GjR6tSpUpavHix/P395ebmprZt2+r69etGm6+//loBAQFycnJSvnz51KBBA928eVOjR4/WwoUL9f3338tischisWjTpk3G9hBLly5VnTp1lCtXLi1ZssS417+/o3+vKv/iiy9Urlw5OTo6qkCBAurTp48kGe1atGghi8Vi02r0HTt2qFmzZmratKn8/f3VunVrNWzYUL/++qvRxt7eXk2aNFF4eLhN32FyCFwBAAAAAEC2tnXrVlWtWjXZc6NHj9asWbO0Y8cOnTlzRq+99ppmzJihsLAwrV69Wj/++KPVKrlJkyZp0aJFmjNnjv744w8NGDBAHTt21ObNm1W4cGFjj8ijR48qKirKCBFv3rypgQMHas+ePYqIiJCdnZ1atGihxMTEVI/rxo0batKkiSIiIrR//3699NJLCg4OVmRkpFW76dOnKygoSPv371fTpk3VqVMnde7cWR07dtS+fftUvHhxde7cWUlJSZKk27dvq2rVqlq9erUOHTqkHj16qFOnTlbhVVolJiaqUKFCWr58uQ4fPqyRI0fqvffe07Jly6za/fzzzzp79qy2bNmiadOmadSoUXr55ZeVN29e7dq1S2+99ZZ69uypv//+27jGxcVFoaGhOnz4sGbOnKl58+Zp+vTpKa4tKSlJEREROnr0qJ5//nmbxnT9+nV5eHik+JqU+ueff/TDDz+oRo0aNl97/Phxfffdd1q1apVWrVqlzZs3a/LkyZKkqKgotWvXTq+//rqOHDmiTZs2qWXLlkpKSlJISIhee+01vfTSS4qKilJUVJRq1apl9Dt06FD169dPR44ceew2CvfMnj1bvXv3Vo8ePfT7779r5cqVKlGihCRp9+7dkqQFCxYoKirK+JwStWrVUkREhP78809J0sGDB7Vt2zY1btzYqt2zzz6rrVu3prjfh2EPVwAAkGXYsj9cavd3AwAAWc/p06fl6+ub7Lnx48crKChIktS9e3cNGzZMx48fV7FixSRJrVu31saNGzVkyBDFxsZq4sSJ+umnnxQYGChJKlasmLZt26bPPvtMderUMcI2Ly8vqz1cW7VqZXXfL774Qp6enjp8+LDKly+fqnFVrFhRFStWND6PGzdOK1as0MqVK41Vg5LUpEkT9ezZU5I0cuRIzZ49W9WrV9err74q6e6K0MDAQJ0/f95YURkSEmJc/84772j9+vVatmyZnn322VTV+m8ODg4aM2aM8blo0aLauXOnli1bptdee8047uHhoY8++kh2dnYqVaqUPvjgA8XExOi9996TJA0bNkyTJ0/Wtm3b1LZtW0l39+q8x9/fXyEhIQoPD9fgwYMfWdO1a9dUsGBBxcbGyt7eXp9++qlefPHFFI9pypQpunHjhlX9aTFkyBANHz5cCQkJun37tmrUqKFp06bZ3E9iYqJCQ0Pl4uIi6e4K24iICE2YMEFRUVGKj49Xy5Yt5efnJ0kKCAgwrnVyclJsbKx8fHwe6Ld///5q2bKlTbWMHz9e7777rvr162ccq169uiTJ09NTkuTu7p7s/R5l6NChio6OVunSpWVvb6+EhARNmDBBHTp0sGrn6+urM2fOKDExMU37uLLCFQAAAAAAZGu3bt1Srly5kj1XoUIF4+fe3t7GY+j3H7tw4YIk6dixY4qJidGLL74oZ2dn48eiRYseuz3AX3/9pXbt2qlYsWJydXU1Hpe+txq1cePGRn/lypVL0bhu3LihkJAQlSlTRu7u7nJ2dtaRI0ceWOH67zFK1qHavWP3xpmQkKBx48YpICBAHh4ecnZ21vr1641+lyxZYjX+1K4Y/OSTT1S1alV5enrK2dlZc+fOfaD2cuXKWQVj3t7eVrXb29srX758Ru2StHTpUgUFBcnHx0fOzs4aPny40W9kZKRV7RMnTjSuc3Fx0YEDB7R7925NmDBBAwcO1KZNm1I0lrCwMI0ZM0bLli2Tl5eXpLR/T4MGDdKBAwf022+/KSIiQpLUtGlTJSQkSJJV32+99dZD+/H39zfCVkkqUKCA8X1VrFhR9evXV0BAgF599VXNmzdPV65cSVF91apVs2k8Fy5c0NmzZ1W/fn2brkuJZcuWacmSJQoLC9O+ffu0cOFCTZkyRQsXLrRq5+TkpMTERMXGxqbpfqxwBQAAAAAA2Vr+/PkfGiI5ODgYP7dYLFaf7x2799j/jRs3JEmrV69WwYIFrdo5Ojo+sobg4GD5+flp3rx58vX1VWJiosqXL687d+5Ikj7//HPdunXrgZoeJSQkRBs2bNCUKVNUokQJOTk5qXXr1kafDxvjw47dG+eHH36omTNnasaMGQoICFCePHnUv39/o99XXnnF6tH2f38XKREeHq6QkBBNnTpVgYGBcnFx0Ycffqhdu3Y9tPZ7tT7q39HOnTvVoUMHjRkzRo0aNZKbm5vCw8M1derdp598fX114MAB49r7H/+3s7MzHm+vVKmSjhw5okmTJqlu3bqPHcsbb7yh5cuXW+3LmtbvKX/+/EY9JUuW1IwZMxQYGKiNGzeqQYMGVuNwdXV9aD+P+r7s7e21YcMG7dixw9g+4/3339euXbtUtGjRR9aXJ08eq892dnbGthT33L93rpOT0yP7S4tBgwZp6NChxirngIAAnT59WpMmTVKXLl2MdpcvX1aePHnSXAuBKwAAAJCJsHUGAJivcuXK+vLLL9PcT9myZeXo6KjIyEjVqVMn2TY5c+aUJGMVonT3JVRHjx7VvHnz9Nxzz0nSAy9jSk1ouX37dnXt2lUtWrSQdDcQPnXqlM39JNdvs2bN1LFjR0l3g9g///xTZcuWlXR3Jej9KyZTe49atWqpV69exrG0vkRMuvvyJD8/P73//vvGsdOnTxs/z5EjhxFiPk5KVkJ+9dVXev311xUeHq6mTZtanTPje7qfvb29JBnBfErH8TgWi0VBQUEKCgrSyJEj5efnpxUrVmjgwIHKmTOn1Vx+FE9PT507d05JSUlGiH9/KOzi4iJ/f39FREQYL5f7NwcHhxTf734xMTEPbBFgb2//wB7Jhw4dUuXKlW3u/98IXAEAAAAAQLbWqFEjDRs2TFeuXFHevHlT3Y+Li4tCQkI0YMAAJSYmqnbt2rp27Zq2b98uV1dXdenSRX5+frJYLFq1apWaNGkiJycn5c2bV/ny5dPcuXNVoEABRUZGaujQoWkeV8mSJfXtt98qODhYFotFI0aMSNNLuO7v9+uvv9aOHTuUN29eTZs2TefPnzcC10e5ePGiVcgm3X2EPbl7LFq0SOvXr1fRokW1ePFi7d69+7GrKlNSe2RkpMLDw1W9enWtXr1aK1aseOx1kyZNUrVq1VS8eHHFxsZqzZo1Wrx4sWbPnm20GTZsmP755x8tWrRI0t1tBLp06aKZM2eqRo0aOnfunKS7Kznd3Nweeq/Lly8rMjJSZ8+elXT3BWuS5OPjY7V36fXr140A88yZMxo8eLA8PT2tXlyVVrt27VJERIQaNmwoLy8v7dq1SxcvXlSZMmUk3d2OYP369Tp69Kjy5cv3yHHVrVtXFy9e1AcffKDWrVtr3bp1Wrt2rdXq29GjR+utt96Sl5eXGjdurOvXr2v79u165513jPtFREQoKChIjo6OKf71GhwcrAkTJqhIkSIqV66c9u/fr2nTpun111+3ard161Y1bNjQ1q/pAQSuAAAAAAAgXT3tK+4DAgJUpUoVLVu2zHh5VGqNGzdOnp6emjRpkk6cOCF3d3dVqVLFeIlTwYIFNWbMGA0dOlTdunVT586dFRoaqvDwcPXt21fly5dXqVKl9NFHHz32UfXHuRco1apVS/nz59eQIUMUHR2dpj6luy+dOnHihBo1aqTcuXOrR48eat68ua5du/bYa8PCwhQWFmZ1bNy4ccZq2Xt69uyp/fv3q02bNrJYLGrXrp169eqltWvXpqn2V155RQMGDFCfPn0UGxurpk2basSIERo9evQjr7t586Z69eqlv//+W05OTipdurS+/PJLtWnTxmgTFRVltcfs3LlzFR8fr969e6t3797G8S5duig0NPSh91q5cqW6detmfL73GPyoUaOs6hw5cqRGjhwp6e7q0erVq+vHH39Uvnz5UvJVpIirq6u2bNmiGTNmKDo6Wn5+fpo6daoaN24sSXrzzTe1adMmVatWTTdu3NDGjRuN/Yf/rUyZMvr00081ceJEjRs3Tq1atVJISIjmzp1rtOnSpYtu376t6dOnKyQkRPnz51fr1q2N81OnTtXAgQM1b948FSxYUKdOndKpU6dUtGhRbdy48aG/Zj7++GONGDFCvXr10oULF+Tr66uePXsa358k/fPPP9qxY4cpq90tSf/ePCEbio6Olpubm65du/bIPS1gnri4OK1Zs0Zlr5aVvewf275YrWO23aBk2v82Akgr5jmyg6dtnvOoNdID8xzZwdM2z7ODrPjn0Nu3b+vkyZMqWrToQ19A9TRbvXq1Bg0apEOHDqXp7eQAnqyNGzeqZcuWOnHiRJpWqA8ZMkRXrlyxCoDvZ8t/41jhCgAAAAAAsr2mTZvqr7/+0j///KPChQtndDkAUmjNmjV677330hS2SpKXl5cGDhxoSk0ErgAAAAAAAJL69++f0SUAsNGHH35oSj/vvpvyp4gehzXyAAAAAAAAAGASVrgCAAAAAJ4q7FUMAMjMWOEKAAAAAABMw7u5AWRFtvy3jcAVAAAAAACkmYODgyQpJiYmgysBAPPduXNHkmRvb//YtmwpAACZVOPf9trUfm2FqulUCZB+bJ3nK9OpDgAA8Hj29vZyd3fXhQsXJEm5c+eWxWLJ4KoAIO0SExN18eJF5c6dWzlyPD5OJXAFgGyCvdAAAACQ3nx8fCTJCF0BIKuws7NTkSJFUvQXSQSuAAAAAADAFBaLRQUKFJCXl5fi4uIyuhwAME3OnDllZ5ey3VkJXAGx8g8AAAAAzGRvb5+ifQ4BICvipVkAAAAAAAAAYBJWuAIAAAAZiJfDAQAAZC2scAUAAAAAAAAAkxC4AgAAAAAAAIBJ2FIAAAAAAJCu2DoDAJCdsMIVAAAAAAAAAExC4AoAAAAAAAAAJmFLAWRJPLIEAAAAAACAjMAKVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAAAAAMAkBK4AAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJgkQwPX2bNnq0KFCnJ1dZWrq6sCAwO1du1a4/zt27fVu3dv5cuXT87OzmrVqpXOnz9v1UdkZKSaNm2q3Llzy8vLS4MGDVJ8fPyTHgoAAAAAAAAAZGzgWqhQIU2ePFl79+7Vnj179MILL6hZs2b6448/JEkDBgzQDz/8oOXLl2vz5s06e/asWrZsaVyfkJCgpk2b6s6dO9qxY4cWLlyo0NBQjRw5MqOGBAAAAAAAACAby5GRNw8ODrb6PGHCBM2ePVu//PKLChUqpPnz5yssLEwvvPCCJGnBggUqU6aMfvnlF9WsWVM//vijDh8+rJ9++kne3t6qVKmSxo0bpyFDhmj06NHKmTNnRgwLAAAAAAAAQDaVoYHr/RISErR8+XLdvHlTgYGB2rt3r+Li4tSgQQOjTenSpVWkSBHt3LlTNWvW1M6dOxUQECBvb2+jTaNGjfT222/rjz/+UOXKlZO9V2xsrGJjY43P0dHRkqS4uDjFxcWl0whxv3vfc4ISUtY+Icmm/h0SE22rx87ehsbMEaQM8xzZAfMc2QHzHNkB8/zJ48+eAJB1WZKSkmz7ndJkv//+uwIDA3X79m05OzsrLCxMTZo0UVhYmLp162YVjErSs88+q3r16uk///mPevToodOnT2v9+vXG+ZiYGOXJk0dr1qxR48aNk73n6NGjNWbMmAeOh4WFKXfu3OYOEAAAAACAf4mJiVH79u117do1ubq6ZnQ5AAATZfgK11KlSunAgQO6du2avv76a3Xp0kWbN29O13sOGzZMAwcOND5HR0ercOHCatiwIb/RPSFxcXHasGGDSl0tJXs9/m+v/WucsKn/Vrc9bGof/sPyFLd1GDrBpr6RfTHPkR0wz5EdMM+RHTDPn7x7T1oCALKeDA9cc+bMqRIlSkiSqlatqt27d2vmzJlq06aN7ty5o6tXr8rd3d1of/78efn4+EiSfHx89Ouvv1r1d/78eePcwzg6OsrR0fGB4w4ODnJwcEjrkGAD+///z+M42Fts6jfOzrb3wTkkpuzRKUnMEdiMeY7sgHmO7IB5juyAef7kZNa6AQCPZ9vvek9AYmKiYmNjVbVqVTk4OCgiIsI4d/ToUUVGRiowMFCSFBgYqN9//10XLlww2mzYsEGurq4qW7bsE68dAAAAAAAAQPaWoStchw0bpsaNG6tIkSK6fv26wsLCtGnTJq1fv15ubm7q3r27Bg4cKA8PD7m6uuqdd95RYGCgatasKUlq2LChypYtq06dOumDDz7QuXPnNHz4cPXu3TvZFawAAAAAAAAAkJ4yNHC9cOGCOnfurKioKLm5ualChQpav369XnzxRUnS9OnTZWdnp1atWik2NlaNGjXSp59+alxvb2+vVatW6e2331ZgYKDy5MmjLl26aOzYsRk1JAAAAAAAAADZWIYGrvPnz3/k+Vy5cumTTz7RJ5988tA2fn5+WrNmjdmlAQAAAAAAAIDNnro9XAEAAAAAAAAgsyJwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGCSHLZekJiYqM2bN2vr1q06ffq0YmJi5OnpqcqVK6tBgwYqXLhwetQJAAAAAAAAAE+9FK9wvXXrlsaPH6/ChQurSZMmWrt2ra5evSp7e3sdO3ZMo0aNUtGiRdWkSRP98ssv6VkzAAAAAAAAADyVUrzC9ZlnnlFgYKDmzZunF198UQ4ODg+0OX36tMLCwtS2bVu9//77evPNN00tFgAAAAAAAACeZikOXH/88UeVKVPmkW38/Pw0bNgwhYSEKDIyMs3FAQAAAAAAAEBmkuItBR4Xtt7PwcFBxYsXT1VBAAAAAAAAAJBZpThwvd+6deu0bds24/Mnn3yiSpUqqX379rpy5YppxQEAAAAAAABAZpKqwHXQoEGKjo6WJP3+++9699131aRJE508eVIDBw40tUAAAAAAAAAAyCxSvIfr/U6ePKmyZctKkr755hu9/PLLmjhxovbt26cmTZqYWiAAAAAAAAAAZBapWuGaM2dOxcTESJJ++uknNWzYUJLk4eFhrHwFAAAAAAAAgOwmVStca9eurYEDByooKEi//vqrli5dKkn6888/VahQIVMLBAAAAAAAAIDMIlUrXGfNmqUcOXLo66+/1uzZs1WwYEFJ0tq1a/XSSy+ZWiAAAAAAAAAAZBapWuFapEgRrVq16oHj06dPT3NBAAAAAAAAAJBZpSpwvefChQu6cOGCEhMTrY5XqFAhTUUBAAAAAAAAQGaUqsB179696tKli44cOaKkpCRJksViUVJSkiwWixISEkwtEgAAAAAAAAAyg1QFrq+//rqeeeYZzZ8/X97e3rJYLGbXBQAAAAAAAACZTqoC1xMnTuibb75RiRIlzK4HAAAAAAAAADItu9RcVL9+fR08eNDsWgAAAAAAAAAgU0vVCtfPP/9cXbp00aFDh1S+fHk5ODhYnX/llVdMKQ4AAAAAAAAAMpNUBa47d+7U9u3btXbt2gfO8dIsAAAAAAAAANlVqgLXd955Rx07dtSIESPk7e1tdk0AAAAAgLT6rw0vN05wkvRVupUCAEB2kqrA9dKlSxowYABhKwAAADIngigAAACkk1S9NKtly5bauHGj2bUAAAAAAAAAQKaWqhWuzzzzjIYNG6Zt27YpICDggZdm9e3b15TiAOCpw4ooZAfMcwAAAABItVQFrp9//rmcnZ21efNmbd682eqcxWIhcM2u+AM6AAAAAAAAsrlUBa4nT540uw4AAAAAAAAAyPRStYcrAAAAAAAAAOBBKQ5cJ0+erFu3bqWo7a5du7R69epUFwUAAAAAAAAAmVGKtxQ4fPiwihQpoldffVXBwcGqVq2aPD09JUnx8fE6fPiwtm3bpi+//FJnz57VokWL0q1oPBnXxoyxqb1bm3QqBAAAAAAAAMgkUhy4Llq0SAcPHtSsWbPUvn17RUdHy97eXo6OjoqJiZEkVa5cWW+88Ya6du2qXLlypVvRAAAAAAAAAPA0sumlWRUrVtS8efP02Wef6bffftPp06d169Yt5c+fX5UqVVL+/PnTq04AAADgoXgyBwAAAE8LmwLXe+zs7FSpUiVVqlTJ5HIAAIDZCKIAAAAA4MlJVeAKAFkFQRQAAMgs+P8WAAAyB7uMLgAAAAAAAAAAsgoCVwAAAAAAAAAwCYErAAAAAAAAAJjEpsDV3t5eFy5cSK9aAAAAAAAAACBTsylwTUpKSq86AAAAAAAAACDTY0sBAAAAAAAAADBJDlsv+Pzzz+Xs7PzINn379k11QQAAAAAAAACQWdkcuM6ZM0f29vYPPW+xWAhcAQAAAAAAAGRLNgeue/bskZeXV3rUAgAAAAAAAACZmk17uFoslvSqAwAAAAAAAAAyPZsC16SkpPSqAwAAAAAAAAAyPZsC11GjRj32hVkAAAAAAAAAkF3ZFLjOnz9ft27dMj7PmjVL0dHRphcFAAAAAAAAAJmRTYHr33//rYSEBOPze++9p//973+mFwUAAAAAAAAAmZFNgeu/sacrAAAAAAAAAPyfNAWuAAAAAAAAAID/k8PWCz7//HPjxVnx8fEKDQ1V/vz5rdr07dvXnOoAAAAAAAAAIBOxKXAtUqSI5s2bZ3z28fHR4sWLrdpYLBYCVwAAAAAAAADZkk2B66lTp9KpDAAAAAAAAADI/Gzaw/WFF17Q1atX06kUAAAAAAAAAMjcbApcN23apDt37qRXLQAAAAAAAACQqdkUuAIAAAAAAAAAHs6mPVwl6fDhwzp37twj21SoUCHVBQEAAAAAAABAZmVz4Fq/fn0lJSU9cNxisSgpKUkWi0UJCQmmFAcAAAAAAAAAmYnNgeuuXbvk6emZHrUAAAAAAAAAQKZmc+BapEgReXl5pUctAAAAAAAAAJCp8dIsAAAAAAAAADCJTYFrnTp1dOfOnfSqBQAAAAAAAAAyNZsC1y1btihnzpzpVQsAAAAAAAAAZGo2Ba5JSUnpVQcAAAAAAAAAZHo27+FqsVjSow4AAAAAAAAAyPRy2HrBM88889jQ9fLly6kuCAAAAAAAAAAyK5sD1zFjxsjNzS09agEAAAAAAACATM3mwLVt27by8vJKj1oAAAAAAAAAIFOzaQ9X9m8FAAAAAAAAgIezKXBNSkpKrzoAAAAAAAAAINOzaUuBxMTE9KoDAAAAAAAAADI9m1a4AgAAAAAAAAAejsAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJBkauE6aNEnVq1eXi4uLvLy81Lx5cx09etSqze3bt9W7d2/ly5dPzs7OatWqlc6fP2/VJjIyUk2bNlXu3Lnl5eWlQYMGKT4+/kkOBQAAAAAAAAAyNnDdvHmzevfurV9++UUbNmxQXFycGjZsqJs3bxptBgwYoB9++EHLly/X5s2bdfbsWbVs2dI4n5CQoKZNm+rOnTvasWOHFi5cqNDQUI0cOTIjhgQAAAAAAAAgG8uRkTdft26d1efQ0FB5eXlp7969ev7553Xt2jXNnz9fYWFheuGFFyRJCxYsUJkyZfTLL7+oZs2a+vHHH3X48GH99NNP8vb2VqVKlTRu3DgNGTJEo0ePVs6cOR+4b2xsrGJjY43P0dHRkqS4uDjFxcWl44gzl3g72/L4uASnlLdNvNs2QQkp7DvJplocEhNtah9nZ29DY+ZIVsI8f1hj5nlWwjx/WGPmeVbCPH9YY+Z5VsI8f1jjzDnP+bMnAGRdlqSkJNt+p0xHx44dU8mSJfX777+rfPny+vnnn1W/fn1duXJF7u7uRjs/Pz/1799fAwYM0MiRI7Vy5UodOHDAOH/y5EkVK1ZM+/btU+XKlR+4z+jRozVmzJgHjoeFhSl37tzpMTQAAAAAAAwxMTFq3769rl27JldX14wuBwBgogxd4Xq/xMRE9e/fX0FBQSpfvrwk6dy5c8qZM6dV2CpJ3t7eOnfunNHG29v7gfP3ziVn2LBhGjhwoPE5OjpahQsXVsOGDfmN7j7Rkyfb1N615aQUt41LdNKGE1+o1NVSstfj//bav8YJm2ppddvDpvbhPyxPcVuHoRNs6htPN+Z58pjnWQvzPHnM86yFeZ485nnWwjxPXmad5/eetAQAZD1PTeDau3dvHTp0SNu2bUv3ezk6OsrR0fGB4w4ODnJwcEj3+2cWOWx87MfB/pbN97D///88vm+LTf3G2fi4lUNiyh6dksQcyWKY5w9pyzzPUpjnD2nLPM9SmOcPacs8z1KY5w9pm0nneWatGwDweBn60qx7+vTpo1WrVmnjxo0qVKiQcdzHx0d37tzR1atXrdqfP39ePj4+Rpvz588/cP7eOQAAAAAAAAB4UjI0cE1KSlKfPn20YsUK/fzzzypatKjV+apVq8rBwUERERHGsaNHjyoyMlKBgYGSpMDAQP3++++6cOGC0WbDhg1ydXVV2bJln8xAAAAAAAAAAEAZvKVA7969FRYWpu+//14uLi7Gnqtubm5ycnKSm5ubunfvroEDB8rDw0Ourq565513FBgYqJo1a0qSGjZsqLJly6pTp0764IMPdO7cOQ0fPly9e/dOdtsAAAAAAAAAAEgvGRq4zp49W5JUt25dq+MLFixQ165dJUnTp0+XnZ2dWrVqpdjYWDVq1Eiffvqp0dbe3l6rVq3S22+/rcDAQOXJk0ddunTR2LFjn9QwAAAAAAAAAEBSBgeuSUlJj22TK1cuffLJJ/rkk08e2sbPz09r1qwxszQAAAAAAAAAsNlT8dIsAAAAAAAAAMgKCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQ5MroAAAAAAMiuJu//X4rbvp2OdQAAAPMQuALIcviDC7ID5jkAAAAAPJ3YUgAAAAAAAAAATMIKVwAAADyVWMkNAACAzIjANZvhDy4AAAAAAABA+mFLAQAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAm4aVZAJ56M6/MtPGKDulSBwAAAAAAwOMQuGZyBFEAAAAAAADA04MtBQAAAAAAAADAJKxwBQDgKcATCwCQNfDfcwAAQOAKAACAJ4IgCgAAANkBWwoAAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJsnQwHXLli0KDg6Wr6+vLBaLvvvuO6vzSUlJGjlypAoUKCAnJyc1aNBAf/31l1Wby5cvq0OHDnJ1dZW7u7u6d++uGzduPMFRAAAAAAAAAMBdGRq43rx5UxUrVtQnn3yS7PkPPvhAH330kebMmaNdu3YpT548atSokW7fvm206dChg/744w9t2LBBq1at0pYtW9SjR48nNQQAAAAAAAAAMOTIyJs3btxYjRs3TvZcUlKSZsyYoeHDh6tZs2aSpEWLFsnb21vfffed2rZtqyNHjmjdunXavXu3qlWrJkn6+OOP1aRJE02ZMkW+vr5PbCwAAAAAAAAAkKGB66OcPHlS586dU4MGDYxjbm5uqlGjhnbu3Km2bdtq586dcnd3N8JWSWrQoIHs7Oy0a9cutWjRItm+Y2NjFRsba3yOjo6WJMXFxSkuLi6dRpQ+LPEWm9rbJcanuG28nW0LoOMSnFLeNvFu2wQlpLDvJJtqcUhMtKl9nJ29DY0z1xzJCpjnyWOeZy3M8+Qxz7MW5nnymOdZC/M8eczzB2W2P3sCAFLOkpSUZNvvlOnEYrFoxYoVat68uSRpx44dCgoK0tmzZ1WgQAGj3WuvvSaLxaKlS5dq4sSJWrhwoY4ePWrVl5eXl8aMGaO333472XuNHj1aY8aMeeB4WFiYcufObd6gAAAAAABIRkxMjNq3b69r167J1dU1o8sBAJjoqV3hmp6GDRumgQMHGp+jo6NVuHBhNWzYMNP9Rjf76myb2t+OfC3FbbuvmWdT364tJ6W4bVyikzac+EKlrpaSvR7/t9f+NU7YVEur2x42tQ//YXmK2zoMnWBT30g75nnymOdZC/M8eczzrIV5njzmedbCPE8e8/xB9560BABkPU9t4Orj4yNJOn/+vNUK1/Pnz6tSpUpGmwsXLlhdFx8fr8uXLxvXJ8fR0VGOjo4PHHdwcJCDg4MJ1T85STlsW6CcaJfyf+U5bHzsx8H+lk3tJcn+///z+L5tezQrzsbHrRwSU/bolKRMN0eyAuZ58pjnWQvzPHnM86yFeZ485nnWwjxPHvP8QZm1bgDA49n2u94TVLRoUfn4+CgiIsI4Fh0drV27dikwMFCSFBgYqKtXr2rv3r1Gm59//lmJiYmqUaPGE68ZAAAAAAAAQPaWoStcb9y4oWPHjhmfT548qQMHDsjDw0NFihRR//79NX78eJUsWVJFixbViBEj5Ovra+zzWqZMGb300kt68803NWfOHMXFxalPnz5q27atfH19M2hUAAAAAAAAALKrDA1c9+zZo3r16hmf7+2r2qVLF4WGhmrw4MG6efOmevTooatXr6p27dpat26dcuXKZVyzZMkS9enTR/Xr15ednZ1atWqljz766ImPBQAAAAAAAAAyNHCtW7eukpIevseRxWLR2LFjNXbs2Ie28fDwUFhYWHqUBwAAAAAAAAA2eWr3cAUAAAAAAACAzIbAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADBJlglcP/nkE/n7+ytXrlyqUaOGfv3114wuCQAAAAAAAEA2kyUC16VLl2rgwIEaNWqU9u3bp4oVK6pRo0a6cOFCRpcGAAAAAAAAIBvJkdEFmGHatGl688031a1bN0nSnDlztHr1an3xxRcaOnToA+1jY2MVGxtrfL527Zok6fLly4qLi3syRZsk9lrs4xvd5070lRS3vXLnjk19x1/LleK2cYm5FBMTo2u3rsle9o9tf+naDZtq0W0Hm5pfuhOf4rYOly7ZVgvSjHn+EMzzLIV5/hDM8yyFef4QzPMshXn+EMzzB1y/fl2SlJSUlMGVAADMZknK5P91v3PnjnLnzq2vv/5azZs3N4536dJFV69e1ffff//ANaNHj9aYMWOeYJUAAAAAADzozJkzKlSoUEaXAQAwUaZf4fq///1PCQkJ8vb2tjru7e2t//73v8leM2zYMA0cOND4nJiYqMuXLytfvnyyWCzpWi/uio6OVuHChXXmzBm5urpmdDlAumCeIztgniM7YJ4jO2CeP3lJSUm6fv26fH19M7oUAIDJMn3gmhqOjo5ydHS0Oubu7p4xxWRzrq6u/A8dsjzmObID5jmyA+Y5sgPm+ZPl5uaW0SUAANJBpn9pVv78+WVvb6/z589bHT9//rx8fHwyqCoAAAAAAAAA2VGmD1xz5sypqlWrKiIiwjiWmJioiIgIBQYGZmBlAAAAAAAAALKbLLGlwMCBA9WlSxdVq1ZNzz77rGbMmKGbN2+qW7duGV0aHsLR0VGjRo16YGsHICthniM7YJ4jO2CeIztgngMAYB5LUlJSUkYXYYZZs2bpww8/1Llz51SpUiV99NFHqlGjRkaXBQAAAAAAACAbyTKBKwAAAAAAAABktEy/hysAAAAAAAAAPC0IXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAASDXevQkAAAAA1ghcAQBAqjk6OurIkSMZXQYAIBW2bt2qjh07KjAwUP/8848kafHixdq2bVsGVwYAQOaWI6MLAM6cOaNRo0bpiy++yOhSgDS5deuW9u7dKw8PD5UtW9bq3O3bt7Vs2TJ17tw5g6oD0mbgwIHJHk9ISNDkyZOVL18+SdK0adOeZFlAupg1a5Z+/fVXNWnSRG3bttXixYs1adIkJSYmqmXLlho7dqxy5OB/o5G5ffPNN+rUqZM6dOig/fv3KzY2VpJ07do1TZw4UWvWrMngCgEAyLwsSTwLiAx28OBBValSRQkJCRldCpBqf/75pxo2bKjIyEhZLBbVrl1b4eHhKlCggCTp/Pnz8vX1ZZ4j07Kzs1PFihXl7u5udXzz5s2qVq2a8uTJI4vFop9//jljCgRMMn78eH3wwQdq2LChtm/frv79++vDDz/UgAEDZGdnp+nTp+vtt9/WmDFjMrpUIE0qV66sAQMGqHPnznJxcdHBgwdVrFgx7d+/X40bN9a5c+cyukQAADIt/moe6W7lypWPPH/ixIknVAmQfoYMGaLy5ctrz549unr1qvr376+goCBt2rRJRYoUyejygDSbOHGi5s6dq6lTp+qFF14wjjs4OCg0NPSBVd1AZhUaGqrQ0FC1bNlSBw8eVNWqVbVw4UJ16NBBklS6dGkNHjyYwBWZ3tGjR/X8888/cNzNzU1Xr1598gUBAJCFELgi3TVv3lwWi+WRL1axWCxPsCLAfDt27NBPP/2k/PnzK3/+/Prhhx/Uq1cvPffcc9q4caPy5MmT0SUCaTJ06FDVr19fHTt2VHBwsCZNmiQHB4eMLgsw3dmzZ1WtWjVJUsWKFWVnZ6dKlSoZ56tUqaKzZ89mUHWAeXx8fHTs2DH5+/tbHd+2bZuKFSuWMUUBAJBF8NIspLsCBQro22+/VWJiYrI/9u3bl9ElAml269Ytq/38LBaLZs+ereDgYNWpU0d//vlnBlYHmKN69erau3evLl68qGrVqunQoUP8hRmyHB8fHx0+fFiS9NdffykhIcH4LEl//PGHvLy8Mqo8wDRvvvmm+vXrp127dslisejs2bNasmSJQkJC9Pbbb2d0eQAAZGqscEW6q1q1qvbu3atmzZole/5xq1+BzKB06dLas2ePypQpY3V81qxZkqRXXnklI8oCTOfs7KyFCxcqPDxcDRo0YF9iZDkdOnRQ586d1axZM0VERGjw4MEKCQnRpUuXZLFYNGHCBLVu3TqjywTSbOjQoUpMTFT9+vUVExOj559/Xo6OjgoJCdE777yT0eUBAJCp8dIspLutW7fq5s2beumll5I9f/PmTe3Zs0d16tR5wpUB5pk0aZK2bt360Df69urVS3PmzFFiYuITrgxIP3///bf27t2rBg0asG0GsozExERNnjxZO3fuVK1atTR06FAtXbpUgwcPVkxMjIKDgzVr1izmPLKMO3fu6NixY7px44bKli0rZ2fnjC4JAIBMj8AVAAAAAAAAAEzClgIAAAAAkM3cvHlTkydPVkREhC5cuPDAUzgnTpzIoMoAAMj8CFwBAAAAIJt54403tHnzZnXq1EkFChTgJYgAAJiILQUAAAAAIJtxd3fX6tWrFRQUlNGlAACQ5dhldAEAAAAAgCcrb9688vDwyOgyAADIkghcAQAAACCbGTdunEaOHKmYmJiMLgUAgCyHLQUAAAAAIJupXLmyjh8/rqSkJPn7+8vBwcHq/L59+zKoMgAAMj9emgUAAAAA2Uzz5s0zugQAALIsVrgCAAAAAAAAgEnYwxUAAAAAAAAATMKWAgAAAACQDXh4eOjPP/9U/vz5lTdvXlksloe2vXz58hOsDACArIXAFQAAAACygenTp8vFxUWSNGPGjIwtBgCALIw9XAEAAAAAAADAJKxwBQAAAIBsIDo6OsVtXV1d07ESAACyNla4AgAAAEA2YGdn98h9WyUpKSlJFotFCQkJT6gqAACyHla4AgAAAEA2sHHjxowuAQCAbIEVrgAAAACQDbRs2VKhoaFydXXVokWL1KZNGzk6OmZ0WQAAZDkErgAAAACQDeTMmVOnT59WgQIFZG9vr6ioKHl5eWV0WQAAZDlsKQAAAAAA2UDp0qU1bNgw1atXT0lJSVq2bNlDX47VuXPnJ1wdAABZBytcAQAAACAb2L59u959910dP35cly9flouLS7Iv0bJYLLp8+XIGVAgAQNZA4AoAAAAA2YydnZ3OnTvHlgIAAKQDu4wuAAAAAACQ/lq2bKno6GhJ0oIFC+Ti4pLBFQEAkDWxwhUAAAAAsgFemgUAwJPBS7MAAAAAIBvgpVkAADwZrHAFAAAAgGxgx44dGjhwIC/NAgAgnRG4AgAAAEA2Y2dnp6ioKHl7e2d0KQAAZDkErgAAAACQzZw+fVqurq764osvdOTIEUlSuXLl1L1794duMwAAAFKGwBUAAAAAspk9e/aoUaNGcnJy0rPPPitJ2r17t27duqX169eratWqGVwhAACZF4ErAAAAAGQzzz33nEqUKKF58+YpR46771KOj4/XG2+8oRMnTmjLli0ZXCEAAJkXgSsAAAAAZDNOTk7av3+/SpcubXX88OHDqlatmmJiYjKoMgAAMj+7jC4AAAAAAPBkubq6KjIy8oHjZ86ckYuLSwZUBABA1kHgCgAAAADZTJs2bdS9e3ctXbpUZ86c0ZkzZxQeHq433nhD7dq1y+jyAADI1HJkdAEAAAAAgCdrypQpslgs6ty5s+Lj4yVJDg4OevvttzV58uQMrg4AgMyNPVwBAAAAIJuKiYnR8ePHJUnFixdX7ty5M7giAAAyPwJXAAAAAAAAADAJe7gCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAA2ZDFYtF3331nWn+bNm2SxWLR1atXTesTAAAAADIjAlcAADKJnTt3yt7eXk2bNk1zX1FRUWrcuLEJVaWcv7+/LBaLLBaLcufOrYCAAH3++ec292N2WAwAAAAAZiJwBQAgk5g/f77eeecdbdmyRWfPnn1k26SkJMXHxz9w/M6dO5IkHx8fOTo6pkudjzJ27FhFRUXp0KFD6tixo958802tXbv2idcBAAAAAOmFwBUAgEzgxo0bWrp0qd5++201bdpUoaGhVufvPdK/du1aVa1aVY6Ojtq2bZvq1q2rPn36qH///sqfP78aNWokyXqVaK1atTRkyBCr/i5evCgHBwdt2bJFkrR48WJVq1ZNLi4u8vHxUfv27XXhwgWbx3Hv+mLFimnIkCHy8PDQhg0bjPO7d+/Wiy++qPz588vNzU116tTRvn37jPP+/v6SpBYtWshisRifJen7779XlSpVlCtXLhUrVkxjxoxJNnQGAAAAgPRE4AoAQCawbNkylS5dWqVKlVLHjh31xRdfKCkp6YF2Q4cO1eTJk3XkyBFVqFBBkrRw4ULlzJlT27dv15w5cx64pkOHDgoPD7fqb+nSpfL19dVzzz0nSYqLi9O4ceN08OBBfffddzp16pS6du2a6vEkJibqm2++0ZUrV5QzZ07j+PXr19WlSxdt27ZNv/zyi0qWLKkmTZro+vXrku4GspK0YMECRUVFGZ+3bt2qzp07q1+/fjp8+LA+++wzhYaGasKECamuEQAAAABSw5KU3J/WAADAUyUoKEivvfaa+vXrp/j4eBUoUEDLly9X3bp1Jd1d4VqvXj199913atasmXFd3bp1FR0dbbVKVLq7wnXFihVq3ry5Ll68KF9fX/38889GwFqrVi09//zzmjx5crL17NmzR9WrV9f169fl7Oxs3P/KlStyd3dP9hp/f39FRUXJwcFBsbGxio+Pl4eHh3bt2qUSJUoke01iYqLc3d0VFhaml19++YHa72nQoIHq16+vYcOGGce+/PJLDR48+LHbLwAAAACAmVjhCgDAU+7o0aP69ddf1a5dO0lSjhw51KZNG82fP/+BttWqVXvgWNWqVR/Zv6enpxo2bKglS5ZIkk6ePKmdO3eqQ4cORpu9e/cqODhYRYoUkYuLi+rUqSNJioyMtGksgwYN0oEDB/Tzzz+rRo0amj59ulXYev78eb355psqWbKk3Nzc5Orqqhs3bjz2PgcPHtTYsWPl7Oxs/HjzzTcVFRWlmJgYm2oEAAAAgLTIkdEFAACAR5s/f77i4+Pl6+trHEtKSpKjo6NmzZolNzc343iePHkeuD65Y//WoUMH9e3bVx9//LHCwsIUEBCggIAASdLNmzfVqFEjNWrUSEuWLJGnp6ciIyPVqFEj4yVcKZU/f36VKFFCJUqU0PLlyxUQEKBq1aqpbNmykqQuXbro0qVLmjlzpvz8/OTo6KjAwMDH3ufGjRsaM2aMWrZs+cC5XLly2VQjAAAAAKQFK1wBAHiKxcfHa9GiRZo6daoOHDhg/Dh48KB8fX311VdfmXKfZs2a6fbt21q3bp3CwsKsVrf+97//1aVLlzR58mQ999xzKl26dKpemPVvhQsXVps2bay2Adi+fbv69u2rJk2aqFy5cnJ0dNT//vc/q+scHByUkJBgdaxKlSo6evSoEebe/8POjv/dAQAAAPDksMIVAICn2KpVq3TlyhV1797daiWrJLVq1Urz58/XW2+9leb75MmTR82bN9eIESN05MgRY/sCSSpSpIhy5sypjz/+WG+99ZYOHTqkcePGpfmektSvXz+VL19ee/bsUbVq1VSyZEktXrxY1apVU3R0tAYNGiQnJyera/z9/RUREaGgoCA5Ojoqb968GjlypF5++WUVKVJErVu3lp2dnQ4ePKhDhw5p/PjxptQKAAAAACnBkg8AAJ5i8+fPV4MGDR4IW6W7geuePXv022+/mXKvDh066ODBg3ruuedUpEgR47inp6dCQ0O1fPlylS1bVpMnT9aUKVNMuWfZsmXVsGFDjRw5UtLd8V65ckVVqlRRp06d1LdvX3l5eVldM3XqVG3YsEGFCxdW5cqVJUmNGjXSqlWr9OOPP6p69eqqWbOmpk+fLj8/P1PqBAAAAICUsiQlJSVldBEAAAAAAAAAkBWwwhUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAAAAAMAkBK4AAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJP8P6J8V3JpZ4/JAAAAAElFTkSuQmCC",
+      "text/plain": [
+       "<Figure size 1200x800 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Create a list to store the TTFT data\n",
+    "ttft_data = []\n",
+    "\n",
+    "# Iterate over the models, batch sizes, and arrival rates to calculate TTFT\n",
+    "for ssm in small_model_names:\n",
+    "    for batch_size in batch_sizes:\n",
+    "        for arrival_rate in arrival_rates:\n",
+    "            model_name = ssm.replace(\"/\", \"-\")\n",
+    "            filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "            if os.path.exists(filepath):\n",
+    "                ttft = get_ttft(filepath)\n",
+    "                ttft_data.append({\n",
+    "                    'Model': model_name,\n",
+    "                    'Batch Size': batch_size,\n",
+    "                    'Arrival Rate': arrival_rate,\n",
+    "                    'TTFT': ttft\n",
+    "                })\n",
+    "# add incremental decoding entry\n",
+    "for batch_size in batch_sizes:\n",
+    "    for arrival_rate in arrival_rates:\n",
+    "        model_name = ssm.replace(\"/\", \"-\")\n",
+    "        filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "        if os.path.exists(filepath):\n",
+    "            ttft = get_ttft(filepath)\n",
+    "            ttft_data.append({\n",
+    "                'Model': \"Incr Dec (baseline)\",\n",
+    "                'Batch Size': batch_size,\n",
+    "                'Arrival Rate': arrival_rate,\n",
+    "                'TTFT': ttft\n",
+    "            })\n",
+    "\n",
+    "# Convert the list to a DataFrame\n",
+    "ttft_df = pd.DataFrame(ttft_data)\n",
+    "print(ttft_df.head())\n",
+    "\n",
+    "# Pivot the dataframe to have models and batch sizes as columns\n",
+    "pivot_df = ttft_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='TTFT')\n",
+    "\n",
+    "# Plot the data\n",
+    "fig, ax = plt.subplots(figsize=(12, 8))\n",
+    "\n",
+    "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n",
+    "pivot_df.plot(kind='bar', ax=ax, color=colors)\n",
+    "\n",
+    "ax.set_title('TTFT vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n",
+    "ax.set_xlabel('Arrival Rate')\n",
+    "ax.set_ylabel('TTFT (ms)')\n",
+    "ax.grid(True)\n",
+    "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "\n",
+    "# Save the plot as a PDF\n",
+    "plt.savefig('/usr/FlexFlow/wildchat/ttft_vs_arrival_rate.pdf')\n",
+    "\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n",
+      "/tmp/ipykernel_3339078/2453520981.py:58: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "  queueing_time = group.apply(lambda x: x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -2][\"timestamp\"].values[0])\n",
+      "/tmp/ipykernel_3339078/2453520981.py:60: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  return queueing_time.mean()[1] / 1000000\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Model  Batch Size Arrival Rate  Queueing Time\n",
+      "0  Zhuominc-Llama-3-330M           4      offline     376.053818\n",
+      "1  Zhuominc-Llama-3-330M           4            1     319.585296\n",
+      "2  Zhuominc-Llama-3-330M           4            2     346.747481\n",
+      "3  Zhuominc-Llama-3-330M           4            4     360.138720\n",
+      "4  Zhuominc-Llama-3-330M           4            8     368.694877\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAABVwAAALvCAYAAACZeQ7oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeVzN2f8H8NetaC+tWqSSKCTKbixhREnZSyhbDCEmY4vKNvZtxjJjkCGyb2OXrez7mmUalUnJWlIq9fn94dfn67qVWyKm1/PxuI/v3PM5n3Pen0/nXt/enc85EkEQBBARERERERERERHRJ1Mo6wCIiIiIiIiIiIiI/iuYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIiIiISgkTrkRERERERERERESlhAlXIiIiIiIiIiIiolLChCsRERERERERERFRKWHClYiIiIiIiIiIiKiUMOFKREREREREREREVEqYcCWib56FhQV8fX3LOoxCHT9+HBKJBMePHy/rUP4z4uLiIJFIEBYW9tn6CAkJgUQi+Wztl5X79++jffv20NbWhkQiwc6dO8s6JCmFfV7WrVsHGxsbVKhQAZUqVRLL586di2rVqkFRURH16tX7orH+F33K91VYWBgkEgni4uJKPa7P5Vv+fs6/3xcvXvzsfUkkEoSEhHz2foiIiIj+K5hwJSpnbt26hT59+sDU1BTKysowMTFBnz59cPv27bIO7Zvi6+sLiUTy0dfXnAj+El6+fAkVFRVIJBLExMSUdTifxYdjQVlZGTVq1MCUKVPw5s2bErV5+/ZthISEfJbElY+PD27cuIEZM2Zg3bp1aNCgQan3kS8/MZ7/qlChAvT19dGsWTNMnDgRCQkJcrVz584d+Pr6wsrKCitXrsTvv/8OADh06BB++uknNG/eHGvWrMHMmTM/27V8qtOnTyMkJAQvX76Uq37+uNLS0kJmZqbM8fv374v3dd68eaUcLX0oP7n5/svQ0BBOTk7Yv39/idudOXNmmf7RIzo6Gh07doSpqSlUVFRQtWpVuLm5YcOGDWUWExEREdF/gVJZB0BEX8727dvh5eUFXV1dDBw4EJaWloiLi8OqVauwdetWbNq0Ce7u7mUdZrHdvXsXCgpf9u9HQ4YMQbt27cT3Dx48wJQpU+Dn54cWLVqI5VZWVmjcuDEyMzNRsWLFLxrj12DLli2QSCQwMjJCeHg4pk+fXirtmpubIzMzExUqVCiV9j6VsrIy/vjjDwBAamoqdu3ahWnTpiE2Nhbh4eHFbu/27dsIDQ1F69atYWFhUWpxZmZm4syZM5g0aRL8/f1Lrd2P8fLygouLC/Ly8vDixQtcuHABixYtwuLFi7Fq1Sp4enqKdVu2bCnzeTl+/Djy8vKwePFiVK9eXSw/evQoFBQUsGrVqq/+83X69GmEhobC19dXaoZuUZSUlJCRkYE9e/agZ8+eUsfCw8OhoqJS4qQ+lczUqVNhaWkJQRDw+PFjhIWFwcXFBXv27EGnTp2K3d7MmTPRvXt3eHh4lH6wH7Flyxb06tUL9erVw6hRo6Cjo4MHDx7g5MmTWLlyJXr37i3WzczMhJISf20gIiIikhf/nxNROREbG4u+ffuiWrVqOHnyJAwMDMRjo0aNQosWLdCnTx9cv34dlpaWZRhp8SkrK3/xPps2bYqmTZuK7y9evIgpU6agadOm6NOnj0x9FRWVLxneV2P9+vVwcXGBubk5NmzYIHfCNSMjA2pqajLlb9++RV5eHipWrPhV3VMlJSWpn/uwYcPQrFkzbNy4EQsWLEDlypXLMLr/efLkCQDInfCTx+vXr6Gurl5kHQcHB5nPRXx8PNq3bw8fHx/Y2trC3t4eAKCgoCDzs01JSSkw7pSUFKiqqpZqsrWwsVcWlJWV0bx5c2zcuFEm4bphwwa4urpi27ZtZRRd+dSxY0epWeEDBw5E5cqVsXHjxhIlXMtSSEgIatWqhbNnz8p8hvI/c/m+pu9bIiIiom8BlxQgKifmzp2LjIwM/P7771LJVgDQ19fHb7/9hvT0dMydO1cs9/X1LXB2XWFrW65fvx6Ojo5QVVWFrq4uPD098fDhQ6k6ha232rp1a7Ru3VqqLCsrC8HBwahevTqUlZVhZmaGn376CVlZWUW2mf/o56lTpzBmzBgYGBhAXV0dXbp0ERNO+fLy8hASEgITExOoqanByckJt2/fLtV1YQtaI7B169aoU6cOrl+/jlatWkFNTQ3Vq1fH1q1bAQAnTpxA48aNoaqqipo1a+LIkSMy7SYmJmLAgAGoXLkylJWVUbt2baxevfqj8dSpUwdOTk4y5Xl5eTA1NUX37t3FsoiICDg6OkJTUxNaWlqws7PD4sWL5bruhIQEREVFwdPTE56ennjw4AFOnz4tUy//Xly6dAktW7aEmpoaJk6cKD6OPm/ePCxatAhWVlZQVlbG7du3ZdZwnTdvHiQSCeLj42XanzBhAipWrIgXL14AAKKiotCjRw9UrVpVHFejR48u8LHtkpJIJPjuu+8gCAL++ecfsTw+Ph7Dhg1DzZo1oaqqCj09PfTo0UNq6YCwsDD06NEDAODk5CQ+vvz++Nm/fz9atGgBdXV1aGpqwtXVFbdu3SoyppCQEJibmwMAxo4dC4lEIvX5vnLlCjp27AgtLS1oaGigbdu2OHv2rFQb+Z+tEydOYNiwYTA0NESVKlVKdI/Mzc0RFhaG7OxszJkzRyz/8PNiYWGB4OBgAICBgYG4lqREIsGaNWvw+vVr8R69v6avPN9HhY09QP7vH4lEAn9/f+zcuRN16tQRP4sHDhyQuvdjx44FAFhaWorxyrNkRO/evbF//36ppQguXLiA+/fvS81AfN8///yDHj16QFdXF2pqamjSpAn27t0rU+/ff/+Fh4cH1NXVYWhoiNGjR8tcX75z586hQ4cO0NbWhpqaGlq1aoVTp059NP6LFy/C2dkZ+vr6UFVVhaWlJQYMGPDR83bt2gVXV1eYmJhAWVkZVlZWmDZtGnJzc6Xq5f8Mb9++DScnJ6ipqcHU1FRqTJXkeuVVqVIlqKqqysz+nDdvHpo1awY9PT2oqqrC0dFR/H7PJ5FI8Pr1a6xdu7bAZWgSExMxcOBA8R5YWlrihx9+QHZ2tlQ7WVlZH/23riCxsbFo2LBhgX+wMDQ0lIk1fw3XD5cK+fD1PnnGzatXrxAQEAALCwsoKyvD0NAQ33//PS5fvvzRayAiIiL6WnGGK1E5sWfPHlhYWEg97v6+li1bwsLCAnv27MGyZcuK3f6MGTMwefJk9OzZE4MGDcKTJ0/wyy+/oGXLlrhy5UqxZ9Tl5eWhc+fOiI6Ohp+fH2xtbXHjxg0sXLgQ9+7dk2vNuxEjRkBHRwfBwcGIi4vDokWL4O/vj02bNol1JkyYgDlz5sDNzQ3Ozs64du0anJ2dv8hjui9evECnTp3g6emJHj16YPny5fD09ER4eDgCAgIwdOhQ9O7dG3PnzkX37t3x8OFDaGpqAgAeP36MJk2aiMkeAwMD7N+/HwMHDkRaWhoCAgIK7bdXr14ICQlBcnIyjIyMxPLo6Gg8evRIfLz78OHD8PLyQtu2bTF79mwAQExMDE6dOoVRo0Z99Po2btwIdXV1dOrUCaqqqrCyskJ4eDiaNWsmU/fZs2fo2LEjPD090adPH6kZoWvWrMGbN2/g5+cHZWVl6OrqIi8vT+r8nj174qeffsLmzZvFxFa+zZs3o3379tDR0QHw7jHajIwM/PDDD9DT08P58+fxyy+/4N9//8WWLVs+el3yyk+m5fcLvEuUnT59Gp6enqhSpQri4uKwfPlytG7dGrdv34aamhpatmyJkSNHYsmSJZg4cSJsbW0BQPzfdevWwcfHB87Ozpg9ezYyMjKwfPlyfPfdd7hy5UqhSxB07doVlSpVwujRo8VH/DU0NAC8W9u5RYsW0NLSwk8//YQKFSrgt99+Q+vWrcXk//uGDRsGAwMDTJkyBa9fvy7xPWratCmsrKxw+PDhQussWrQIf/75J3bs2IHly5dDQ0MDdevWRfXq1fH777/j/Pnz4nIO+WOrON9HBY294n7/REdHY/v27Rg2bBg0NTWxZMkSdOvWDQkJCdDT00PXrl1x7949bNy4EQsXLoS+vj4AyPzxqyBdu3bF0KFDsX37djFRuWHDBtjY2MDBwUGm/uPHj9GsWTNkZGRg5MiR0NPTw9q1a9G5c2ds3boVXbp0AfDuEfG2bdsiISEBI0eOhImJCdatW4ejR4/KtHn06FF07NgRjo6OCA4OhoKCAtasWYM2bdogKioKjRo1KjD2lJQUtG/fHgYGBhg/fjwqVaqEuLg4bN++/aPXHRYWBg0NDYwZMwYaGho4evQopkyZgrS0NKk/DALvvks7dOiArl27omfPnti6dSvGjRsHOzs7dOzYsdjXW5TU1FQ8ffoUgiAgJSUFv/zyC9LT02VmcC9evBidO3eGt7c3srOzERERgR49euCvv/6Cq6srgHef5UGDBqFRo0bw8/MD8G4ZGgB49OgRGjVqhJcvX8LPzw82NjZITEzE1q1bkZGRIZUkleffuoKYm5sjMjIS//77b7H+cGJgYIB169ZJleXk5GD06NFScck7boYOHYqtW7fC398ftWrVwrNnzxAdHY2YmJgCxzgRERHRN0Egov+8ly9fCgAEd3f3Iut17txZACCkpaUJgiAIPj4+grm5uUy94OBg4f2vj7i4OEFRUVGYMWOGVL0bN24ISkpKUuXm5uaCj4+PTJutWrUSWrVqJb5ft26doKCgIERFRUnVW7FihQBAOHXqVKFtrlmzRgAgtGvXTsjLyxPLR48eLSgqKgovX74UBEEQkpOTBSUlJcHDw0Oqj5CQEAFAgXEW5sKFCwIAYc2aNTLHjh07JgAQjh07JnW9AIQNGzaIZXfu3BEACAoKCsLZs2fF8oMHD8q0PXDgQMHY2Fh4+vSpVF+enp6Ctra2kJGRUWisd+/eFQAIv/zyi1T5sGHDBA0NDfHcUaNGCVpaWsLbt2/luQUy7OzsBG9vb/H9xIkTBX19fSEnJ0eqXv69WLFihVT5gwcPBACClpaWkJKSUuCx9+9J06ZNBUdHR6l658+fFwAIf/75p1hW0L35+eefBYlEIsTHx4tlH47zwvj4+Ajq6urCkydPhCdPngh///23MG/ePEEikQh16tSRGoMF9X3mzBmZGLds2SIzZgRBEF69eiVUqlRJGDx4sFR5cnKyoK2tLVP+ofz7NnfuXKlyDw8PoWLFikJsbKxY9ujRI0FTU1No2bKlWJb/2fruu+/kGheF9fc+d3d3AYCQmpoqCELBn5f8n8WTJ0+kzs2/9+8rzvdRYWOvON8/AISKFSsKf//9t1h27do1mc/Y3LlzBQDCgwcPCr0XhV1b9+7dhbZt2wqCIAi5ubmCkZGREBoaWuD9DQgIEABIxf7q1SvB0tJSsLCwEHJzcwVBEIRFixYJAITNmzeL9V6/fi1Ur15d6v7n5eUJ1tbWgrOzs8xYtrS0FL7//nuxLH985F/jjh07BADChQsX5Lrm9xX0WRkyZIigpqYmvHnzRizL/xm+//nJysoSjIyMhG7duoll8l5vYfKv7cOXsrKyEBYW9tH4s7OzhTp16ght2rSRKldXVy/w35p+/foJCgoKBd67/J+DvP/WFWbVqlXi+HVychImT54sREVFiWPkfQCE4ODgQtsaNmyYoKioKBw9elSMUd5xo62tLQwfPrzIWImIiIi+NVxSgKgcePXqFQCIsyMLk388v768tm/fjry8PPTs2RNPnz4VX0ZGRrC2tsaxY8eKHfOWLVtga2sLGxsbqTbbtGkDAHK16efnJ/V4Y4sWLZCbmys+dh4ZGYm3b99i2LBhUueNGDGi2PGWhIaGhtRmQTVr1kSlSpVga2srNaMw/7/zH00XBAHbtm2Dm5sbBEGQuj/Ozs5ITU0t8lHMGjVqoF69elKzn3Jzc7F161a4ublBVVUVwLtHZV+/fl3k7MPCXL9+HTdu3ICXl5dY5uXlhadPn+LgwYMy9ZWVldG/f/8C2+rWrZtcMwF79eqFS5cuITY2VizbtGkTlJWVpTaDy78+4N36o0+fPkWzZs0gCAKuXLki1/V96PXr1zAwMICBgQGqV6+OwMBANG/eHLt27ZIag+/3nZOTg2fPnqF69eqoVKmSXI/PHj58GC9fvhTvZf5LUVERjRs3LtFnLTc3F4cOHYKHhweqVasmlhsbG6N3796Ijo5GWlqa1DmDBw+GoqJisfsqSP4s2+J+7xSmuN9HBY294n7/tGvXTpyZCAB169aFlpaW1HISn6J37944fvw4kpOTcfToUSQnJxe6nMC+ffvQqFEjfPfdd2KZhoYG/Pz8EBcXh9u3b4v1jI2NpZYQUVNTE2da5rt69aq4fMGzZ8/Ee/H69Wu0bdsWJ0+elJlxni9/JvFff/2FnJycYl3z+5+VV69e4enTp2jRogUyMjJw584dqboaGhpSM0wrVqyIRo0aSd1/ea/3Y5YuXYrDhw/j8OHDWL9+PZycnDBo0CCZWbvvx//ixQukpqaiRYsWcn3O8/LysHPnTri5uUmtF5vvw8f2P/ZvXWEGDBiAAwcOoHXr1oiOjsa0adPQokULWFtbF7j8S2H+/PNPLFu2DHPmzBGXqynOuKlUqRLOnTuHR48eyd0nERER0deOSwoQlQPyJlJfvXoFiUQiPu4qr/v370MQBFhbWxd4vCQ7yd+/fx8xMTGFJto+3NCjIFWrVpV6n/9od/5anvm/jL6/6zkA6OrqSj0G/rlUqVJF5hdnbW1tmJmZyZQB/4v7yZMnePnyJX7//Xf8/vvvBbb9sfvTq1cvTJw4EYmJiTA1NcXx48eRkpKCXr16iXWGDRuGzZs3o2PHjjA1NUX79u3Rs2dPdOjQ4aPXtn79eqirq6NatWr4+++/AbzbdMXCwgLh4eHiI7X5TE1NC934SN5N3Hr06IExY8Zg06ZNmDhxIgRBwJYtW8R1SfMlJCRgypQp2L17t3hP86WmpsrV14dUVFSwZ88eAO/WiZwzZ464odP7MjMz8fPPP2PNmjVITEyEIAjF6vv+/fsAICb+PvT+dcrryZMnyMjIQM2aNWWO2draIi8vDw8fPkTt2rXF8tLcWC89PR3Ax/8gJK/ifh8VNPaK+/3z4XcN8O775sPxVVIuLi7Q1NTEpk2bcPXqVTRs2BDVq1cvcA3Y+Ph4mSUggP8tSREfH486deogPj4e1atXl/kO+nAc5I85Hx+fQuNLTU0t8DuzVatW6NatG0JDQ7Fw4UK0bt0aHh4e6N2790c3O7x16xaCgoJw9OhRmYT/h5+Vgr5LdXR0cP36dfG9vNf7MY0aNZJKgnp5eaF+/frw9/dHp06dxLH0119/Yfr06bh69arUOrEFrX/+oSdPniAtLQ116tSRK6aP/VtXFGdnZzg7OyMjIwOXLl3Cpk2bsGLFCnTq1Al37tyRWcv1Q1evXsXQoUPh5eWFMWPGiOXFGTdz5syBj48PzMzM4OjoCBcXF/Tr10/qD0BERERE3xomXInKAW1tbZiYmEj98lmQ69evo0qVKuIvjIX9YvjhpiV5eXmQSCTYv39/gbPe8mewfazN98/Ny8uDnZ0dFixYUGD9D5OSBSlsBt77Sa6yVFh8H4s7f1ZQnz59Cv1ltm7dukX23atXL0yYMAFbtmxBQEAANm/eDG1tbalkqqGhIa5evYqDBw9i//792L9/P9asWYN+/fph7dq1hbYtCAI2btyI169fo1atWjLHU1JSkJ6eLjUuPkxMvq+oY+8zMTFBixYtsHnzZkycOBFnz55FQkKCuP4s8G6cff/993j+/DnGjRsHGxsbqKurIzExEb6+voXO1PsYRUVFtGvXTnzv7OwMGxsbDBkyBLt37xbLR4wYgTVr1iAgIABNmzaFtrY2JBIJPD095eo7v866deuk1t/N9+HGPZ+LvD8Tedy8eROGhoYlShYXpDjfR0DB11Lc75/P/V2jrKyMrl27Yu3atfjnn3/EzYu+hPwxN3fuXNSrV6/AOh/e03wSiQRbt27F2bNnsWfPHhw8eBADBgzA/Pnzcfbs2ULPe/nyJVq1agUtLS1MnToVVlZWUFFRweXLlzFu3DiZz0pZftcrKCjAyckJixcvxv3791G7dm1ERUWhc+fOaNmyJZYtWwZjY2NUqFABa9aswYYNG0o9htK4fjU1NbRo0QItWrSAvr4+QkNDsX///iITpi9evEC3bt1Qo0YNcR3lfMUZNz179kSLFi2wY8cOHDp0CHPnzsXs2bOxfft2cQ1eIiIiom8NE65E5YSbmxt+++03REdHSz1qmi8qKgpxcXFSM1R0dHSkdsbO9+FjilZWVhAEAZaWlqhRo0aRcRTV5vuzWaysrHDt2jW0bdtWrhlBJZG/Y/vff/8tNWPv2bNnpTYz7XMwMDCApqYmcnNzpZJ8xWFpaYlGjRph06ZN8Pf3x/bt2+Hh4SEz66xixYpwc3ODm5sb8vLyMGzYMPz222+YPHmyzMzgfCdOnMC///6LqVOnirPq8r148QJ+fn7YuXOnzCYzpaFXr14YNmwY7t69i02bNkFNTQ1ubm7i8Rs3buDevXtYu3Yt+vXrJ5aXZNmEohgbG2P06NEIDQ3F2bNn0aRJEwDA1q1b4ePjg/nz54t137x5I/OZKGzM5z+2bmhoWOKf/YcMDAygpqaGu3fvyhy7c+cOFBQU5PoDR0mcOXMGsbGxpToWivN9VFQbpf3986nt9O7dG6tXr4aCgoLUUiQfMjc3L/RnmX88/39v3rwJQRCkYvvw3Pwxp6WlVeIx16RJEzRp0gQzZszAhg0b4O3tjYiICAwaNKjA+sePH8ezZ8+wfft2tGzZUix/8OBBifoH5L/eknj79i2A/83W3rZtG1RUVHDw4EGp79Q1a9bInFvQuDAwMICWlhZu3rz5ybGVRP4M3qSkpELr5OXlwdvbGy9fvsSRI0egpqYmdby448bY2BjDhg3DsGHDkJKSAgcHB8yYMYMJVyIiIvpmcQ1XonIiMDAQampqGDJkCJ49eyZ17Pnz5xg6dCi0tLTg7+8vlltZWSE1NVVqZmxSUhJ27NghdX7Xrl2hqKiI0NBQmRk1giBI9WdlZYWzZ88iOztbLPvrr7/w8OFDqfN69uyJxMRErFy5UuZaMjMzP2ln9Hxt27aFkpISli9fLlX+66+/fnLbn5OioiK6deuGbdu2FfgL+ZMnT+Rqp1evXjh79ixWr16Np0+fSi0nAEBmnCgoKIgzZ99/RPZD+csJjB07Ft27d5d6DR48GNbW1ggPD5crxuLq1q0bFBUVsXHjRmzZsgWdOnWCurq6eDx/Jtj741QQBCxevLjUYxkxYgTU1NQwa9Ysqf4//Iz88ssvMrPG82P+MBHr7OwMLS0tzJw5s8A1MeX92b9PUVER7du3x65du6QeUX/8+DE2bNiA7777rtRmn74vPj4evr6+qFixIsaOHVtq7Rbn+6gwn+P7p7CfqbycnJwwbdo0/PrrrwXObs7n4uKC8+fP48yZM2LZ69ev8fvvv8PCwkKcde7i4oJHjx5h69atYr2MjAyZZUocHR1hZWWFefPmiQnF9xU15l68eCHzM8if7VjUd0hBn9Ps7GwsW7as0HM+Rt7rLa6cnBwcOnQIFStWFP/ApKioCIlEIvW5jouLw86dO2XOV1dXlxkTCgoK8PDwwJ49e3Dx4kWZc0pr5m5kZGSB5fv27QNQ9HILoaGhOHjwIDZu3FjgEiPyjpvc3FyZJSIMDQ1hYmJS5BghIiIi+tpxhitROVG9enX8+eef8PLygp2dHQYOHAhLS0vExcVh1apVePHiBSIiIqR+cfL09MS4cePQpUsXjBw5EhkZGVi+fDlq1KghtfGHlZUVpk+fjgkTJiAuLg4eHh7Q1NTEgwcPsGPHDvj5+SEwMBAAMGjQIGzduhUdOnRAz549ERsbi/Xr10ttOAMAffv2xebNmzF06FAcO3YMzZs3R25uLu7cuYPNmzfj4MGDBW4mUhyVK1fGqFGjMH/+fHTu3BkdOnTAtWvXsH//fujr63+2mbWlYdasWTh27BgaN26MwYMHo1atWnj+/DkuX76MI0eO4Pnz5x9to2fPnggMDERgYCB0dXVlZiENGjQIz58/R5s2bVClShXEx8fjl19+Qb169WRmrubLysrCtm3b8P3330NFRaXAOp07d8bixYuRkpLy0fUBi8vQ0BBOTk5YsGABXr16JZNEtrGxgZWVFQIDA5GYmAgtLS1s27bts8xo1tPTQ//+/bFs2TLExMTA1tYWnTp1wrp166CtrY1atWrhzJkzOHLkCPT09KTOrVevHhQVFTF79mykpqZCWVkZbdq0gaGhIZYvX46+ffvCwcEBnp6eMDAwQEJCAvbu3YvmzZuX6A8G06dPx+HDh/Hdd99h2LBhUFJSwm+//YasrCzMmTPnk+/F5cuXsX79euTl5eHly5e4cOECtm3bBolEgnXr1n10CYziKM73UWE+x/ePo6MjAGDSpEnw9PREhQoV4ObmJvUHgaIoKCggKCjoo/XGjx+PjRs3omPHjhg5ciR0dXWxdu1aPHjwANu2bYOCwru/tQ8ePBi//vor+vXrh0uXLsHY2Bjr1q2TmamooKCAP/74Ax07dkTt2rXRv39/mJqaIjExEceOHYOWlpa4fvGH1q5di2XLlqFLly6wsrLCq1evsHLlSmhpacHFxaXQa2jWrBl0dHTg4+ODkSNHiuPkUxKN8l7vx+zfv1+cLZySkoINGzbg/v37GD9+vPiHCVdXVyxYsAAdOnRA7969kZKSgqVLl6J69eoyS/s4OjriyJEjWLBgAUxMTGBpaYnGjRtj5syZOHToEFq1agU/Pz/Y2toiKSkJW7ZsQXR0tLgh2adwd3eHpaUl3NzcYGVlhdevX+PIkSPYs2cPGjZsKPV0wPtu3LiBadOmoWXLlkhJScH69euljvfp00fucfPq1StUqVIF3bt3h729PTQ0NHDkyBFcuHBB6kkAIiIiom+OQETlyo0bN4TevXsLRkZGgoKCggBAUFFREW7dulVg/UOHDgl16tQRKlasKNSsWVNYv369EBwcLBT09bFt2zbhu+++E9TV1QV1dXXBxsZGGD58uHD37l2pevPnzxdMTU0FZWVloXnz5sLFixeFVq1aCa1atZKql52dLcyePVuoXbu2oKysLOjo6AiOjo5CaGiokJqaKtYzNzcXfHx8xPdr1qwRAAgXLlyQau/YsWMCAOHYsWNi2du3b4XJkycLRkZGgqqqqtCmTRshJiZG0NPTE4YOHSrnXRWECxcuCACENWvWyBwrqN9WrVoJtWvXlqlrbm4uuLq6ypQDEIYPHy5V9vjxY2H48OGCmZmZUKFCBcHIyEho27at8Pvvv8sdd/PmzQUAwqBBg2SObd26VWjfvr1gaGgoVKxYUahataowZMgQISkpqdD2tm3bJgAQVq1aVWid48ePCwCExYsXC4JQ+L148OCBAECYO3duoccKut8rV64UAAiamppCZmamzPHbt28L7dq1EzQ0NAR9fX1h8ODBwrVr12TaK2ycf8jHx0dQV1cv8FhsbKygqKgojs8XL14I/fv3F/T19QUNDQ3B2dlZuHPnjswYzr+OatWqCYqKijLj59ixY4Kzs7Ogra0tqKioCFZWVoKvr69w8eLFImMt6p5evnxZcHZ2FjQ0NAQ1NTXByclJOH36tFSdwj5bH+sv/6WkpCTo6uoKjRs3FiZMmCDEx8fLnFPQ5yX/Z/HkyROpukXde3m+jwobe4Ig//dPQZ9NQZD9XhIEQZg2bZpgamoqfvc+ePCgwL4/dm35Cvt5xsbGCt27dxcqVaokqKioCI0aNRL++usvmfPj4+OFzp07C2pqaoK+vr4watQo4cCBAzL3XxAE4cqVK0LXrl0FPT09QVlZWTA3Nxd69uwpREZGinXyx0f+dV2+fFnw8vISqlatKigrKwuGhoZCp06dPjpOBUEQTp06JTRp0kRQVVUVTExMhJ9++kk4ePCg3N+lPj4+grm5eYmv90P51/b+S0VFRahXr56wfPlyIS8vT6r+qlWrBGtra0FZWVmwsbER1qxZU+B3yp07d4SWLVsKqqqqAgCpMRMfHy/069dPMDAwEJSVlYVq1aoJw4cPF7KysqRikuffuoJs3LhR8PT0FKysrARVVVVBRUVFqFWrljBp0iQhLS1Nqi4AITg4WKr9wl7v+9i4ycrKEsaOHSvY29sLmpqagrq6umBvby8sW7asyNiJiIiIvnYSQfhKdo8hojLx559/wtfXF3369MGff/5Z1uF8FV6+fAkdHR1Mnz4dkyZNKutwiIiIiIiIiOgbwiUFiMq5fv36ISkpCePHj0eVKlUwc+bMsg7pi8rMzJTZpXzRokUAgNatW3/5gIiIiIiIiIjom8YZrkRUroWFhSEsLAwuLi7Q0NBAdHQ0Nm7ciPbt2+PgwYNlHR4RERERERERfWM4w5WIyrW6detCSUkJc+bMQVpamriR1vTp08s6NCIiIiIiIiL6BnGGKxEREREREREREVEpUSjrAIiIiIiIiIiIiIj+K5hwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImI6IsICwuDRCLBxYsXC60TFxcHiUSCefPmFdmWhYUFJBIJ2rVrV+DxlStXQiKRfLS/ooSEhEAikeDp06eF1jl+/DgkEgm2bt0qd7s9e/aERCLBuHHjimxTIpFg/fr1BdZp3rw5JBIJ6tSpU+Dx3NxcmJiYQCKRYP/+/XLHBgCjR4+Gg4MDdHV1oaamBltbW4SEhCA9PV2u85cvX44ePXqgatWqkEgk8PX1LVb/+T/bgl7W1tYy9VetWgVbW1uoqKjA2toav/zyi0wdX19fqXaUlJRgZmYGT09P3L59W6645Bm/n+L27dsICQlBXFzcZ2n/W4mBiIiIiOi/QKmsAyAiIioJFRUVHDt2DMnJyTAyMpI6Fh4eDhUVFbx586aMoitYWloa9uzZAwsLC2zcuBGzZs2CRCIpsK6Kigo2bNiAPn36SJXHxcXh9OnTUFFRKbSfo0ePIikpCRYWFggPD0fHjh3ljvHChQto0aIF+vfvDxUVFVy5cgWzZs3CkSNHcPLkSSgoFP232tmzZ+PVq1do1KgRkpKS5O4336JFi2SSu/Hx8QgKCkL79u2lyn/77TcMHToU3bp1w5gxYxAVFYWRI0ciIyNDJqGtrKyMP/74AwDw9u1bxMbGYsWKFThw4ABu374NExOTYsdamm7fvo3Q0FC0bt0aFhYW5TYGIiIiIqL/AiZciYjom9S8eXNcuHABmzZtwqhRo8Tyf//9F1FRUejSpQu2bdtWhhHK2rZtG3Jzc7F69Wq0adMGJ0+eRKtWrQqs6+Ligt27d+Pp06fQ19cXyzds2IDKlSvD2toaL168KPDc9evXw8HBAT4+Ppg4cSJev34NdXV1uWKMjo6WKbOyskJgYCDOnz+PJk2aFHn+iRMnxNmtGhoacvX5Pg8PD5my6dOnAwC8vb3FsszMTEyaNAmurq7iDOPBgwcjLy8P06ZNg5+fH3R0dMT6SkpKMsnrJk2aoFOnTti7dy8GDx5c7FjLiiAIePPmDVRVVcs6FCIiIiIiKgCXFCAiom+SiooKunbtig0bNkiVb9y4ETo6OnB2dpY5JycnB3fu3CnRzMvSEB4eju+//x5OTk6wtbVFeHh4oXXd3d2hrKyMLVu2SJVv2LABPXv2hKKiYoHnZWZmYseOHfD09ETPnj2RmZmJXbt2fVLc+bMdX758+dG65ubmhc7aLakNGzbA0tISzZo1E8uOHTuGZ8+eYdiwYVJ1hw8fjtevX2Pv3r0fbTd/ZrSSUsn+/uzr6wsNDQ0kJibCw8MDGhoaMDAwQGBgIHJzc6XqRkREwNHREZqamtDS0oKdnR0WL14M4N1yBT169AAAODk5iUsfHD9+HMC7+9+pUyccPHgQDRo0gKqqKn777TdxCY6wsDCZ2CQSCUJCQqTKEhMTMXDgQJiYmEBZWRmWlpb44YcfkJ2d/dEYiIiIiIhIfky4EhHRN6t37944f/48YmNjxbINGzage/fuqFChgkz9xMRE2NraYsKECV8yTADAo0ePcOzYMXh5eQEAvLy8sHXrVmRnZxdYX01NDe7u7ti4caNYdu3aNdy6dQu9e/cutJ/du3cjPT0dnp6eMDIyQuvWrYtM7Bbk7du3ePr0KR49eoRDhw4hKCgImpqaaNSoUbHaKQ1XrlxBTEyMzDVfuXIFANCgQQOpckdHRygoKIjH3/f06VM8ffoUjx8/xpkzZzB69Gjo6emhU6dOJY4vNzcXzs7O0NPTw7x589CqVSvMnz8fv//+u1jn8OHD8PLygo6ODmbPno1Zs2ahdevWOHXqFACgZcuWGDlyJABg4sSJWLduHdatWwdbW1uxjbt378LLywvff/89Fi9ejHr16hUrzkePHqFRo0aIiIhAr169sGTJEvTt2xcnTpxARkaGXDEQEREREZF8uKQAERF9s9q0aQMjIyNs3LgRQUFBiImJwdWrV7F48WL8888/ZR2elI0bN0JZWRnu7u4AAE9PT0yZMgX79u0r8DF64F1C2c3NDQ8fPoSZmRnCw8NRrVq1Ih/rX79+PZo1awYzMzOxn2HDhuHJkycwMDCQK9aLFy+iadOm4vuaNWti9+7d0NXVlfNqS09+svj95QQAICkpCYqKijA0NJQqr1ixIvT09PDo0SOp8tevX8tcv6mpKQ4dOiT3fSnImzdv0KtXL0yePBkAMHToUDg4OGDVqlX44YcfAAB79+6FlpYWDh48WODM5GrVqqFFixZYsmQJvv/+e7Ru3Vqmzt9//40DBw5IzdwuzuZWEyZMQHJyMs6dOyeVpJ46dSoEQUClSpU+GgMREREREcmHM1yJiOibpaioiJ49e4qzQMPDw2FmZoYWLVoUWN/CwgKCIBT4CPbnFh4eDldXV2hqagIArK2t4ejoWOTs0/bt20NXVxcREREQBAERERHiDNmCPHv2DAcPHpSq061bN0gkEmzevFnuWGvVqoXDhw9j586d+Omnn6Curi6zkdWXkJeXh4iICNSvX19mpmVmZiYqVqxY4HkqKirIzMyUKTt8+DAOHz6MgwcP4rfffoOGhgZcXFxw7969T4pz6NChUu9btGghlfCvVKkSXr9+jcOHD5e4D0tLywKXyZBHXl4edu7cCTc3N5kZwQBKfQkIIiIiIqLyjjNciYjom9a7d28sWbIE165dw4YNG+Dp6fnVJZBiYmJw5coV9OvXD3///bdY3rp1ayxduhRpaWnQ0tKSOa9ChQro0aMHNmzYgEaNGuHhw4dFLiewadMm5OTkoH79+lL9NG7cGOHh4Rg+fDgA4Pnz51JLGaiqqkJbW1t8r6WlhXbt2gF4t5bshg0b4O7ujsuXL8Pe3r7kN+L/ZWZmIjU1Vaosfz3V9504cQKJiYkYPXq0zDFVVdVCl2MoaEMpRUVF8Zryubi4wNraGhMmTBA3NHvy5IlUHV1d3UITu8C7RO6HM2R1dHSkNjQbNmwYNm/ejI4dO8LU1BTt27dHz5490aFDh0Lb/ZClpaXcdT/05MkTpKWloU6dOiVug4iIiIiI5McZrkRE9E1r3LgxrKysEBAQgAcPHhSZkCwr69evBwCMHj0a1tbW4mv+/Pl48+YNtm3bVui5vXv3xtWrVxESEgJ7e3vUqlWr0Lr5s2WbN28u1U90dDTOnDkjzrrs2rUrjI2NxdeoUaOKjL9r164A3m38VBo2bdok1b+xsXGh16OgoFDgrF5jY2Pk5uYiJSVFqjw7OxvPnj2DiYnJR+OoUqUKatasiZMnTwIAHj58KBPX6dOni2yjsM3L3mdoaIirV69i9+7d6Ny5M44dO4aOHTvCx8fno+fm+zCBDBQ+M/XDDbuIiIiIiOjL4gxXIiL65nl5eWH69OmwtbUt9mZCn5sgCNiwYQOcnJwwbNgwmePTpk1DeHg4+vfvX+D53333HapWrYrjx49j9uzZhfbz4MEDnD59Gv7+/mjVqpXUsby8PPTt2xcbNmxAUFAQ5s+fLzUD82PJyaysLOTl5cnMSi0pZ2fnjz5en5WVhW3btqF169YFxpf/c7548SJcXFzE8osXLyIvL0/ucfD27VtxuQQjIyOZuEpjRi/wbm1ZNzc3uLm5IS8vD8OGDcNvv/2GyZMno3r16iWala2jowMAePnypVR5fHy81HsDAwNoaWnh5s2bRbb3tc0MJyIiIiL6VjHhSkRE37xBgwZBUVERjRs3LrJeTk4OYmNjoa2tXeisytJ26tQpxMXFYerUqejevbvM8Xv37mHy5Ml49OhRgYlFiUSCJUuW4MqVK+jbt2+h/eTPbv3pp5/EDbPe98cffyA8PBxBQUFwdHQssI2XL19CXV0dFSpUkDkXgNT6nxkZGUhISIC+vj709fULjasgRc1qzbdv3z68fPlSZrOsfG3atIGuri6WL18ulXBdvnw51NTU4Orq+tE47t27h7t374r3Q0VFRWbZgdLw7Nkz6Onpie8VFBRQt25dAO8SywCgrq4OQDZ5WhQtLS3o6+vj5MmTCAgIEMuXLVsmVU9BQQEeHh5Yv349Ll68KLOOqyAIkEgkJYqBiIiIiIhkMeFKRERf1OrVq3HgwAGZ8vcfa4+MjMSbN29k6nh4eBS4DqW5uTlCQkI+2ndiYiJsbW3h4+Mj98ZZCxYsgJqamlSZgoICJk6cKL7ftm0b7ty5I3Ouj48PwsPDoaioWGgCsHPnzpg0aRIiIiIwZsyYAuu4u7vD3d29yDjDw8NRr169ApOt+f2MGDECly9fhoODQ4F1jh8/jpEjR6J79+6wtrZGdnY2oqKisH37djRo0AB9+vQR654/fx5OTk4IDg6Wuvd79uzBtWvXALxLcF+/fh3Tp08XY8hPNH5MeHg4lJWV0a1btwKPq6qqYtq0aRg+fDh69OgBZ2dnREVFYf369ZgxYwZ0dXWl6r99+1Zc2iEvLw9xcXFYsWIF8vLyEBwcLFdMJTVo0CA8f/4cbdq0QZUqVRAfH49ffvkF9erVEzcDq1evHhQVFTF79mykpqZCWVkZbdq0gaGh4UfbnjVrFgYNGoQGDRrg5MmTBW4CNnPmTBw6dAitWrWCn58fbG1tkZSUhC1btiA6OhqVKlUqcQxERERERCSNCVciIvqili9fXmC5r6+v+N8HDhwoMClrYWHxxTf++fnnn2XKFBUVpRKuha1t2qpVK2zZsgXNmjWTSQDmq1OnDiwtLbF+/fpCE64fc/nyZdy5cweTJ08utI6bmxtGjBiB9evXF5pwtbOzg5OTE3bt2oWkpCQIggArKytMmTIFY8eOLXLzqHzbtm3D2rVrxfdXrlzBlStXALxbM1WehGtaWhr27t0LV1dXqc28PjRs2DBUqFAB8+fPx+7du2FmZoaFCxcWuCZtVlaW1AxhLS0tNGzYEOvWrUPbtm0/GtOn6NOnD37//XcsW7YML1++hJGREXr16oWQkBAoKLxbTt/IyAgrVqzAzz//jIEDByI3NxfHjh37aLJzypQpePLkCbZu3SpuzLV//36Z80xNTXHu3DlMnjwZ4eHhSEtLg6mpKTp27Cj+QaGkMRARERERkTSJIAhCWQdBRERERERERERE9F+gUNYBEBEREREREREREf1XMOFKREREREREREREVEqYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIiIiISolSWQfwNcjLy8OjR4+gqakJiURS1uEQERERERHRf5wgCHj16hVMTEygoMC5UERE/yVMuAJ49OgRzMzMyjoMIiIiIiIiKmcePnyIKlWqlHUYRERUiphwBaCpqQng3T90WlpaZRxN+ZCTk4NDhw6hffv2qFChQlmHQ/RZcJxTecBxTuUBxzmVBxznX15aWhrMzMzE30eJiOi/gwlXQFxGQEtLiwnXLyQnJwdqamrQ0tLi/6Gj/yyOcyoPOM6pPOA4p/KA47zscFk7IqL/Hi4UQ0RERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERERERGVEq7hSkRERERERESlKjc3Fzk5OWUdBhFRqalYsSIUFOSbu8qEKxERERERERGVCkEQkJycjJcvX5Z1KEREpUpBQQGWlpaoWLHiR+sy4UpEREREREREpSI/2WpoaAg1NTVIJJKyDomI6JPl5eXh0aNHSEpKQtWqVT/63caEKxERERERERF9stzcXDHZqqenV9bhEBGVKgMDAzx69Ahv375FhQoViqzLTbOIiIiIiIiI6JPlr9mqpqZWxpEQEZW+/KUEcnNzP1qXCVciIiIiIiIiKjVcRoCI/ouK893GhCsRERERERERERFRKWHClYiIiIiIiIiIiKiUMOFKRERERERERFQOHT9+HBKJBC9fvpT7HAsLCyxatOizxVSYksT6Jfn6+sLDw6PU223dujUCAgJKvV36vJhwJSIiIiIiIiL6yvj6+kIikWDo0KEyx4YPHw6JRAJfX98vH9hXzMLCAhKJBBKJBIqKijAxMcHAgQPx4sWLYrXzpZKcubm5mDVrFmxsbKCqqgpdXV00btwYf/zxh1hn+/btmDZt2mePhUoXE65ERERERERERF8hMzMzREREIDMzUyx78+YNNmzYgKpVq5ZhZF+vqVOnIikpCQkJCQgPD8fJkycxcuTIsg6rQKGhoVi4cCGmTZuG27dv49ixY/Dz85OaxaurqwtNTc2yC5JKhAlXIiIiIiIiIqKvkIODA8zMzLB9+3axbPv27ahatSrq168vVTcrKwsjR46EoaEhVFRU8N133+HChQtSdfbt24caNWpAVVUVTk5OiIuLk+kzOjoaLVq0gKqqKszMzDBy5Ei8fv1a7pgvXLiA77//Hvr6+tDW1karVq1w+fJlqToSiQR//PEHunTpAjU1NVhbW2P37t3FjrUgmpqaMDIygqmpKZycnODj4yPV/7Nnz+Dl5QVTU1OoqanBzs4OGzduFI/7+vrixIkTWLx4sThbNr/vW7duoVOnTtDS0oKmpiZatGiB2NhYqf7nzZsHY2Nj6OnpYfjw4cjJySk01t27d2PYsGHo0aMHLC0tYW9vj4EDByIwMFCs8/5s2/xlFT58vT/TedeuXXBwcICKigqqVauG0NBQvH37Vq57R6WHCVciIiIiIiIioq/UgAEDsGbNGvH96tWr0b9/f5l6P/30E7Zt24a1a9fi8uXLqF69OpydnfH8+XMAwMOHD9G1a1e4ubnh6tWrGDRoEMaPHy/VRmxsLDp06IBu3brh+vXr2LRpE6Kjo+Hv7y93vK9evYKPjw+io6Nx9uxZWFtbw8XFBa9evZKqFxoaip49e+L69etwcXGBt7d3sWKVR2JiIvbs2YPGjRuLZW/evIGjoyP27t2Lmzdvws/PD3379sX58+cBAIsXL0bTpk0xePBgJCUlISkpCWZmZkhMTETLli2hrKyMo0eP4tKlSxgwYIBUMvPYsWOIjY3FsWPHsHbtWoSFhSEsLKzQ+IyMjHD06FE8efJErutp1qyZGFNSUhKOHj0KFRUVtGzZEgAQFRWFfv36YdSoUbh9+zZ+++03hIWFYcaMGcW+d/SJBBJSU1MFAEJqampZh1JuZGdnCzt37hSys7PLOhSiz4bjnMoDjnMqDzjOqTzgOP/y/ou/h2ZmZgq3b98WMjMzyzqU/wQfHx/B3d1dSElJEZSVlYW4uDghLi5OUFFREZ48eSK4u7sLPj4+giAIQnp6ulChQgUhPDxcPD87O1swMTER5syZIwiCIEyYMEGoVauWVB/jxo0TAAgvXrwQBEEQBg4cKPj5+UnViYqKEhQUFMSfq7m5ubBw4UK5ryM3N1fQ1NQU9uzZI5YBEIKCgsT36enpAgBh//79csdaEHNzc6FixYqCurq6oKKiIgAQGjduXOQ5giAIrq6uwo8//ii+b9WqlTBq1CipOhMmTBAsLS0L/Z708fERzM3Nhbdv34plPXr0EHr16lVov7du3RJsbW0FBQUFwc7OThgyZIiwb98+qToFxSIIgvD06VOhWrVqwrBhw8Sytm3bCjNnzpSqt27dOsHY2LjQGEh+xfmO4wxXIiIiIiIiIqKvlIGBAVxdXREWFoY1a9bA1dUV+vr6UnViY2ORk5OD5s2bi2UVKlRAo0aNEBMTAwCIiYmRmukJAE2bNpV6f+3aNYSFhUFDQ0N8OTs7Iy8vDw8ePJAr3sePH2Pw4MGwtraGtrY2tLS0kJ6ejoSEBKl6devWFf9bXV0dWlpaSElJkTvWwowdOxZXr17F9evXERkZCQBwdXVFbm4ugHcbVU2bNg12dnbQ1dWFhoYGDh48KBPfh65evYoWLVqgQoUKhdapXbs2FBUVxffGxsbiNRWkVq1auHnzJs6ePYsBAwYgJSUFbm5uGDRoUJGx5OTkoFu3bjA3N8fixYvF8mvXrmHq1KlSP7/8mboZGRlFtkmlS6msAyAiIiIiIiIiosINGDBAfKx/6dKln62f9PR0DBkypMBNpuTdpMvHxwfPnj3D4sWLYW5uDmVlZTRt2hTZ2dlS9T5MXEokEuTl5ZU8+P+nr6+P6tWrAwCsra2xaNEiNG3aFMeOHUO7du0wd+5cLF68GIsWLYKdnR3U1dUREBAgE9+HVFVVP9p3Sa5JQUEBDRs2RMOGDREQEID169ejb9++mDRpEiwtLQs854cffsDDhw9x/vx5KCn9L7WXnp6O0NBQdO3aVeYcFRWVj8ZPpYcJVyIiIiIiIiKir1iHDh2QnZ0NiUQCZ2dnmeNWVlaoWLEiTp06BXNzcwDvZkFeuHBB3HDJ1tZWZmOqs2fPSr13cHDA7du3xYRlSZw6dQrLli2Di4sLgHfrsT59+rRYbcgTq7zyZ5xmZmaK8bm7u6NPnz4AgLy8PNy7dw+1atUSz6lYsaI4IzZf3bp1sXbtWuTk5BQ5y/VT5cdR2EZlCxYswObNm3H69Gno6elJHXNwcMDdu3c/6edHpYNLChARERERERERfcUUFRURExOD27dvSz2ynk9dXR0//PADxo4diwMHDuD27dsYPHgwMjIyMHDgQADA0KFDcf/+fYwdOxZ3797Fhg0bZDZ0GjduHE6fPg1/f39cvXoV9+/fx65du4q1aZa1tTXWrVuHmJgYnDt3Dt7e3nLNDn2fPLEW5tWrV0hOTkZSUhLOnz+PsWPHwsDAAM2aNRPjO3z4ME6fPo2YmBgMGTIEjx8/lmrDwsIC586dQ1xcHJ4+fYq8vDz4+/sjLS0Nnp6euHjxIu7fv49169bh7t27xbq293Xv3h0LFy7EuXPnEB8fj+PHj2P48OGoUaMGbGxsZOofOXIEP/30E+bOnQt9fX0kJycjOTkZqampAIApU6bgzz//RGhoKG7duoWYmBhEREQgKCioxDFSyTDhSkRERERERET0ldPS0oKWllahx2fNmoVu3bqhb9++cHBwwN9//42DBw9CR0cHwLslAbZt24adO3fC3t4eK1aswMyZM6XaqFu3Lk6cOIF79+6hRYsWqF+/PqZMmQITExO541y1ahVevHgBBwcH9O3bFyNHjoShoWGxrlWeWAszZcoUGBsbw8TEBJ06dYK6ujoOHTokzgYNCgqCg4MDnJ2d0bp1axgZGcHDw0OqjcDAQCgqKqJWrVowMDBAQkIC9PT0cPToUaSnp6NVq1ZwdHTEypUrP2m2q7OzM/bs2QM3NzfUqFEDPj4+sLGxwaFDh6SWCsgXHR2N3NxcDB06FMbGxuJr1KhRYnt//fUXDh06hIYNG6JJkyZYuHChOOuZvhyJIAhCWQdR1tLS0qCtrY3U1NQiv7yo9OTk5GDfvn1wcXH5rFPxicoSxzmVBxznVB5wnFN5wHH+5f0Xfw998+YNHjx4AEtLS64XSUT/OcX5juMMVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolDDhSkRERERERERERFRKZLc8IyIiIiIiohJZ/GJxseqP0hn1mSIhIiKissIZrkRERERERERERESlhAlXIiIiIiIiIiIiolLChCsRERERERERERFRKWHClYiIiIiIiIiIiKiUcNMsIiIiIiIiIvqsiruh3Kco6WZ0z549g62tLc6fPw8LC4vSDeob1LJlSwwdOhS9e/cGAEgkEuzYsQMeHh5lFpOFhQUCAgIQEBDw2WJq0qQJxo4di27dupVamwCQnZ2NGjVqYOvWrWjQoEGptk1fH85wJSIiIiIiIqJyb8aMGXB3dxeTrXFxcZBIJLh69eoXj8XX1xcSiQQSiQQVKlRA5cqV8f3332P16tXIy8v77P3v3r0bjx8/hqen52fv61MkJSWhY8eOpdpmUFAQxo8f/0n3edasWZBIJGJiGAAqVqyIwMBAjBs3rhSipK8dE65EREREREREVK5lZGRg1apVGDhw4BftNzs7u9BjHTp0QFJSEuLi4rB//344OTlh1KhR6NSpE96+fftZ41qyZAn69+8PBYWvO21kZGQEZWXlUm2zY8eOePXqFfbv31+i8y9cuIDffvsNdevWlTnm7e2N6Oho3Lp161PDpK/c1/3JISIiIiIiIiL6zPbt2wdlZWU0adKk0DrHjx+HRCJBZGQkGjRoADU1NTRr1gx3796Vqrdnzx40bNgQKioq0NfXR5cuXcRjFhYWmDZtGvr16wctLS34+fkV2p+ysjKMjIxgamoKBwcHTJw4Ebt27cL+/fsRFhYm1nv58iUGDRoEAwMDaGlpoU2bNrh27ZrcMX3oyZMnOHr0KNzc3GSO5c8oVVVVRbVq1bB161ap4+PGjUONGjWgpqaGatWqYfLkycjJyRGPX7t2DU5OTtDU1ISWlhYcHR1x8eJF8Xh0dDRatGgBVVVVmJmZYeTIkXj9+nWhsUokEuzcuRPA/2Ykb9++HU5OTlBTU4O9vT3OnDkjdc7H+lBUVISLiwsiIiIK7bcw6enp8Pb2xsqVK6GjoyNzXEdHB82bNy9R2/RtYcKViIiIiIiIiMq1qKgoODo6ylV30qRJmD9/Pi5evAglJSUMGDBAPLZ371506dIFLi4uuHLlCiIjI9GoUSOp8+fNmwd7e3tcuXIFkydPLlacbdq0gb29PbZv3y6W9ejRAykpKdi/fz8uXboEBwcHtG3bFs+fP5c7pvdFR0dDTU0Ntra2MscmT56Mbt264dq1a/D29oanpydiYmLE45qamggLC8Pt27exePFirFy5EgsXLhSPe3t7o0qVKrhw4QIuXbqE8ePHo0KFCgCA2NhYdOjQAd26dcP169exadMmREdHw9/fv1j3aNKkSQgMDMTVq1dRo0YNeHl5iTOC5e2jUaNGiIqKKla/ADB8+HC4urqiXbt2hdYpadv0beGmWURERET0RRR3w5SSbnpCRERUXPHx8TAxMZGr7owZM9CqVSsAwPjx4+Hq6oo3b95ARUUFM2bMgKenJ0JDQ8X69vb2Uue3adMGP/74Y4ljtbGxwfXr1wG8S46eP38eKSkp4qP18+bNw86dO7F161b4+fnJFdP74uPjUbly5QKXE+jRowcGDRoEAJg2bRoOHz6MX375BcuWLQPwbv3TfBYWFggMDERERAR++uknAEBCQgLGjh0LGxsbAIC1tbVY/+eff4a3t7e47qm1tTWWLFmCVq1aYfny5VBRUZHr/gQGBsLV1RUAEBoaitq1a+Pvv/+GjY2N3H2YmJjg4cOHyMvLk3tZhYiICFy+fBkXLlwosp6JiQni4+PlapO+XZzhSkRERERERETlWmZmptwJvffX5jQ2NgYApKSkAACuXr2Ktm3bFnn+p+5QLwgCJBIJgHeP6Kenp0NPTw8aGhri68GDB4iNjZU7pvcVdS+aNm0q8/79Ga6bNm1C8+bNYWRkBA0NDQQFBSEhIUE8PmbMGAwaNAjt2rXDrFmzxBjzryUsLEzqOpydnZGXl4cHDx7IHX9RPx95+1BVVUVeXh6ysrLk6vPhw4cYNWoUwsPDPzqOVFVVkZGRIff10LeJM1yJiIiIiIiIqFzT19fHixcv5Kqb/wg8ADHxmb+jvaqq6kfPV1dXL0GE/xMTEwNLS0sA79YMNTY2xvHjx2XqVapUSe6Y3lece/G+M2fOwNvbG6GhoXB2doa2tjYiIiIwf/58sU5ISAh69+6NvXv3Yv/+/QgODkZERAS6dOmC9PR0DBkyBCNHjpRpu2rVqnLHUdTPR94+nj9/DnV1dbnv3aVLl5CSkgIHBwexLDc3FydPnsSvv/6KrKwsKCoqim0bGBjIfT30bWLClYiIiIiIiIjKtfr162P9+vWf3E7dunURGRmJ/v37l0JUso4ePYobN25g9OjRAAAHBwckJydDSUkJFhYWpRJT/fr1kZycjBcvXshs/HT27Fn069dP6n39+vUBAKdPn4a5uTkmTZokHi/o0fkaNWqgRo0aGD16NLy8vLBmzRp06dIFDg4OuH37NqpXry5XnCUhbx83b94Ur0sebdu2xY0bN6TK+vfvDxsbG4wbN05Mtpakbfo2cUkBIiIiIiIiIirXnJ2dcevWrRLN7HxfcHAwNm7ciODgYMTExODGjRuYPXt2idrKyspCcnIyEhMTcfnyZcycORPu7u7o1KmTmPRs164dmjZtCg8PDxw6dAhxcXE4ffo0Jk2ahIsXL5Yopvr160NfXx+nTp2SObZlyxasXr0a9+7dQ3BwMM6fPy9uOGVtbY2EhAREREQgNjYWS5YswY4dO8RzMzMz4e/vj+PHjyM+Ph6nTp3ChQsXxM25xo0bh9OnT8Pf3x9Xr17F/fv3sWvXrmJvmlUUefuIiopC+/bt5W5XU1MTderUkXqpq6tDT08PderU+aS26dvEGa5ERERERERE9Fl97Rsh2tnZwcHBAZs3b8aQIUNK3E7r1q2xZcsWTJs2DbNmzYKWlhZatmxZorYOHDgAY2NjKCkpQUdHB/b29liyZAl8fHzEjZwkEgn27duHSZMmoX///njy5AmMjIzQsmVLVK5cuUQxKSoqon///ggPD0enTp2kjoWGhiIiIgLDhg2DsbExNm7ciFq1agEAOnfujNGjR8Pf3x9ZWVlwdXXF5MmTERISIrb77Nkz9OvXD48fP4a+vj66du0qbuZVt25dnDhxApMmTUKLFi0gCAKsrKzQq1evEt2/gsjTR2JiIk6fPi014zkuLg6WlpY4duwYWrduXeL+z5w5g9TUVHTv3v1TLoO+ARJBEISyDqKspaWlQVtbG6mpqdDS0irrcMqFnJwc7Nu3Dy4uLlLrqxD9l3CcU3nAcU7FsfjF4mLV/1p+Oec4p+LgOCd5/Rd/D33z5g0ePHgAS0tLuTeg+prs3bsXY8eOxc2bN+Xemf6/Kjk5GbVr18bly5dhbm5e1uF8UePGjcOLFy/w+++/i2XHjh1D165d8c8//8gss1AcvXr1gr29PSZOnFgaodIXVpzvOM5wJSIiIiIiIqJyz9XVFffv30diYiLMzMzKOpwyZWRkhFWrViEhIaHcJVwNDQ0xZswYqbJ9+/Zh4sSJn5Rszc7Ohp2dnbj+Lv23MeFKRERERERERAQgICCgrEP4anh4eJR1CGXixx9/lCmbO3fuJ7dbsWJFBAUFfXI79G0o33PkiYiIiIiIiIiIiEoRE65EREREREREREREpeSrWVJg1qxZmDBhAkaNGoVFixYBeLcY7Y8//oiIiAhkZWXB2dkZy5YtE3faA4CEhAT88MMPOHbsGDQ0NODj44Off/4ZSkpfzaURERF91Le6yQoRERERERFJ+ypmuF64cAG//fYb6tatK1U+evRo7NmzB1u2bMGJEyfw6NEjdO3aVTyem5sLV1dXZGdn4/Tp01i7di3CwsIwZcqUL30JRERERERERERERGWfcE1PT4e3tzdWrlwptdtbamoqVq1ahQULFqBNmzZwdHTEmjVrcPr0aZw9exYAcOjQIdy+fRvr169HvXr10LFjR0ybNg1Lly5FdnZ2WV0SERERERERERERlVNl/tz98OHD4erqinbt2mH69Oli+aVLl5CTk4N27dqJZTY2NqhatSrOnDmDJk2a4MyZM7Czs5NaYsDZ2Rk//PADbt26hfr16xfYZ1ZWFrKyssT3aWlpAICcnBzk5OSU9iVSAfLvM+83/ZdxnFNxSN5KilX/axlXHOdUHBznVB5wnJO8eK+JiP67yjThGhERgcuXL+PChQsyx5KTk1GxYkVUqlRJqrxy5cpITk4W67yfbM0/nn+sMD///DNCQ0Nlyg8dOgQ1NbXiXgZ9gsOHD5d1CESfHcc5ycMc5sWqvw/7PlMkJcNxTvLgOKfygOOc5JWRkVHWIRAR0WdSZgnXhw8fYtSoUTh8+DBUVFS+aN8TJkzAmDFjxPdpaWkwMzND+/btoaWl9UVjKa9ycnJw+PBhfP/996hQoUJZh0P0WXCcU3Esf7m8WPV/qPTDZ4qkeDjOqTg4zqk84DgneeU/aVlezLry9Iv1Nb6+fonOe/bsGWxtbXH+/HlYWFiUblDfoJYtW2Lo0KHo3bs3AEAikWDHjh3w8PAos5gsLCwQEBCAgICAzxZTkyZNMHbsWHTr1q3U2vwSbdPXpcwSrpcuXUJKSgocHBzEstzcXJw8eRK//vorDh48iOzsbLx8+VJqluvjx49hZGQEADAyMsL58+el2n38+LF4rDDKyspQVlaWKa9QoQL/z8UXxntO5QHHOclDUBKKVf9rG1Mc5yQPjnMqDzjOSV68z1+fGTNmwN3dXUy2xsXFwdLSEleuXEG9evW+aCy+vr5Yu3YtAEBJSQm6urqoW7cuvLy84OvrCwWFz7slz+7du/H48WN4enp+1n4+VVJSktR+QKUhKCgIo0ePRpcuXYp1n3NzcxESEoL169cjOTkZJiYm8PX1RVBQECQSySe1Td+eMvvptm3bFjdu3MDVq1fFV4MGDeDt7S3+d4UKFRAZGSmec/fuXSQkJKBp06YAgKZNm+LGjRtISUkR6xw+fBhaWlqoVavWF78mIiIiIiIiIvr2ZGRkYNWqVRg4cOAX7beoDb87dOiApKQkxMXFYf/+/XBycsKoUaPQqVMnvH379rPGtWTJEvTv3/+rTwoaGRkVOKHuU3Ts2BGvXr3C/v37i3Xe7NmzsXz5cvz666+IiYnB7NmzMWfOHPzyyy+f3DZ9e8rsk6OpqYk6depIvdTV1aGnp4c6depAW1sbAwcOxJgxY3Ds2DFcunQJ/fv3R9OmTdGkSRMAQPv27VGrVi307dsX165dw8GDBxEUFIThw4eX+geOiIiIiIiIiP6b9u3bB2VlZTHfUJDjx49DIpEgMjISDRo0gJqaGpo1a4a7d+9K1duzZw8aNmwIFRUV6Ovro0uXLuIxCwsLTJs2Df369YOWlhb8/PwK7U9ZWRlGRkYwNTWFg4MDJk6ciF27dmH//v0ICwsT6718+RKDBg2CgYEBtLS00KZNG1y7dk3umD705MkTHD16FG5ubjLHkpKS0LFjR6iqqqJatWrYunWr1PFx48ahRo0aUFNTQ7Vq1TB58mSpDeKuXbsGJycnaGpqQktLC46Ojrh48aJ4PDo6Gi1atICqqirMzMwwcuRIvH79utBYJRIJdu7cCeDdjGSJRILt27fDyckJampqsLe3x5kzZ6TO+VgfioqKcHFxQURERKH9FuT06dNwd3eHq6srLCws0L17d7Rv317qyeyStk3fnq/6TxULFy5Ep06d0K1bN7Rs2RJGRkbYvn27eFxRURF//fUXFBUV0bRpU/Tp0wf9+vXD1KlTyzBqIiIiIiIiIvqWREVFwdHRUa66kyZNwvz583Hx4kUoKSlhwIAB4rG9e/eiS5cucHFxwZUrVxAZGYlGjRpJnT9v3jzY29vjypUrmDx5crHibNOmDezt7aVyIz169EBKSgr279+PS5cuwcHBAW3btsXz58/ljul90dHRUFNTg62trcyxyZMno1u3brh27Rq8vb3h6emJmJgY8bimpibCwsJw+/ZtLF68GCtXrsTChQvF497e3qhSpQouXLiAS5cuYfz48eLyGrGxsejQoQO6deuG69evY9OmTYiOjoa/v3+x7tGkSZMQGBiIq1evokaNGvDy8hJnBMvbR6NGjRAVFVWsfps1a4bIyEjcu3cPwLvkcnR0NDp27PjJbdO3p8zWcC3I8ePHpd6rqKhg6dKlWLp0aaHnmJubY9++r2tnTyIiIiIiIiL6dsTHx8PExESuujNmzECrVq0AAOPHj4erqyvevHkDFRUVzJgxA56enggNDRXr29vbS53fpk0b/PjjjyWO1cbGBtevXwfwLjl6/vx5pKSkiE/6zps3Dzt37sTWrVvh5+cnV0zvi4+PR+XKlQtcTqBHjx4YNGgQAGDatGk4fPgwfvnlFyxbtgzAuzVK81lYWCAwMBARERH46aefAAAJCQkYO3YsbGxsAADW1tZi/Z9//hne3t7ihljW1tZYsmQJWrVqheXLl8u94XpgYCBcXV0BAKGhoahduzb+/vtv2NjYyN2HiYkJHj58iLy8PLmXVRg/fjzS0tJgY2MDRUVF5ObmYsaMGfD29paqV5K26dvDnywRERERERERlWuZmZlyJ/Tq1q0r/rexsTEAiHvLXL16FW3bti3y/AYNGpQwyncEQRA3Ybp27RrS09Ohp6cHDQ0N8fXgwQPExsbKHdP7iroX+XvqvP/+/RmumzZtQvPmzWFkZAQNDQ0EBQUhISFBPD5mzBgMGjQI7dq1w6xZs8QY868lLCxM6jqcnZ2Rl5eHBw8eyB1/UT8feftQVVVFXl4esrKy5O538+bNCA8Px4YNG3D58mWsXbsW8+bNEzc/+5S26dvzVc1wJSIiIiIiIiL60vT19fHixQu56uY/Ag9ATHzm5eUBeJdM+xh1dfUSRPg/MTExsLS0BACkp6fD2NhY5olhAKhUqZLcMb2vOPfifWfOnIG3tzdCQ0Ph7OwMbW1tREREYP78+WKdkJAQ9O7dG3v37sX+/fsRHByMiIgIdOnSBenp6RgyZAhGjhwp03bVqlXljqOon4+8fTx//hzq6urFundjx47F+PHj4enpCQCws7NDfHw8fv75Z/j4+HxS2/TtYcKViL56i18sLlb9UTqjPlMkRERERET0X1S/fn2sX7/+k9upW7cuIiMj0b9//1KIStbRo0dx48YNjB49GgDg4OCA5ORkKCkpwcLColRiql+/PpKTk/HixQvo6OhIHTt79iz69esn9b5+/foA3m0aZW5ujkmTJonH4+PjZdqvUaMGatSogdGjR8PLywtr1qxBly5d4ODggNu3b6N69epyxVkS8vZx8+ZN8brklZGRIbNEgKKiopjs/ZS26dvDJQWIiIiIiIiIqFxzdnbGrVu3SjSz833BwcHYuHEjgoODERMTgxs3bmD27NklaisrKwvJyclITEzE5cuXMXPmTLi7u6NTp05i0rNdu3Zo2rQpPDw8cOjQIcTFxeH06dOYNGkSLl68WKKY6tevD319fZw6dUrm2JYtW7B69Wrcu3cPwcHBOH/+vLjhlLW1NRISEhAREYHY2FgsWbIEO3bsEM/NzMyEv78/jh8/jvj4eJw6dQoXLlwQN+caN24cTp8+DX9/f1y9ehX379/Hrl27ir1pVlHk7SMqKgrt27cvVttubm6YMWMG9u7di7i4OOzYsQMLFixAly5dPrlt+vZwhisRERERERERfVbj6+uXdQhFsrOzg4ODAzZv3owhQ4aUuJ3WrVtjy5YtmDZtGmbNmgUtLS20bNmyRG0dOHAAxsbGUFJSgo6ODuzt7bFkyRL4+PiIMyklEgn27duHSZMmoX///njy5AmMjIzQsmVLVK5cuUQxKSoqon///ggPD0enTp2kjoWGhiIiIgLDhg2DsbExNm7ciFq1agEAOnfujNGjR8Pf3x9ZWVlwdXXF5MmTERISIrb77Nkz9OvXD48fP4a+vj66du0qbuZVt25dnDhxApMmTUKLFi0gCAKsrKzQq1evEt2/gsjTR2JiIk6fPi014zkuLg6WlpY4duwYWrduXWDbv/zyCyZPnoxhw4YhJSUFJiYmGDJkCKZMmVJk2/TfJBEEQSjrIMpaWloatLW1kZqaCi0trbIOp1zIycnBvn374OLiIrW+ClFBvtUlBTjOqTg4zqk84Din8oDjnOT1X/w99M2bN3jw4AEsLS3l3oDqa7J3716MHTsWN2/eLPe7xycnJ6N27dq4fPkyzM3NyzqcL2rcuHF48eIFfv/9d7Hs2LFj6Nq1K/755x+ZZRY+tW36dhTnO44zXImIiIiIiIio3HN1dcX9+/eRmJgIMzOzsg6nTBkZGWHVqlVISEgodwlXQ0NDjBkzRqps3759mDhx4iclWwtrm/6bmHAlIiIiIiIiIgIQEBBQ1iF8NTw8PMo6hDLx448/ypTNnTv3s7VN/03le448ERERERERERERUSliwpWIiIiIiIiIiIiolDDhSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREVO49e/YMhoaGiIuLK9Z5rVu3RkBAwGeJ6VMdP34cEokEL1++LLMYwsLCUKlSpTLrn4pvxYoVcHNzK+swvmlKZR0AEREREREREf23pYaGfrG+tIODS3TejBkz4O7uDgsLCwDvEqknTpwotP7x48fRqlWrEvX1pTRr1gxJSUnQ1tb+rP1IJBLs2LEDHh4en7WfzyEkJAQRERF4+PAhKlasCEdHR8yYMQONGzcu9Jxnz57B29sb169fFxP17u7umDlzJrS0tAAA0dHRGDduHO7cuYOMjAyYm5tjyJAhGD16tFRbS5cuxdy5c5GcnAx7e3v88ssvaNSokXjcwsIC8fHx2LhxIzw9PaXOrV27Nm7fvo01a9bA19e32Nf+7Nkz2NvbIzExES9evBAT4wMGDMC0adMQFRWFFi1aFLtd4gxXIiIiIiIiIirnMjIysGrVKgwcOFAs2759O5KSkqRe8fHxqFOnDho0aFBkQu5rUbFiRRgZGUEikZR1KF+tGjVq4Ndff8WNGzcQHR0NCwsLtG/fHk+ePCn0HAUFBbi7u2P37t24d+8ewsLCcOTIEQwdOlSso66uDn9/f5w8eRIxMTEICgpCUFAQfv/9d7HOpk2bMGbMGAQHB+Py5cuwt7eHs7MzUlJSpPozMzPDmjVrpMrOnj2L5ORkqKurl/jaBw4ciLp168qUV6xYEb1798aSJUtK3HZ5x4QrEREREREREZVr+/btg7KyMpo0aSKW6erqwsjISOo1bdo0PH36FDt27ICKiopYNy8vDz/99JN4TkhIiHgsLi4OEokEV69eFctevnwJiUSC48ePi2UnTpxAo0aNoKysDGNjY4wfPx5v374Vj7du3RojRoxAQEAAdHR0ULlyZaxcuRKvX79G//79oampierVq2P//v3iOR8uKZD/eP/Bgwdha2sLDQ0NdOjQAUlJSVL3Y/Xq1ahdu7YYi7+//yfe4f+JjY2Fu7s7KleuDA0NDTRs2BBHjhyRqmNhYYHp06ejX79+0NDQgLm5OXbv3o0nT57A3d0dGhoaqFu3Li5evCie8+zZM3h5ecHU1BRqamqws7PDxo0bPxpP79690a5dO1SrVg21a9fGggULkJaWhuvXrxd6jo6ODn744Qc0aNAA5ubmaNu2LYYNG4aoqCixTv369eHl5YXatWvDwsICffr0gbOzs1SdBQsWYPDgwejfvz9q1aqFFStWQE1NDatXr5bqz9vbGydOnMDDhw/FstWrV8Pb2xtKSiV7eH358uV4+fIlAgMDCzzu5uaG3bt3IzMzs0Ttl3dMuBIRERERERFRuRYVFQVHR8ci6yxbtgx//vkntm3bhipVqkgdW7t2LdTV1XHu3DnMmTMHU6dOxeHDh+XuPzExES4uLmjYsCGuXbuG5cuXY9WqVZg+fbpMP/r6+jh//jxGjBiBH374AT169ECzZs1w+fJltG/fHn379kVGRkahfWVkZGDevHlYt24dTp48iYSEBKmk2/LlyzF8+HD4+fnhxo0b2L17N6pXry73tXxMeno6XFxcEBkZiStXrqBDhw5wc3NDQkKCVL2FCxeiefPmuHLlClxdXdG3b1/069cPffr0weXLl2FlZYV+/fpBEAQAwJs3b+Do6Ii9e/fi5s2b8PPzQ9++fXH+/Hm5Y8vOzsbvv/8ObW1t2Nvby33eo0ePsH379iKXmLhy5QpOnz4t1snOzsalS5fQrl07sY6CggLatWuHM2fOSJ1buXJlODs7Y+3atQDe/Qw3bdqEAQMGyB3j+27fvo2pU6fizz//hIJCwanBBg0a4O3btzh37lyJ+ijvmHAlIiIiIiIionItPj4eJiYmhR4/efIkAgICsHTpUjRr1kzmeN26dREcHAxra2v069cPDRo0QGRkpNz9L1u2DGZmZvj1119hY2MDDw8PhIaGYv78+cjLyxPr2dvbIygoCNbW1pgwYQJUVFSgr6+PwYMHw9raGlOmTMGzZ8+KnJ2Zk5ODFStWoEGDBnBwcIC/v79UrNOnT8ePP/6IUaNGoUaNGmjYsGGpbgpmb2+PIUOGoE6dOrC2tsa0adNgZWWF3bt3S9VzcXHBkCFDxOtKS0tDw4YN0aNHD9SoUQPjxo1DTEwMHj9+DAAwNTVFYGAg6tWrh2rVqmHEiBHo0KEDNm/e/NGY/vrrL2hoaEBFRQULFy7E4cOHoa+v/9HzvLy8oKamBlNTU2hpaeGPP/6QqVOlShUoKyujQYMGGD58OAYNGgQAePr0KXJzc1G5cmWp+pUrV0ZycrJMOwMGDEBYWBgEQcDWrVthZWWFevXqfTTGD2VlZcHLywtz585F1apVC62npqYGbW1txMfHF7sPYsKViIiIiIiIiMq5zMxMqSUC3peQkIDu3bvDz89PTJZ96MN1MI2NjWXW4SxKTEwMmjZtKrXWavPmzZGeno5///23wH4UFRWhp6cHOzs7sSw/eVdU32pqarCysiow1pSUFDx69Aht27Yt8NyhQ4dCQ0NDfJVEeno6AgMDYWtri0qVKkFDQwMxMTEyM1zfv9b86yrqWnNzczFt2jTY2dlBV1cXGhoaOHjwoNhueHi4VOzvP9rv5OSEq1ev4vTp0+jQoQN69uwpttuxY0fxnNq1a0vFuHDhQly+fBm7du1CbGwsxowZI3O9UVFRuHjxIlasWIFFixbJtcxBQVxdXZGeno6TJ09i9erVJZ7dOmHCBNja2qJPnz4frauqqlrkbGkqXMkWeiAiIiIiIiIi+o/Q19fHixcvZMozMzPRpUsX1K5dG4sWLSr0/AoVKki9l0gk4szU/Ee28x99B97NMi2Jgvp5vyw/Yfv+rFh52siPTVVVtcj+p06dWuian/IKDAzE4cOHMW/ePFSvXh2qqqro3r07srOzC40z/7qKuta5c+di8eLFWLRoEezs7KCuro6AgACx3c6dO0ttdGZqair+t7q6OqpXr47q1aujSZMmsLa2xqpVqzBhwgT88ccf4jqmH967/LV9bWxsoKurixYtWmDy5MkwNjYW61haWgJ4lyx+/PgxQkJC4OXlBX19fSgqKoozdPM9fvwYRkZGMvdNSUkJffv2RXBwMM6dO4cdO3YUeZ8Lc/ToUdy4cQNbt24F8L9xqa+vj0mTJiE0NFSs+/z5cxgYGJSon/KOCVciIiIiIiIiKtfq16+P9evXy5QPGjQIz58/x8GDB0u8OVF+wiopKQn169cHAKkNtADA1tYW27ZtgyAIYiLx1KlT0NTUlFkv9nPS1NSEhYUFIiMj4eTkJHPc0NAQhoaGn9THqVOn4Ovriy5dugB4N+M1Li7uk9rMb9fd3V2cuZmXl4d79+6hVq1aAN5dm6amplxt5eXlISsrC4B0YvZj5wAQz/tYuxUrVoSjoyMiIyPh4eEhHo+MjCx0k7IBAwZg3rx56NWrF3R0dOSK60Pbtm2T2gjrwoULGDBgAKKioqRmPsfGxuLNmzfimKXiYcKViIiIiIiIiMo1Z2dnTJgwAS9evBATWXPnzsWWLVuwZ88evH37VmZdTW1t7Y/OCAXezRpt0qQJZs2aBUtLS6SkpCAoKEiqzrBhw7Bo0SKMGDEC/v7+uHv3LoKDgzFmzJhCNzX6XEJCQjB06FAYGhqiY8eOePXqFU6dOoURI0YUed6DBw9kEsnW1tYy9aytrbF9+3a4ublBIpFg8uTJRc7IlZe1tTW2bt2K06dPQ0dHBwsWLMDjx4/FhGtBXr9+jRkzZqBz584wNjbG06dPsXTpUiQmJqJHjx6Fnrdv3z48fvwYDRs2hIaGBm7duoWxY8eiefPmsLCwAAAsXboUVatWhY2NDYB36wDPmzcPI0eOFNsZM2YMfHx80KBBAzRq1AiLFi3C69ev0b9//wL7tbW1xdOnT6GmplaCO/TO+0lV4N1asvltV6pUSSyPiopCtWrVZOqTfJhwJSIiIiIiIqJyzc7ODg4ODti8eTOGDBkC4N1GVjk5OejQoUOB56xZswa+vr5ytb969WoMHDgQjo6OqFmzJubMmYP27duLx01NTbFv3z6MHTsW9vb20NXVxcCBA2USs1+Cj48P3rx5g4ULFyIwMBD6+vro3r37R88rbP3SDy1YsAADBgxAs2bNoK+vj3HjxiEtLe2T4w4KCsI///wDZ2dnqKmpwc/PDx4eHkhNTS30HEVFRdy5cwdr167F06dPoaenh4YNGyIqKkpmvdb3qaqqYuXKlRg9ejSysrJgZmaGrl27Yvz48WKdvLw8TJgwAQ8ePICSkhKsrKwwe/ZscXwBQK9evfDkyRNMmTIFycnJqFevHg4cOCCzkdb79PT0irwPvr6+iIuLw/Hjx4us9zEbN27E4MGDP6mN8kwivL+ISDmVlpYGbW1tpKamQktLq6zDKRdycnKwb98+uLi4yKyBQvShxS8WF6v+KJ1RnymS4uE4p+LgOKfygOOcygOOc5LXf/H30Ddv3uDBgwewtLQsdAOqr9nevXsxduxY3Lx584vPKiUqLa1atYKTkxNCQkJK3MatW7fQpk0b3Lt3D9ra2qUX3DeuON9xnOFKRERERET0DUh9byMTeWgHB3+mSIj+m1xdXXH//n0kJibCzMysrMMhKrbU1FTExsZi7969n9ROUlIS/vzzTyZbPwETrkREREREREREAAICAso6BKIS09bWxr///vvJ7bRr164UoinfOEeeiIiIiIiIiIiIqJQw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUsKEKxERERERERGVe8+ePYOhoSHi4uKKdV7r1q0REBDwWWL6VMePH4dEIsHLly/LLIawsDBUqlSpzPqn4jtw4ADq1auHvLy8sg7lm8WEKxERERERERF9XnckX+5VQjNmzIC7uzssLCwAvEukSiSSQl8nTpwopZvz+TRr1gxJSUnQ1tb+rP1IJBLs3Lnzs/bxuYSEhMDGxgbq6urQ0dFBu3btcO7cuSLPefbsGTp06AATExMoKyvDzMwM/v7+SEtLE+tER0ejefPm0NPTg6qqKmxsbLBw4UKZtpYuXQoLCwuoqKigcePGOH/+vNRxCwsLSCQSREREyJxbu3ZtSCQShIWFFeuaL1y4gLZt26JSpUrQ0dGBs7Mzrl27Jh7v0KEDKlSogPDw8GK1S/+jVNYBEBERERF9qtTQ0GLV1w4O/kyREBHRtygjIwOrVq3CwYMHxbLt27cjOztbql52djZcXV3F5NjXrmLFijAyMirrML5qNWrUwK+//opq1aohMzMTCxcuRPv27fH333/DwMCgwHMUFBTg7u6O6dOnw8DAAH///TeGDx+O58+fY8OGDQAAdXV1+Pv7o27dulBXV0d0dDSGDBkCdXV1+Pn5AQA2bdqEMWPGYMWKFWjcuDEWLVoEZ2dn3L17F4aGhmJ/ZmZmWLNmDTw9PcWys2fPIjk5Gerq6sW63vT0dHTo0AGdO3fGsmXL8PbtWwQHB8PZ2RkPHz5EhQoVAAC+vr5YsmQJ+vbtW6z26R3OcCUiIiIiIiKicm3fvn1QVlZGkyZNxDJdXV0YGRlJvaZNm4anT59ix44dUFFREevm5eXhp59+Es8JCQkRj8XFxUEikeDq1ati2cuXLyGRSHD8+HGx7MSJE2jUqBGUlZVhbGyM8ePH4+3bt+Lx1q1bY8SIEQgICICOjg4qV66MlStX4vXr1+jfvz80NTVRvXp17N+/XzznwyUF8h/vP3jwIGxtbaGhoYEOHTogKSlJ6n6sXr0atWvXFmPx9/f/xDv8P7GxsXB3d0flypWhoaGBhg0b4siRI1J1LCwsMH36dPTr1w8aGhowNzfH7t278eTJE7i7u0NDQwN169bFxYsXxXOePXsGLy8vmJqaQk1NDXZ2dti4ceNH4+nduzfatWuHatWqoXbt2liwYAHS0tJw/fr1Qs/R0dHBDz/8gAYNGsDc3Bxt27bFsGHDEBUVJdapX78+vLy8ULt2bVhYWKBPnz5wdnaWqrNgwQIMHjwY/fv3R61atbBixQqoqalh9erVUv15e3vjxIkTePjwoVi2evVqeHt7Q0mpeHMp79y5g+fPn2Pq1KmoWbMmateujeDgYDx+/Bjx8fFiPTc3N1y8eBGxsbHFap/eYcKViIiIiIiIiMq1qKgoODo6Flln2bJl+PPPP7Ft2zZUqVJF6tjatWuhrq6Oc+fOYc6cOZg6dSoOHz4sd/+JiYlwcXFBw4YNce3aNSxfvhyrVq3C9OnTZfrR19fH+fPnMWLECPzwww/o0aMHmjVrhsuXL6N9+/bo27cvMjIyCu0rIyMD8+bNw7p163Dy5EkkJCQgMDBQPL58+XIMHz4cfn5+uHHjBnbv3o3q1avLfS0fk56eDhcXF0RGRuLKlSvo0KED3NzckJCQIFVv4cKFaN68Oa5cuQJXV1f07dsX/fr1Q58+fXD58mVYWVmhX79+EAQBAPDmzRs4Ojpi7969uHnzJvz8/NC3b1+ZR/SLkp2djd9//x3a2tqwt7eX+7xHjx5h+/btaNWqVaF1rly5gtOnT4t1srOzcenSJbRr106so6CggHbt2uHMmTNS51auXBnOzs5Yu3YtgHc/w02bNmHAgAFyx5ivZs2a0NPTw6pVq5CdnY3MzEysWrUKtra24nIaAFC1alVUrlxZKkFM8mPClYiIiIiIiIjKtfj4eJiYmBR6/OTJkwgICMDSpUvRrFkzmeN169ZFcHAwrK2t0a9fPzRo0ACRkZFy979s2TKYmZnh119/hY2NDTw8PBAaGor58+dLbVxkb2+PoKAgWFtbY8KECVBRUYG+vj4GDx4Ma2trTJkyBc+ePStydmZOTg5WrFiBBg0awMHBAf7+/lKxTp8+HT/++CNGjRqFGjVqoGHDhqW6KZi9vT2GDBmCOnXqwNraGtOmTYOVlRV2794tVc/FxQVDhgwRrystLQ0NGzZEjx49UKNGDYwbNw4xMTF4/PgxAMDU1BSBgYGoV68eqlWrhhEjRqBDhw7YvHnzR2P666+/oKGhARUVFSxcuBCHDx+Gvr7+R8/z8vKCmpoaTE1NoaWlhT/++EOmTpUqVaCsrIwGDRpg+PDhGDRoEADg6dOnyM3NReXKlaXqV65cGcnJyTLtDBgwAGFhYRAEAVu3boWVlRXq1av30Rg/pKmpiePHj2P9+vVQVVWFhoYGDhw4gP3798vMljUxMZGa9UryY8KViIiIiIiIiMq1zMxMqSUC3peQkIDu3bvDz89PTJZ9qG7dulLvjY2NkZKSInf/MTExaNq0KSSS/2361bx5c6Snp+Pff/8tsB9FRUXo6enBzs5OLMtP3hXVt5qaGqysrAqMNSUlBY8ePULbtm0LPHfo0KHQ0NAQXyWRnp6OwMBA2NraolKlStDQ0EBMTIzMDNf3rzX/uoq61tzcXEybNg12dnbQ1dWFhoYGDh48KLYbHh4uFfv7MzednJxw9epVnD59Gh06dEDPnj3Fdjt27CieU7t2bakYFy5ciMuXL2PXrl2IjY3FmDFjZK43KioKFy9exIoVK7Bo0SK5ljkoiKurK9LT03Hy5EmsXr26RLNbgXdjfeDAgWjevDnOnj2LU6dOoU6dOnB1dUVmZqZUXVVV1SJnS1PhuGkWEREREREREZVr+vr6ePHihUx5ZmYmunTpgtq1a2PRokWFnp+/0VA+iUQizkxVUHg31y3/0Xfg3SzTkiion/fL8hO278+KlaeN/NhUVVWL7H/q1KlSyw+URGBgIA4fPox58+ahevXqUFVVRffu3WU2KCvouoq61rlz52Lx4sVYtGgR7OzsoK6ujoCAALHdzp07S210ZmpqKv63uro6qlevjurVq6NJkyawtrbGqlWrMGHCBPzxxx9iIvLDe5e/tq+NjQ10dXXRokULTJ48GcbGxmIdS0tLAO+SxY8fP0ZISAi8vLygr68PRUVFcYZuvsePHxe40ZmSkhL69u2L4OBgnDt3Djt27CjyPhdmw4YNiIuLw5kzZ8SxuWHDBujo6GDXrl1SG3M9f/680I3DqGhMuBIRERERERFRuVa/fn2sX79epnzQoEF4/vw5Dh48WOzNifLlJ6ySkpJQv359AJDaQAsAbG1tsW3bNgiCICYST506BU1NTZn1Yj8nTU1NWFhYIDIyEk5OTjLHDQ0NYWho+El9nDp1Cr6+vujSpQuAdzNe4+LiPqnN/Hbd3d3Rp08fAO8Ssffu3UOtWrUAvLs2TU1NudrKy8tDVlYWAOnE7MfOASCe97F2K1asCEdHR0RGRsLDw0M8HhkZWegmZQMGDMC8efPQq1cv6OjoyBXXhzIyMqCgoCA1mzr//fuJ+jdv3iA2NlYcs1Q8TLgSERERERERUbnm7OyMCRMm4MWLF2Iia+7cudiyZQv27NmDt2/fyqyrqa2t/dEZocC7WaNNmjTBrFmzYGlpiZSUFAQFBUnVGTZsGBYtWoQRI0bA398fd+/eRXBwMMaMGSPOQvxSQkJCMHToUBgaGqJjx4549eoVTp06hREjRhR53oMHD2QSydbW1jL1rK2tsX37dri5uUEikWDy5MlFzsiVl7W1NbZu3YrTp09DR0cHCxYswOPHj8WEa0Fev36NGTNmoHPnzjA2NsbTp0+xdOlSJCYmokePHoWet2/fPjx+/BgNGzaEhoYGbt26hbFjx6J58+bixlNLly5F1apVYWNjA+DdOsDz5s3DyJEjxXbGjBkDHx8fNGjQAI0aNcKiRYvw+vVr9O/fv8B+bW1t8fTpU6ipqZXgDr3z/fffY+zYsRg+fDhGjBiBvLw8zJo1C0pKSlJJ9rNnz0JZWRlNmzYtcV/lGROuRERERERERFSu2dnZwcHBAZs3b8aQIUMAvNvIKicnBx06dCjwnDVr1sDX11eu9levXo2BAwfC0dERNWvWxJw5c9C+fXvxuKmpKfbt24exY8fC3t4eurq6GDhwoExi9kvw8fHBmzdvsHDhQgQGBkJfXx/du3f/6HmFrV/6oQULFmDAgAFo1qwZ9PX1MW7cOKSlpX1y3EFBQfjnn3/g7OwMNTU1+Pn5wcPDA6mpqYWeo6ioiDt37mDt2rV4+vQp9PT00LBhQ0RFRcms1/o+VVVVrFy5EqNHj0ZWVhbMzMzQtWtXjB8/XqyTl5eHCRMm4MGDB1BSUoKVlRVmz54tji8A6NWrF548eYIpU6YgOTkZ9erVw4EDB2Q20nqfnp5ekffB19cXcXFxOH78eIHHbWxssGfPHoSGhqJp06ZQUFBA/fr1ceDAAamlEDZu3Ahvb+9PSu6WZxLh/UVEyqm0tDRoa2sjNTUVWlpaZR1OsSx+sbhY9UfpjPpMkRRPTk4O9u3bBxcXF5k1UIg+xHFO5QHHOZUHn3Ocp4aGFqtt7eBguetynFNxcJyTvL7l30ML8+bNGzx48ACWlpaFbkD1Ndu7dy/Gjh2LmzdvfvFZpUSlpVWrVnByckJISEiJ23j69Clq1qyJixcvimvQUvG+4zjDlYiIiIiIiIjKPVdXV9y/fx+JiYkwMzMr63CIii01NRWxsbHYu3fvJ7UTFxeHZcuWMdn6CZhwJSIiIiIiIiICEBAQUNYhEJWYtrY2/v33309up0GDBmjQoEEpRFR+MeFKRET0H/c5H0ElIiIiIiIiaVyUhIiIiIiIiIiIiKiUMOFKREREREREREREVEqYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIiIiISgkTrkRERERERERERESlhAlXIiIiIiIiIir3nj17BkNDQ8TFxZV1KHI7fvw4JBIJXr58CQAICwtDpUqVyjSmkoqLi4NEIsHVq1fLOhQqp5o0aYJt27aVSltKpdIKEREREREREVEh/ln7zxfrq5pPtRKdN2PGDLi7u8PCwqJ0A/rA8ePH4eTkhBcvXnyzydGSat26NerVq4dFixaVdSjFtn37dsycORN///03cnJyYG1tjR9//BF9+/Yt9JykpCT8+OOPuHjxIv7++2+MHDlSrmvPzc1FSEgI1q9fj+TkZJiYmMDX1xdBQUGQSCQA3t3LEydOiOcYGhqiZcuWmDdvHszNzQttu7THX0hICHbu3PnFEuUWFhYICAhAQEBAiduIiIiAl5cX3N3dsXPnTrE8KCgIo0ePRpcuXaCg8GlzVDnDlYiIiIiIiIjKtYyMDKxatQoDBw4s61DoK6Wrq4tJkybhzJkzuH79Ovr374/+/fvj4MGDhZ6TlZUFAwMDBAUFwd7eXu6+Zs+ejeXLl+PXX39FTEwMZs+ejTlz5uCXX36Rqjd48GAkJSXh0aNH2LVrFx4+fIg+ffqU+Bo/p5ycnLIOAcC7mdSBgYFo0aKFzLGOHTvi1atX2L9//yf3U6YJ1+XLl6Nu3brQ0tKClpYWmjZtKnVRrVu3hkQikXoNHTpUqo2EhAS4urpCTU0NhoaGGDt2LN6+ffulL4WIiIiIiIiIvlH79u2DsrIymjRpIpblP65/8OBB1K9fH6qqqmjTpg1SUlKwf/9+2NraQktLC71790ZGRoZ4Xl5eHn7++WdYWlpCVVUV9vb22Lp1K4B3yR4nJycAgI6ODiQSCXx9fQEABw4cwHfffYdKlSpBT08PnTp1Qmxs7CddV2xsLNzd3VG5cmVoaGigYcOGOHLkiFQdCwsLTJ8+Hf369YOGhgbMzc2xe/duPHnyBO7u7tDQ0EDdunVx8eJF8Zxnz57By8sLpqamUFNTg52dHTZu3PhJsX4oNzcXAwcOFO9jzZo1sXjxYqk6vr6+8PDwwMyZM1G5cmVUqlQJU6dOxdu3bzF27Fjo6uqiSpUqWLNmjdR548aNQ40aNaCmpoZq1aph8uTJH00Itm7dGl26dIGtrS2srKwwatQo1K1bF9HR0YWeY2FhgcWLF6Nfv37Q1taW+9pPnz4Nd3d3uLq6wsLCAt27d0f79u1x/vx5qXpqamowMjKCsbExmjRpAn9/f1y+fFnufoD/LUNx8OBB2NraQkNDAx06dEBSUpJY5/jx42jUqBHU1dVRqVIlNG/eHPHx8QgLC0NoaCiuXbsm5u3CwsIAABKJBMuXL0fnzp2hrq6OGTNmFLjkxc6dO8VZu/n27NmDhg0bQkVFBfr6+ujSpQuAdz+D+Ph4jB49WuyvOHJzc+Ht7Y3Q0FBUqyY7E15RUREuLi6IiIgoVrsFKdOEa5UqVTBr1ixcunQJFy9eRJs2beDu7o5bt26JdfKz9fmvOXPmiMdyc3Ph6uqK7OxsnD59GmvXrkVYWBimTJlSFpdDRERERERERN+gqKgoODo6FngsJCQEv/76K06fPo2HDx+iZ8+eWLRoETZs2IC9e/fi0KFDUjMPf/75Z/z5559YsWIFbt26hdGjR6NPnz44ceIEzMzMxDUi7969i6SkJDGJ+Pr1a4wZMwYXL15EZGQkFBQU0KVLF+Tl5ZX4utLT0+Hi4oLIyEhcuXIFHTp0gJubGxISEqTqLVy4EM2bN8eVK1fg6uqKvn37ol+/fujTpw8uX74MKysr9OvXD4IgAADevHkDR0dH7N27Fzdv3oSfnx/69u0rkxD8FHl5eahSpQq2bNmC27dvY8qUKZg4cSI2b94sVe/o0aN49OgRTp48iQULFiA4OBidOnWCjo4Ozp07h6FDh2LIkCH4999/xXM0NTURFhaG27dvY/HixVi5ciUWLlwod2yCICAyMhJ3795Fy5YtS+2a8zVr1gyRkZG4d+8eAODatWuIjo5Gx44dCz3n+fPn2Lx5Mxo3blzs/jIyMjBv3jysW7cOJ0+eREJCAgIDAwEAb9++hYeHB1q1aoXr16/jzJkz8PPzg0QiQa9evfDjjz+idu3aYt6uV69eYrshISHo0qULbty4gQEDBsgVy969e9GlSxe4uLjgypUriIyMRKNGjQC8W9ahSpUqmDp1qthfcUydOhWGhoZFzmRv1KgRoqKiitVuQcp0DVc3Nzep9zNmzMDy5ctx9uxZ1K5dG8D/svUFOXToEG7fvo0jR46gcuXKqFevHqZNm4Zx48YhJCQEFStW/OzXQERERERERETftvj4eJiYmBR4bPr06WjevDkAYODAgZgwYQJiY2PFGXLdu3fHsWPHMG7cOGRlZWHmzJk4cuQImjZtCgCoVq0aoqOj8dtvv6FVq1bQ1dUF8G7Nzfdn+3Xr1k2q39WrV8PAwAC3b99GnTp1SnRd9vb2Uo+yT5s2DTt27MDu3bvh7+8vlru4uGDIkCEAgClTpmD58uVo2LAhevToAeDdjNCmTZvi8ePHMDIygqmpqZiQA4ARI0bg4MGD2Lx5s5gc+1QVKlRAaGio+N7S0hJnzpzB5s2b0bNnT7FcV1cXS5YsgYKCAmrWrIk5c+YgIyMDEydOBABMmDABs2bNQnR0NDw9PQG8W6szn4WFBQIDAxEREYGffvqpyJhSU1NhamqKrKwsKCoqYtmyZfj+++9L5XrfN378eKSlpcHGxgaKiorIzc3FjBkz4O3tLVVv2bJl+OOPPyAIAjIyMlCjRo0ilzgoTE5ODlasWAErKysAgL+/P6ZOnQoASEtLQ2pqKjp16iQet7W1Fc/V0NCAkpJSgbm73r17o3///sWKZcaMGfD09JT62eePYV1dXSgqKkJTU7PQXGFhoqOjsWrVqo+uNWtiYoKHDx8iLy/vk9Zx/Wo2zcrNzcWWLVvw+vVr8UsJAMLDw7F+/XoYGRnBzc0NkydPhpqaGgDgzJkzsLOzQ+XKlcX6zs7O+OGHH3Dr1i3Ur1+/wL6ysrKQlZUlvk9LSwPwboB9LWtKyEvytnjTp7+W68uP42uJh75uHOdUHnzOcf62mP9HoThtc5xTcXCcU3nAcU7y4r3+umRmZkJFRaXAY3Xr1hX/u3LlyuJj6O+X5c/s/Pvvv5GRkSGThMvOzi40R5Hv/v37mDJlCs6dO4enT5+KM1sTEhJQp04ddOzYUZx5Z25uLvV0cGHS09MREhKCvXv3IikpCW/fvkVmZqbMDNcPrxEA7OzsZMpSUlJgZGSE3NxczJw5E5s3b0ZiYiKys7ORlZUl5mvCw8PFBC4A7N+/v8A1Mz9m6dKlWL16NRISEpCZmYns7GzUq1dPqk7t2rWlEmOVK1eWSlArKipCT08PKSkpYtmmTZuwZMkSxMbGIj09HW/fvoWWlhaAd/e7Vq1aYt2JEyeKyVtNTU1cvXoV6enpiIyMxJgxY1CtWjW0bt262NcGvJtZ/f6s1d9++w3e3t7YvHkzwsPDsWHDBtSuXRtXr15FQEAATExM4OPjI9b39vbGpEmTAACPHz/GzJkz0b59e1y6dAmampqoXbs24uPjAQAtWrQodG1SNTU1MZkKAMbGxuL90tXVha+vL5ydnfH999+jXbt26NmzJ4yNjT96fQ0aNCj2Pbl69SoGDx5c7POK8urVK/Tt2xcrV66Evr5+kXVVVVWRl5eHrKwsqKqqlrjPMk+43rhxA02bNsWbN2+goaGBHTt2iAO7d+/eMDc3h4mJCa5fv45x48bh7t272L59OwAgOTlZKtkK/O9LIDk5udA+f/75Z6lMeb5Dhw6JXw7fCnMUvvNcQfZh32eKpGQOHz5c1iHQN4DjnMqDzzrO3/s/0PI1XvzPEMc5yYPjnMoDjnOS1/trflLZ09fXx4sXLwo8VqFCBfG/JRKJ1Pv8svzkaHp6OoB3j0WbmppK1VNWVi4yBjc3N5ibm2PlypUwMTFBXl4e6tSpg+zsbADAH3/8gczMTJmYihIYGIjDhw9j3rx5qF69OlRVVdG9e3exzcKusbCy/OucO3cuFi9ejEWLFsHOzg7q6uoICAgQ2+3cubPUo+0f3gt5REREIDAwEPPnz0fTpk2hqamJuXPn4ty5c4XGnh9rUT+jM2fOiOt4Ojs7Q1tbGxEREZg/fz6AdzMc358FmT8jGQAUFBRQvXp1AEC9evUQExODn3/+ucQJ1wYNGkj1lZ/TGjt2LMaPHy/OyLWzs0N8fDx+/vlnqYSrtra2GE/16tWxatUqGBsbY9OmTRg0aBD27dsn/nGnqORhQfcrf/kIAFizZg1GjhyJAwcOYNOmTQgKCsLhw4el1jwuiLq6utR7BQUFqXYB2T8+fUqSszCxsbGIi4uTetI+fzwoKSnh7t27YsL5+fPnUFdX/+Q4yjzhWrNmTVy9ehWpqanYunUrfHx8cOLECdSqVQt+fn5iPTs7OxgbG6Nt27aIjY2VyrwX14QJEzBmzBjxfVpaGszMzNC+fXvxLxrfiuUvlxer/g+VfvhMkRRPTk4ODh8+jO+//17ufyio/OI4p/Lgc47ztFmzitW21vjxctflOKfi4Din8oDjnOSV/6QlfR3q16+P9evXf3I7tWrVgrKyMhISEtCqVasC6+Qvf5ibmyuWPXv2DHfv3sXKlSvFmaAfbsZUkqTlqVOn4OvrK246lJ6ejri4uGK3U1C77u7u6NOnD4B3yat79+6JE+g0NTWhqan5yX00a9YMw4YNE8s+dRMx4N2GVObm5uLMUADiLFDgXQIuP4n5MfkzIUtKVVW1wL4yMjJkHmdXVFT86Hq+ioqKACAm5s3Ni/dHwKLUr18f9evXx4QJE9C0aVNs2LABTZo0QcWKFaXGclEMDAzw6tUrvH79WkzGfviIf926dREZGVnoUgTF6S+fjY0Nbty4IVUWFBSEV69eYfHixTAzMxPLb968+dHZ6PIo84RrxYoVxcHl6OiICxcu4P/Yu/f4nuv//+P392Zms/PsYA7bHHI+hTR8ipxCzh2cR4pChVYoh7EcUk4VisqQtVT0EZNYjpHkkERyXhj6hM0sM9t+f/ju/fO2mfd7e83MbtcuLhfv1+v5er4er9eeI/c9X8/X7Nmz9dFHH2Vpm/nTkSNHjqhixYry9/fPsiDzuXPnJCnHtRwcHR2z/cmSg4NDofufi4xiGXdudJN77foK4z3H3cc4R1GQn+O8mI0vWsjNeGWcwxqMcxQFjHNYi/t8b2nTpo1Gjx6tixcvytPTM9f9uLq6KiwsTMOHD1d6erqaNm2qhIQE/fjjj3Jzc1NoaKgCAwNlMpm0atUqtWvXTk5OTvL09JS3t7fmz5+v0qVLKy4uTqNs+KHJ7VSuXFnLly9Xhw4dZDKZNHbs2Dy9hOvmfr/66itt27ZNnp6emjFjhs6dO2fxKP7t/P3331lCtuweT69cubIWL16stWvXKjg4WEuWLNHOnTsVHByc59rj4uIUHR2thg0bavXq1VqxYsUdj5syZYoaNGigihUrKiUlRTExMVqyZInmzfv/P2gbPXq0Tp8+rcWLF5u3ZV5rUlKS+dqLFy+e473q0KGDJk2apPLly6tGjRras2ePZsyYkeXFU8nJyeYnvM+dO6eIiAiVKFFCrVu3tuWW5Oj48eOaP3++OnbsqICAAB06dEiHDx9W3759Jd1YA/f48ePau3evypYtK1dX19vO5m7UqJGcnZ31xhtv6OWXX9aOHTsUGRlp0Wb8+PFq0aKFKlasqO7du+v69euKiYnRyJEjzefbvHmzunfvLkdHxzsuESBJJUqUyLIOcub6ybdu37JliyH3r8AD11vl9NOBzEGa+Y0YEhKiSZMm6fz58/L19ZV04xEYNzc3q77JAQAAAABA/qsQWuHOjQpQrVq19OCDD2rZsmUWa4/mRkREhHx8fDRlyhQdO3ZMHh4eevDBB83rgJYpU0YTJkzQqFGj1L9/f/Xt21eRkZGKjo7Wyy+/rJo1a6pKlSp67733cv2oeqbMkK5x48YqVaqURo4cacjs6jFjxujYsWNq06aNnJ2dNXDgQHXu3FkJCQl3PDYqKkpRUVEW2yIiIsyzZTMNGjRIe/bs0TPPPCOTyaQePXpo8ODBt12H1FodO3bU8OHDNXToUKWkpKh9+/YaO3aswsPDczzuypUrGjx4sE6dOiUnJydVrVpVn332mZ555hlzm/j4+Czr4948W3LXrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78vLy0meffabXXntNCxYsUIsWLRQeHm7xhHuzZs305ZdfKiIiQlOnTpWbm5seeeQR8/6JEydq0KBB5uA7c4kCk8mU47mtcfr0aW3bts2Q2e6mjFsXT7iLRo8erbZt26p8+fK6fPmyoqKi9Pbbb2vt2rWqUKGCoqKi1K5dO3l7e2vfvn0aPny4ypYtq02bNkm6Mf2+bt26CggI0LRp03T27Fn16dNHzz33nCZPnmx1HYmJiXJ3d1dCQkKhW1Jg9sXZNrV/xfOVfKrENqmpqYqJiVG7du34yS7uiHGOoiA/x3lCNuuW58R9/Hir2zLOYQvGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXcePH9cDDzygAwcOqHLlyrnuZ+TIkbp48aLmz5+f7X5b/owr0Bmu58+fV9++fRUfHy93d3fVrl1ba9euVatWrfTXX39p/fr1mjVrlq5cuaJy5cqpW7duGjNmjPl4e3t7rVq1Si+++KJCQkJUsmRJhYaGauLEiQV4VQAKk/z8hwsAAACAwqN9+/Y6fPiwTp8+bbGmI4B7W0xMjAYOHJinsFWSfH19Ld75lBcFGrh+8sknt91Xrlw580zWnAQGBiomF2/fBAAAAAAAuNmwYcMKugQANhoyZIgh/bz66quG9CNJzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER1b17d02fPt2QvghcAQAAAABA/jr8/d37lUuTJk1Sp06dFBQUZNx1Z+PWkLQoadasmYYNG1bQZeTK8uXL1aBBA3l4eKhkyZKqW7eulixZkuMx8fHx6tmzpx544AHZ2dlZfe1paWkaO3asgoOD5eTkpIoVKyoiIkIZGRnmNs2aNZPJZDL/8vPz01NPPaWTJ0/m2LfR4y88PFx169Y1pC9rBAUFadasWbk6dtasWapSpYqcnJxUrlw5DR8+XFevXjXvHzNmjCZNmqSEhIQ810ngCgAAAAAAirTk5GR98sknGjBgQEGXgnuUl5eX3nzzTW3fvl379u1T//791b9/f61du/a2x6SkpMjHx0djxoxRnTp1rD7X22+/rXnz5umDDz7QwYMH9fbbb2vatGl6//33Ldo9//zzio+P15kzZ/Tf//5Xf/31l3r37p3ra8xPqampBXr+qKgojRo1SuPHj9fBgwf1ySef6IsvvtAbb7xhblOzZk1VrFhRn332WZ7PR+AKAAAAAACKtJiYGDk6Ourhhx82b8ucCbh27VrVq1dPTk5Oeuyxx3T+/HmtWbNG1apVk5ubm3r27Knk5GTzcenp6ZoyZYp5dmKdOnX01VdfSbrx2Hzz5s0lSZ6enjKZTOrXr58k6bvvvlPTpk3l4eEhb29vPfHEEzp69Gieruvo0aPq1KmT/Pz85OLiooYNG2r9+vUWbYKCgvTWW2+pb9++cnFxUWBgoFauXKm///5bnTp1kouLi2rXrq1ffvnFfMw///yjHj16qEyZMnJ2dlatWrX0+eef56nWW6WlpWnAgAHm+1ilShXNnj3bok2/fv3UuXNnTZ48WX5+fvLw8NDEiRN1/fp1vfbaa/Ly8lLZsmW1cOFCi+NGjhypBx54QM7OzqpQoYLGjh17x0CwWbNm6tKli6pVq6aKFSvqlVdeUe3atbV169bbHhMUFKTZs2erb9++cnd3t/rat23bpk6dOql9+/YKCgrSk08+qdatW+vnn3+2aOfs7Cx/f3+VLl1aDz/8sIYOHardu3dbfR7p/y9DsXbtWlWrVk0uLi56/PHHFR8fb26zceNGPfTQQypZsqQ8PDzUpEkTnTx5UpGRkZowYYJ+/fVX80zbyMhISZLJZNK8efPUsWNHlSxZUpMmTcp2yYtvvvlGJpPJYtu3336rhg0bqkSJEipVqpS6dOki6cbX4OTJkxo+fLj5fNbatm2bmjRpop49eyooKEitW7dWjx49stzTDh06KDo62oY7mD0CVwAAAAAAUKRt2bJF9evXz3ZfeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e4uZh1OmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fc2PtF+9elX169fX6tWrtX//fg0cOFB9+vTJEl7lRXp6usqWLasvv/xSBw4c0Lhx4/TGG29o2bJlFu1++OEHnTlzRps3b9aMGTM0fvx4PfHEE/L09NSOHTv0wgsvaNCgQTp16pT5GFdXV0VGRurAgQOaPXu2FixYoJkzZ1pdW0ZGhmJjY3Xo0CE98sgjhl1zpsaNGys2NlZ//vmnJOnXX3/V1q1b1bZt29sec+HCBS1btkyNGjWy+XzJycl69913tWTJEm3evFlxcXEKCwuTJF2/fl2dO3fWo48+qn379mn79u0aOHCgTCaTnnnmGb366quqUaOG4uPjFR8fr2eeecbcb3h4uLp06aLffvtNzz77rFW1rF69Wl26dFG7du20Z88excbG6qGHHpJ0Y1mHsmXLauLEiebzWatx48batWuXeYweO3ZMMTExateunUW7hx56SD///LNSUlKs7js7xfJ0NAAAAAAAQCF38uRJBQQEZLvvrbfeUpMmTSRJAwYM0OjRo3X06FFVqFBBkvTkk09qw4YNGjlypFJSUjR58mStX79eISEhkqQKFSpo69at+uijj/Too4/Ky8tLkuTr62sx269bt24W5/3000/l4+OjAwcOqGbNmrm6rjp16lg8yh4REaEVK1Zo5cqVGjp0qHl7u3btNGjQIEnSuHHjNG/ePDVs2FBPPfWUpBszQkNCQnTu3Dn5+/urTJky5kBOkl566SWtXbtWy5YtM4djeeXg4KAJEyaYPwcHB2v79u1atmyZnn76afN2Ly8vvffee7Kzs1OVKlU0bdo0JScnmx8VHz16tKZOnaqtW7eqe/fukm6s1ZkpKChIYWFhio6O1uuvv55jTQkJCSpTpoxSUlJkb2+vuXPnqlWrVoZc781GjRqlxMREVa1aVfb29kpLS9OkSZPUq1cvi3Zz587Vxx9/rIyMDCUnJ+uBBx7IcYmD20lNTdWHH36oihUrSpKGDh2qiRMnSpISExOVkJCgJ554wry/WrVq5mNdXFxUrFgx+fv7Z+m3Z8+e6t+/v021TJo0Sd27d7f42meOYS8vL9nb28vV1TXb8+WkZ8+e+t///qemTZsqIyND169f1wsvvGCxpIAkBQQE6Nq1azp79qwCAwNtOsfNmOEKAAAAAACKtH///VclSpTIdl/t2rXNv/fz8zM/hn7ztvPnz0uSjhw5ouTkZLVq1UouLi7mX4sXL77j8gCHDx9Wjx49VKFCBbm5uZlf3pU5G7Vt27bm/mrUqGHVdSUlJSksLEzVqlWTh4eHXFxcdPDgwSwzXG+9RkmqVatWlm2Z15mWlqaIiAjVqlVLXl5ecnFx0dq1a839Ll261OL6t2zZYlW9t5ozZ47q168vHx8fubi4aP78+Vlqr1Gjhuzs/n+85efnZ1G7vb29vL29zbVL0hdffKEmTZrI399fLi4uGjNmjLnfuLg4i9onT55sPs7V1VV79+7Vzp07NWnSJI0YMUIbN27M1bVJN2ZW33yupUuXSpKWLVumpUuXKioqSrt379aiRYv07rvvatGiRRbH9+rVS3v37jXPgK1UqZJat26ty5cvm+9NZt85zY51dnY2h6mSVLp0afP98vLyUr9+/dSmTRt16NBBs2fPtnpmaYMGDWy6H5K0d+9etWjRwubj7mTjxo2aPHmy5s6dq927d2v58uVavXq1IiIiLNo5OTlJksUyIbnBDFcAAAAAAFCklSpVShcvXsx2n4ODg/n3JpPJ4nPmtszH/pOSkiTdeCy6TJkyFu0cHR1zrKFDhw4KDAzUggULFBAQoPT0dNWsWVPXrl2TJH388cf6999/s9SUk7CwMK1bt07vvvuuKlWqJCcnJz355JPmPm93jbfblnmd77zzjmbPnq1Zs2apVq1aKlmypIYNG2but2PHjhaPtt96L6wRHR2tsLAwTZ8+XSEhIXJ1ddU777yjHTt23Lb2zFpz+hpt375dvXr10oQJE9SmTRu5u7srOjpa06dPl3RjhuPevXvNx2bOSJYkOzs7VapUSZJUt25dHTx4UFOmTFGzZs1svj7pRiB587kyg+3XXntNo0aNMs/IrVWrlk6ePKkpU6YoNDTU3N7d3d1cT6VKlfTJJ5+odOnS+uKLL/Tcc88pJibGvDZtZpCYnezuV+byEZK0cOFCvfzyy/ruu+/0xRdfaMyYMVq3bp3FmsfZKVmypMVnOzs7i36lrC/TyqnOvBg7dqz69Omj5557TtKNe3rlyhUNHDhQb775pjm0v3DhgiTJx8cnT+cjcAUAAAAAAEVavXr1DHkzefXq1eXo6Ki4uDg9+uij2bYpXry4pBuzRDP9888/OnTokBYsWKD//Oc/kpTlZUy5CS1//PFH9evXz/zSoaSkJJ04ccLmfrLrt1OnTurdu7ekG0Hsn3/+qerVq0u6MRPU1dU1z+do3LixBg8ebN6W15eISTdenhQYGKg333zTvO3kyZPm3xcrVswcYt5Jenp6ntb6dHJyyvZcycnJFrN2pRszde+0nq+9vb0kmYP5vDwSf6t69eqpXr16Gj16tEJCQhQVFaWHH35YxYsXtxjLOfHx8dHly5d15coVcxh7c+As3ZhtHRsbe9ulCGw5381ud08lWYTA+/fvV9myZVWqVCmbz3EzAlcAAAAAAFCktWnTRqNHj9bFixfl6emZ635cXV0VFham4cOHKz09XU2bNlVCQoJ+/PFHubm5KTQ0VIGBgTKZTFq1apXatWsnJycneXp6ytvbW/Pnz1fp0qUVFxenUaNG5fm6KleurOXLl6tDhw4ymUwaO3Zsnl7CdXO/X331lbZt2yZPT0/NmDFD586dMweuOfn777+zhGylS5fO9hyLFy/W2rVrFRwcrCVLlmjnzp0KDg7Oc+1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXm7dlXmtSUpL52osXL57jverQoYMmTZqk8uXLq0aNGtqzZ49mzJiR5cVTycnJOnv2rCTp3LlzioiIUIkSJdS6dWtbbkmOjh8/rvnz56tjx44KCAjQoUOHdPjwYfXt21fSjTVwjx8/rr1796ps2bJydXW97WzuRo0aydnZWW+88YZefvll7dixQ5GRkRZtxo8frxYtWqhixYrq3r27rl+/rpiYGI0cOdJ8vs2bN6t79+5ydHS0Ohjt0KGDZsyYoXr16qlRo0Y6cuSIxo4dqw4dOpiDV+nGMg9G3D8CVwAAAAAAkL8qGxcA5YdatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/OU6ZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9997L9aPqmTJDusaNG6tUqVIaOXKkEhMT89SndOOlU8eOHVObNm3k7OysgQMHqnPnzkpISLjjsVFRUYqKirLYFhERYZ4tm2nQoEHas2ePnnnmGZlMJvXo0UODBw/WmjVr8lR7x44dNXz4cA0dOlQpKSlq3769xo4dq/Dw8ByPu3LligYPHqxTp07JyclJVatW1WeffaZnnnnG3CY+Pj7LGrP16tUz/37Xrl2KiopSYGBgjjON33//fY0dO1aDBw/W+fPnFRAQoEGDBmncuHEW7RYsWKAFCxZIkjw9PVW7dm3FxMSoSpUqVt6NO3N2dtYff/yhRYsW6Z9//lHp0qU1ZMgQ8/dJt27dtHz5cjVv3lyXLl3SwoUL1a9fv2z78vLy0meffabXXntNCxYsUIsWLRQeHq6BAwea2zRr1kxffvmlIiIiNHXqVLm5uemRRx4x7584caIGDRpkDr4zZ6eaTKYczz1mzBiZTCaNGTNGp0+flo+PjznYznT16lV98803+u677/J41yRTxq2LJxRBiYmJcnd3V0JCgtzc3Aq6HJvMvjjbpvaveL6ST5XYJjU1VTExMWrXrp3Va8+g6MrPcZ5w05sPreE+frzVbRnnsAXjHEUB4xxFAeMc1irM/w69natXr+r48eMKDg6+7Quo7mWrV6/Wa6+9pv3792d59BjAvev48eN64IEHdODAAVWuXDnX/cybN08rVqzQ999/n+1+W/6MY4YrAAAAAAAo8tq3b6/Dhw/r9OnTKleuXEGXA8BKMTExGjhwYJ7CVunGy8Pef/99Q2oicAUAAAAAAJA0bNiwgi4BgI2GDBliSD/PPfecIf1IEnPkAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAgCLvn3/+ka+vr06cOFHQpVht48aNMplMunTpkiQpMjJSHh4eBVpTbp04cUImk0l79+4t6FJQBF27dk1BQUH65ZdfDOmvmCG9AAAAAAAA3Ebbfbvu2rnW1K6fq+MmTZqkTp06KSgoyNiCbrFx40Y1b95cFy9eLLThaG41a9ZMdevW1axZswq6FJstX75ckydP1pEjR5SamqrKlSvr1VdfVZ8+fXI8Zt68edq7d69SUlJUo0YNhYeHq02bNnc814cffqhdu3bpwoUL2rNnj+rWrWvRJigoSCdPnpQk2dnZyc/PT23bttW7774rT0/P2/YdGRmpYcOGmUP6vOrXr58uXbqkb775xpD+7sRkMmnFihXq3LlzrvuYOnWqRo8erVdeecU8FosXL66wsDCNHDlSsbGxea6TGa4AAAAAAKBIS05O1ieffKIBAwYUdCm4R3l5eenNN9/U9u3btW/fPvXv31/9+/fX2rVrb3vM5s2b1apVK8XExGjXrl1q3ry5OnTooD179uR4ritXrqhp06Z6++23c2w3ceJExcfHKy4uTkuXLtXmzZv18ssv5+r68ltqampBlyBJ2rlzpz766CPVrl07y75evXpp69at+v333/N8HgJXAAAAAABQpMXExMjR0VEPP/yweVvm4/pr165VvXr15OTkpMcee0znz5/XmjVrVK1aNbm5ualnz55KTk42H5eenq4pU6YoODhYTk5OqlOnjr766itJNx6bb968uSTJ09NTJpNJ/fr1kyR99913atq0qTw8POTt7a0nnnhCR48ezdN1HT16VJ06dZKfn59cXFzUsGFDrV+/3qJNUFCQ3nrrLfXt21cuLi4KDAzUypUr9ffff6tTp05ycXFR7dq1LR61/ueff9SjRw+VKVNGzs7OqlWrlj7//PM81XqrtLQ0DRgwwHwfq1SpotmzZ1u06devnzp37qzJkyfLz89PHh4emjhxoq5fv67XXntNXl5eKlu2rBYuXGhx3MiRI/XAAw/I2dlZFSpU0NixY+8YCDZr1kxdunRRtWrVVLFiRb3yyiuqXbu2tm7dettjZs2apddff10NGzZU5cqVNXnyZFWuXFnffvttjufq06ePxo0bp5YtW+bYztXVVf7+/ipTpoyaN2+u0NBQ7d69O8djbhUeHq66detqyZIlCgoKkru7u7p3767Lly+b23z11VeqVauWnJyc5O3trZYtW+rKlSsKDw/XokWL9N///lcmk0kmk0kbN240Lw/xxRdf6NFHH1WJEiW0dOlS87luvUe3zir/9NNPVaNGDTk6Oqp06dIaOnSoJJnbdenSRSaTyebZ6ElJSerVq5cWLFiQ7SxgT09PNWnSRNHR0Tb1mx0CVwAAAAAAUKRt2bJF9etnvxRBeHi4PvjgA23btk1//fWXnn76ac2aNUtRUVFavXq1vv/+e73//vvm9lOmTNHixYv14Ycf6vfff9fw4cPVu3dvbdq0SeXKldPXX38tSTp06JDi4+PNIeKVK1c0YsQI/fLLL4qNjZWdnZ26dOmi9PT0XF9XUlKS2rVrp9jYWO3Zs0ePP/64OnTooLi4OIt2M2fOVJMmTbRnzx61b99effr0Ud++fdW7d2/t3r1bFStWVN++fZWRkSFJunr1qurXr6/Vq1dr//79GjhwoPr06aOff/4517XeKj09XWXLltWXX36pAwcOaNy4cXrjjTe0bNkyi3Y//PCDzpw5o82bN2vGjBkaP368nnjiCXl6emrHjh164YUXNGjQIJ06dcp8jKurqyIjI3XgwAHNnj1bCxYs0MyZM62uLSMjQ7GxsTp06JAeeeQRm67p8uXL8vLysvoYa50+fVrffvutGjVqZPOxR48e1TfffKNVq1Zp1apV2rRpk6ZOnSpJio+PV48ePfTss8/q4MGD2rhxo7p27aqMjAyFhYXp6aef1uOPP674+HjFx8ercePG5n5HjRqlV155RQcPHrzjMgqZ5s2bpyFDhmjgwIH67bfftHLlSlWqVEnSjdmpkrRw4ULFx8ebP1tryJAhat++fY5B9kMPPaQtW7bY1G92WMMVAAAAAAAUaSdPnlRAQEC2+9566y01adJEkjRgwACNHj1aR48eVYUKFSRJTz75pDZs2KCRI0cqJSVFkydP1vr16xUSEiJJqlChgrZu3aqPPvpIjz76qDls8/X1tVjDtVu3bhbn/fTTT+Xj46MDBw6oZs2aubquOnXqqE6dOubPERERWrFihVauXGmeNShJ7dq106BBgyRJ48aN07x589SwYUM99dRTkm7MCA0JCdG5c+fMMyrDwsLMx7/00ktau3atli1bpoceeihXtd7KwcFBEyZMMH8ODg7W9u3btWzZMj399NPm7V5eXnrvvfdkZ2enKlWqaNq0aUpOTtYbb7whSRo9erSmTp2qrVu3qnv37pKkMWPGmI8PCgpSWFiYoqOj9frrr+dYU0JCgsqUKaOUlBTZ29tr7ty5atWqldXX9O677yopKcmi/rwYOXKkxowZo7S0NF29elWNGjXSjBkzbO4nPT1dkZGRcnV1lXRjhm1sbKwmTZqk+Ph4Xb9+XV27dlVgYKAkqVatWuZjnZyclJKSIn9//yz9Dhs2TF27drWplrfeekuvvvqqXnnlFfO2hg0bSpJ8fHwkSR4eHtmeLyfR0dHavXv3HUPagIAA89q4ecEMVwAAAAAAUKT9+++/KlGiRLb7bl7r0c/Pz/wY+s3bzp8/L0k6cuSIkpOT1apVK7m4uJh/LV68+I7LAxw+fFg9evRQhQoV5ObmZn5cOnM2atu2bc391ahRw6rrSkpKUlhYmKpVqyYPDw+5uLjo4MGDWWa43nqNkmWolrkt8zrT0tIUERGhWrVqycvLSy4uLlq7dq2536VLl1pcf25nDM6ZM0f169eXj4+PXFxcNH/+/Cy116hRQ3Z2/z/e8vPzs6jd3t5e3t7e5tol6YsvvlCTJk3k7+8vFxcXjRkzxtxvXFycRe2TJ082H+fq6qq9e/dq586dmjRpkkaMGKGNGzdadS1RUVGaMGGCli1bJl9fX0l5v0+vvfaa9u7dq3379plf9NS+fXulpaVJkkXfL7zwwm37CQoKMoetklS6dGnz/apTp45atGihWrVq6amnntKCBQt08eJFq+pr0KCBTddz/vx5nTlzRi1atLDpuDv566+/9Morr2jp0qW3/T7P5OTkZLFESG4xwxUAAAAAABRppUqVum2I5ODgYP69yWSy+Jy5LfOx/6SkJEnS6tWrVaZMGYt2jo6OOdbQoUMHBQYGasGCBQoICFB6erpq1qypa9euSZI+/vhj/fvvv1lqyklYWJjWrVund999V5UqVZKTk5OefPJJc5+3u8bbbcu8znfeeUezZ8/WrFmzVKtWLZUsWVLDhg0z99uxY0eLR9tvvRfWiI6OVlhYmKZPn66QkBC5urrqnXfe0Y4dO25be2atOX2Ntm/frl69emnChAlq06aN3N3dFR0drenTp0u6McNx79695mNvfvzfzs7O/Hh73bp1dfDgQU2ZMkXNmjW747U899xz+vLLLy0eZ8/rfSpVqpS5nsqVK2vWrFkKCQnRhg0b1LJlS4vrcHNzu20/Od0ve3t7rVu3Ttu2bTMvn/Hmm29qx44dCg4OzrG+kiVLWny2s7MzL0uR6ea1c52cnHLsL7d27dql8+fP68EHHzRvS0tL0+bNm/XBBx+YZyxL0oULF8wzafOCwBW3lXDT1H1ruI8fn0+VAAAAAACQf+rVq6fPPvssz/1Ur15djo6OiouL06OPPpptm+LFi0uSeRaidOMlVIcOHdKCBQv0n//8R5KyvIwpN6Hljz/+qH79+qlLly6SbgTCJ06csLmf7Prt1KmTevfuLelGEPvnn3+qevXqkm7MBL15xmRuz9G4cWMNHjzYvC2vLxGTpG3btikwMFBvvvmmedvNj5AXK1bMHGLeSXp6ulJSUnJs8/nnn+vZZ59VdHS02rdvb7HPiPt0s8zQMDOYt/Y67sRkMqlJkyZq0qSJxo0bp8DAQK1YsUIjRoxQ8eLFLcZyTnx8fHT27FllZGSYQ/ybQ2FXV1cFBQUpNjbW/HK5Wzk4OFh9vkwtWrTQb7/9ZrGtf//+qlq1qkaOHGm+b5K0f/9+1atXz6b+s0PgCgAAAAAAirQ2bdpo9OjRunjxYrZvL7eWq6urwsLCNHz4cKWnp6tp06ZKSEjQjz/+KDc3N4WGhiowMFAmk0mrVq1Su3bt5OTkJE9PT3l7e2v+/PkqXbq04uLiNGrUqDxfV+XKlbV8+XJ16NBBJpNJY8eOzdNLuG7u96uvvtK2bdvk6empGTNm6Ny5c+bANSd///23Rcgm3XiEPbtzLF68WGvXrlVwcLCWLFminTt33nFWpTW1x8XFKTo6Wg0bNtTq1au1YsWKOx43ZcoUNWjQQBUrVlRKSopiYmK0ZMkSzZs3z9xm9OjROn36tBYvXizpxjICoaGhmj17tho1aqSzZ89KujGT093d/bbnunDhguLi4nTmzBlJN16wJkn+/v4Wa5devnzZHGD+9ddfev311+Xj42Px4qq82rFjh2JjY9W6dWv5+vpqx44d+vvvv1WtWjVJN5YjWLt2rQ4dOiRvb+8cr6tZs2b6+++/NW3aND355JP67rvvtGbNGovZt+Hh4XrhhRfk6+urtm3b6vLly/rxxx/10ksvmc8XGxurJk2ayNHR0arvV1dX1yzrIJcsWVLe3t5Ztm/ZskURERFW35/bIXAFAAAAAAD5ak3t+gVdQo5q1aqlBx98UMuWLTO/PCq3IiIi5OPjoylTpujYsWPy8PDQgw8+aH6JU5kyZTRhwgSNGjVK/fv3V9++fRUZGano6Gi9/PLLqlmzpqpUqaL33nvvjo+q38mMGTP07LPPqnHjxipVqpRGjhypxMTEPPUp3Xjp1LFjx9SmTRs5Oztr4MCB6ty5sxISEu54bFRUlKKioiy2RUREmGfLZho0aJD27NmjZ555RiaTST169NDgwYO1Zs2aPNXesWNHDR8+XEOHDlVKSorat2+vsWPHKjw8PMfjrly5osGDB+vUqVNycnJS1apV9dlnn+mZZ54xt4mPj7dYY3b+/Pm6fv26hgwZoiFDhpi3h4aGKjIy8rbnWrlypfr372/+nPmyr/Hjx1vUOW7cOI0bN07SjdmjDRs21Pfffy9vb29rboVV3NzctHnzZs2aNUuJiYkKDAzU9OnT1bZtW0nS888/r40bN6pBgwZKSkrShg0bzOsP36patWqaO3euJk+erIiICHXr1k1hYWGaP3++uU1oaKiuXr2qmTNnKiwsTKVKldKTTz5p3j99+nSNGDFCCxYsUJkyZXTixAmdOHFCwcHB2rBhQ56+Z7Zv366EhASL8+WWKePWxROKoMTERLm7uyshISHHNS3uRbMvzrap/Suer9y50f/JzyUFUlNTFRMTo3bt2lm99gyKLsY5igLGOYoCxjmKAsY5rFWY/x16O1evXtXx48cVHBx8xxfT3ItWr16t1157Tfv377d4CROAe9uGDRvUtWtXHTt2LE8z1J955hnVqVPH/MORW9nyZxwzXAEAAAAAQJHXvn17HT58WKdPn1a5cuUKuhwAVoqJidEbb7yRp7D12rVrqlWrloYPH25ITQSuAAAAAAAAkoYNG1bQJQCw0TvvvJPnPooXL64xY8YYUM0NzJEHAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAirx//vlHvr6+OnHiREGXYrWNGzfKZDLp0qVLkqTIyEh5eHgUaE25deLECZlMJu3du7egS0ER9fDDD+vrr782pK9ihvQCAAAAAABwG6kTXr1r53IYPz1Xx02aNEmdOnVSUFCQsQXdYuPGjWrevLkuXrxYaMPR3GrWrJnq1q2rWbNmFXQpNlu+fLkmT56sI0eOKDU1VZUrV9arr76qPn365HjMvHnztHfvXqWkpKhGjRoKDw9XmzZt7niuDz/8ULt27dKFCxe0Z88e1a1b16JNUFCQTp48KUmys7OTn5+f2rZtq3fffVeenp637TsyMlLDhg0zh/R51a9fP126dEnffPONIf3diclk0ooVK9S5c2ebjktLS1N4eLg+++wznT17VgEBAerXr5/GjBkjk8kkSRozZoyGDx+uLl26yM4ub3NUmeEKAAAAAACKtOTkZH3yyScaMGBAQZeCe5SXl5fefPNNbd++Xfv27VP//v3Vv39/rV279rbHbN68Wa1atVJMTIx27dql5s2bq0OHDtqzZ0+O57py5YqaNm2qt99+O8d2EydOVHx8vOLi4rR06VJt3rxZL7/8cq6uL7+lpqYW6PnffvttzZs3Tx988IEOHjyot99+W9OmTdP7779vbtO2bVtdvnxZa9asyfP5CFwBAAAAAECRFhMTI0dHRz388MPmbZmP669du1b16tWTk5OTHnvsMZ0/f15r1qxRtWrV5Obmpp49eyo5Odl8XHp6uqZMmaLg4GA5OTmpTp06+uqrryTdeGy+efPmkiRPT0+ZTCb169dPkvTdd9+padOm8vDwkLe3t5544gkdPXo0T9d19OhRderUSX5+fnJxcVHDhg21fv16izZBQUF666231LdvX7m4uCgwMFArV67U33//rU6dOsnFxUW1a9fWL7/8Yj7mn3/+UY8ePVSmTBk5OzurVq1a+vzzz/NU663S0tI0YMAA832sUqWKZs+ebdGmX79+6ty5syZPniw/Pz95eHho4sSJun79ul577TV5eXmpbNmyWrhwocVxI0eO1AMPPCBnZ2dVqFBBY8eOvWMg2KxZM3Xp0kXVqlVTxYoV9corr6h27draunXrbY+ZNWuWXn/9dTVs2FCVK1fW5MmTVblyZX377bc5nqtPnz4aN26cWrZsmWM7V1dX+fv7q0yZMmrevLlCQ0O1e/fuHI+5VXh4uOrWraslS5YoKChI7u7u6t69uy5fvmxu89VXX6lWrVpycnKSt7e3WrZsqStXrig8PFyLFi3Sf//7X5lMJplMJm3cuNG8PMQXX3yhRx99VCVKlNDSpUvN57r1Ht06q/zTTz9VjRo15OjoqNKlS2vo0KGSZG7XpUsXmUwmm2ajb9u2TZ06dVL79u0VFBSkJ598Uq1bt9bPP/9sbmNvb6927dopOjrapnuYHQJXAAAAAABQpG3ZskX169fPdl94eLg++OADbdu2TX/99ZeefvppzZo1S1FRUVq9erW+//57i1lyU6ZM0eLFi/Xhhx/q999/1/Dhw9W7d29t2rRJ5cqVM68ReejQIcXHx5tDxCtXrmjEiBH65ZdfFBsbKzs7O3Xp0kXp6em5vq6kpCS1a9dOsbGx2rNnjx5//HF16NBBcXFxFu1mzpypJk2aaM+ePWrfvr369Omjvn37qnfv3tq9e7cqVqyovn37KiMjQ5J09epV1a9fX6tXr9b+/fs1cOBA9enTxyK8yqv09HSVLVtWX375pQ4cOKBx48bpjTfe0LJlyyza/fDDDzpz5ow2b96sGTNmaPz48XriiSfk6empHTt26IUXXtCgQYN06tQp8zGurq6KjIzUgQMHNHv2bC1YsEAzZ860uraMjAzFxsbq0KFDeuSRR2y6psuXL8vLy8vqY6x1+vRpffvtt2rUqJHNxx49elTffPONVq1apVWrVmnTpk2aOnWqJCk+Pl49evTQs88+q4MHD2rjxo3q2rWrMjIyFBYWpqefflqPP/644uPjFR8fr8aNG5v7HTVqlF555RUdPHjwjssoZJo3b56GDBmigQMH6rffftPKlStVqVIlSdLOnTslSQsXLlR8fLz5szUaN26s2NhY/fnnn5KkX3/9VVu3blXbtm0t2j300EPasmWL1f3eDmu4AgAAAACAIu3kyZMKCAjIdt9bb72lJk2aSJIGDBig0aNH6+jRo6pQoYIk6cknn9SGDRs0cuRIpaSkaPLkyVq/fr1CQkIkSRUqVNDWrVv10Ucf6dFHHzWHbb6+vhZruHbr1s3ivJ9++ql8fHx04MAB1axZM1fXVadOHdWpU8f8OSIiQitWrNDKlSvNswYlqV27dho0aJAkady4cZo3b54aNmyop556StKNGaEhISE6d+6ceUZlWFiY+fiXXnpJa9eu1bJly/TQQw/lqtZbOTg4aMKECebPwcHB2r59u5YtW6ann37avN3Ly0vvvfee7OzsVKVKFU2bNk3Jycl64403JEmjR4/W1KlTtXXrVnXv3l3SjbU6MwUFBSksLEzR0dF6/fXXc6wpISFBZcqUUUpKiuzt7TV37ly1atXK6mt69913lZSUZFF/XowcOVJjxoxRWlqarl69qkaNGmnGjBk295Oenq7IyEi5urpKujHDNjY2VpMmTVJ8fLyuX7+url27KjAwUJJUq1Yt87FOTk5KSUmRv79/ln6HDRumrl272lTLW2+9pVdffVWvvPKKeVvDhg0lST4+PpIkDw+PbM+Xk1GjRikxMVFVq1aVvb290tLSNGnSJPXq1cuiXUBAgP766y+lp6fnaR1XZrgCAAAAAIAi7d9//1WJEiWy3Ve7dm3z7/38/MyPod+87fz585KkI0eOKDk5Wa1atZKLi4v51+LFi++4PMDhw4fVo0cPVahQQW5ububHpTNno7Zt29bcX40aNay6rqSkJIWFhalatWry8PCQi4uLDh48mGWG663XKFmGapnbMq8zLS1NERERqlWrlry8vOTi4qK1a9ea+126dKnF9ed2xuCcOXNUv359+fj4yMXFRfPnz89Se40aNSyCMT8/P4va7e3t5e3tba5dkr744gs1adJE/v7+cnFx0ZgxY8z9xsXFWdQ+efJk83Gurq7au3evdu7cqUmTJmnEiBHauHGjVdcSFRWlCRMmaNmyZfL19ZWU9/v02muvae/evdq3b59iY2MlSe3bt1daWpokWfT9wgsv3LafoKAgc9gqSaVLlzbfrzp16qhFixaqVauWnnrqKS1YsEAXL160qr4GDRrYdD3nz5/XmTNn1KJFC5uOs8ayZcu0dOlSRUVFaffu3Vq0aJHeffddLVq0yKKdk5OT0tPTlZKSkqfzMcMVAAAAAAAUaaVKlbptiOTg4GD+vclksvicuS3zsf+kpCRJ0urVq1WmTBmLdo6OjjnW0KFDBwUGBmrBggUKCAhQenq6atasqWvXrkmSPv74Y/37779ZaspJWFiY1q1bp3fffVeVKlWSk5OTnnzySXOft7vG223LvM533nlHs2fP1qxZs1SrVi2VLFlSw4YNM/fbsWNHi0fbb70X1oiOjlZYWJimT5+ukJAQubq66p133tGOHTtuW3tmrTl9jbZv365evXppwoQJatOmjdzd3RUdHa3p06dLujHDce/eveZjb378387Ozvx4e926dXXw4EFNmTJFzZo1u+O1PPfcc/ryyy8t1mXN630qVaqUuZ7KlStr1qxZCgkJ0YYNG9SyZUuL63Bzc7ttPzndL3t7e61bt07btm0zL5/x5ptvaseOHQoODs6xvpIlS1p8trOzMy9LkenmtXOdnJxy7C8vXnvtNY0aNco8y7lWrVo6efKkpkyZotDQUHO7CxcuqGTJknmuhcAVAAAAAAAUafXq1dNnn32W536qV68uR0dHxcXF6dFHH822TfHixSXJPAtRuvESqkOHDmnBggX6z3/+I0lZXsaUm9Dyxx9/VL9+/dSlSxdJNwLhEydO2NxPdv126tRJvXv3lnQjiP3zzz9VvXp1STdmgt48YzK352jcuLEGDx5s3pbXl4hJN16eFBgYqDfffNO87eTJk+bfFytWzBxi3ok1MyE///xzPfvss4qOjlb79u0t9hlxn25mb28vSeZg3trruBOTyaQmTZqoSZMmGjdunAIDA7VixQqNGDFCxYsXtxjLOfHx8dHZs2eVkZFhDvFvDoVdXV0VFBSk2NhY88vlbuXg4GD1+W6WnJycZYkAe3v7LGsk79+/X/Xq1bO5/1sRuAIAAAAAgCKtTZs2Gj16tC5evChPT89c9+Pq6qqwsDANHz5c6enpatq0qRISEvTjjz/Kzc1NoaGhCgwMlMlk0qpVq9SuXTs5OTnJ09NT3t7emj9/vkqXLq24uDiNGjUqz9dVuXJlLV++XB06dJDJZNLYsWPz9BKum/v96quvtG3bNnl6emrGjBk6d+6cOXDNyd9//20Rskk3HmHP7hyLFy/W2rVrFRwcrCVLlmjnzp13nFVpTe1xcXGKjo5Ww4YNtXr1aq1YseKOx02ZMkUNGjRQxYoVlZKSopiYGC1ZskTz5s0ztxk9erROnz6txYsXS7qxjEBoaKhmz56tRo0a6ezZs5JuzOR0d3e/7bkuXLiguLg4nTlzRtKNF6xJkr+/v8XapZcvXzYHmH/99Zdef/11+fj4WLy4Kq927Nih2NhYtW7dWr6+vtqxY4f+/vtvVatWTdKN5QjWrl2rQ4cOydvbO8fratasmf7++29NmzZNTz75pL777jutWbPGYvZteHi4XnjhBfn6+qpt27a6fPmyfvzxR7300kvm88XGxqpJkyZydHS0+vu1Q4cOmjRpksqXL68aNWpoz549mjFjhp599lmLdlu2bFHr1q1tvU1ZELgCAAAAAIB85TB+ekGXkKNatWrpwQcf1LJly8wvj8qtiIgI+fj4aMqUKTp27Jg8PDz04IMPml/iVKZMGU2YMEGjRo1S//791bdvX0VGRio6Olovv/yyatasqSpVqui9996746Pqd5IZKDVu3FilSpXSyJEjlZiYmKc+pRsvnTp27JjatGkjZ2dnDRw4UJ07d1ZCQsIdj42KilJUVJTFtoiICPNs2UyDBg3Snj179Mwzz8hkMqlHjx4aPHiw1qxZk6faO3bsqOHDh2vo0KFKSUlR+/btNXbsWIWHh+d43JUrVzR48GCdOnVKTk5Oqlq1qj777DM988wz5jbx8fEWa8zOnz9f169f15AhQzRkyBDz9tDQUEVGRt72XCtXrlT//v3NnzMfgx8/frxFnePGjdO4ceMk3Zg92rBhQ33//ffy9va25lZYxc3NTZs3b9asWbOUmJiowMBATZ8+XW3btpUkPf/889q4caMaNGigpKQkbdiwwbz+8K2qVaumuXPnavLkyYqIiFC3bt0UFham+fPnm9uEhobq6tWrmjlzpsLCwlSqVCk9+eST5v3Tp0/XiBEjtGDBApUpU0YnTpzQiRMnFBwcrA0bNtz2e+b999/X2LFjNXjwYJ0/f14BAQEaNGiQ+f5J0unTp7Vt2zZDZrubMm5dPOEumjdvnubNm2eezl6jRg2NGzfO/EW7evWqXn31VUVHRyslJUVt2rTR3LlzzYs1SzcWM37xxRe1YcMGubi4KDQ0VFOmTFGxYtZnyYmJiXJ3d1dCQkKOa1rci2ZfnG1T+1c8X7lzo/+TcNPbAK3hPn681W1TU1MVExOjdu3aWb32DIouxjmKAsY5igLGOYoCxjmsVZj/HXo7V69e1fHjxxUcHHzbF1Ddy1avXq3XXntN+/fvz9PbyQHcXRs2bFDXrl117NixPM1QHzlypC5evGgRAN/Mlj/jCvRPkLJly2rq1KnatWuXfvnlFz322GPq1KmTfv/9d0nS8OHD9e233+rLL7/Upk2bdObMGXXt2tV8fFpamtq3b69r165p27ZtWrRokSIjIy3SaQAAAAAAgDtp3769Bg4cqNOnTxd0KQBsEBMTozfeeCNPYask+fr6KiIiwpCaCnRJgQ4dOlh8njRpkubNm6effvpJZcuW1SeffKKoqCg99thjkqSFCxeqWrVq+umnn/Twww/r+++/14EDB7R+/Xr5+fmpbt26ioiI0MiRIxUeHm5eiPpWKSkpFosaZ06nT01NtXg7WmFgum6yqb0t13fdxp/o2dJ3ZtvCdr9RMBjnKAoY5ygKGOcoChjnsBb3+t40bNiwgi4BgI3eeecdQ/p59dVXDelHKuAlBW6WlpamL7/8UqGhodqzZ4/Onj2rFi1a6OLFi/Lw8DC3CwwM1LBhwzR8+HCNGzdOK1eutFhs+fjx46pQoYJ2795927eKhYeHa0I2j+NERUXJ2dnZ6EsDAAAAAMBCcnKyevbsyZICAFBI2PJnXIG/NOu3335TSEiIrl69KhcXF61YsULVq1fX3r17Vbx4cYuwVZL8/PzMb3U7e/asxXqumfsz993O6NGjNWLECPPnxMRElStXTq1bty50f9HNuzTvzo1u8qLHi1a3TZw61aa+3Wx4g2JqaqrWrVunVq1asUYU7ohxjqKAcY6igHGOooBxDmsZ8eIiAMC9qcAD1ypVqmjv3r1KSEjQV199pdDQUG3atClfz+no6ChHR8cs2x0cHArd/1xkFLNtgrIt11csPT3f+r75mMJ2z3H3Mc5RFDDOURQwzlEUMM5hrfv5Pt8jD9ICgKFs+bOtwF+7V7x4cVWqVEn169fXlClTVKdOHc2ePVv+/v66du2aLl26ZNH+3Llz8vf3lyT5+/vr3LlzWfZn7gMAAAAAAHdHZoicnJxcwJUAgPGuXbsmSbK3t79j2wKf4Xqr9PR0paSkqH79+nJwcFBsbKy6desmSTp06JDi4uIUEhIiSQoJCdGkSZN0/vx5+fr6SpLWrVsnNzc3Va9evcCuAQAAAACAosbe3l4eHh46f/68JMnZ2Vkmk20vkgOAe1F6err+/vtvOTs7q1ixO8epBRq4jh49Wm3btlX58uV1+fJlRUVFaePGjVq7dq3c3d01YMAAjRgxQl5eXnJzc9NLL72kkJAQPfzww5Kk1q1bq3r16urTp4+mTZums2fPasyYMRoyZEi2SwYAAAAAAID8k/m0aWboCgD3Czs7O5UvX96qHyQVaOB6/vx59e3bV/Hx8XJ3d1ft2rW1du1atWrVSpI0c+ZM2dnZqVu3bkpJSVGbNm00d+5c8/H29vZatWqVXnzxRYWEhKhkyZIKDQ3VxIkTC+qSAAAAAAAoskwmk0qXLi1fX1+lpqYWdDkAYJjixYvLzs661VkLNHD95JNPctxfokQJzZkzR3PmzLltm8DAQMXExBhdGgAAAAAAyCV7e3ur1jkEgPtRgb80CwAAAAAAAADuFwSuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpJitBxw/flxbtmzRyZMnlZycLB8fH9WrV08hISEqUaJEftQIAAAAAAAAAIWC1YHr0qVLNXv2bP3yyy/y8/NTQECAnJycdOHCBR09elQlSpRQr169NHLkSAUGBuZnzQAAAAAAAABwT7IqcK1Xr56KFy+ufv366euvv1a5cuUs9qekpGj79u2Kjo5WgwYNNHfuXD311FP5UjAAAAAAAAAA3KusClynTp2qNm3a3Ha/o6OjmjVrpmbNmmnSpEk6ceKEUfUBAAAAAAAAQKFhVeCaU9h6K29vb3l7e+e6IAAAAAAAAAAorOxsPSAmJkZr167Nsn3t2rVas2aNIUUBAAAAAAAAQGFkc+A6atQopaWlZdmekZGhUaNGGVIUAAAAAAAAABRGNgeuhw8fVvXq1bNsr1q1qo4cOWJIUQAAAAAAAABQGNkcuLq7u+vYsWNZth85ckQlS5Y0pCgAAAAAAAAAKIxsDlw7deqkYcOG6ejRo+ZtR44c0auvvqqOHTsaWhwAAAAAAAAAFCY2B67Tpk1TyZIlVbVqVQUHBys4OFjVqlWTt7e33n333fyoEQAAAAAAAAAKhWK2HuDu7q5t27Zp3bp1+vXXX+Xk5KTatWvrkUceyY/6AAAAAAAAAKDQsDlwlSSTyaTWrVvrkUcekaOjo0wmk9F1AQAAAAAAAEChY/OSAunp6YqIiFCZMmXk4uKi48ePS5LGjh2rTz75xPACAQAAAAAAAKCwsDlwfeuttxQZGalp06apePHi5u01a9bUxx9/bGhxAAAAAAAAAFCY2By4Ll68WPPnz1evXr1kb29v3l6nTh398ccfhhYHAAAAAAAAAIWJzYHr6dOnValSpSzb09PTlZqaakhRAAAAAAAAAFAY2Ry4Vq9eXVu2bMmy/auvvlK9evUMKQoAAAAAAAAACqNith4wbtw4hYaG6vTp00pPT9fy5ct16NAhLV68WKtWrcqPGgEAAAAAAACgULB5hmunTp307bffav369SpZsqTGjRungwcP6ttvv1WrVq3yo0YAAAAAAAAAKBRsnuEqSf/5z3+0bt06o2sBAAAAAAAAgELN5hmuf/31l06dOmX+/PPPP2vYsGGaP3++oYUBAAAAAAAAQGFjc+Das2dPbdiwQZJ09uxZtWzZUj///LPefPNNTZw40fACAQAAAAAAAKCwsDlw3b9/vx566CFJ0rJly1SrVi1t27ZNS5cuVWRkpE19TZkyRQ0bNpSrq6t8fX3VuXNnHTp0yKJNs2bNZDKZLH698MILFm3i4uLUvn17OTs7y9fXV6+99pquX79u66UBAAAAAAAAQJ7YvIZramqqHB0dJUnr169Xx44dJUlVq1ZVfHy8TX1t2rRJQ4YMUcOGDXX9+nW98cYbat26tQ4cOKCSJUua2z3//PMWs2ednZ3Nv09LS1P79u3l7++vbdu2KT4+Xn379pWDg4MmT55s6+UBAAAAAAAAQK7ZHLjWqFFDH374odq3b69169YpIiJCknTmzBl5e3vb1Nd3331n8TkyMlK+vr7atWuXHnnkEfN2Z2dn+fv7Z9vH999/rwMHDmj9+vXy8/NT3bp1FRERoZEjRyo8PFzFixfPckxKSopSUlLMnxMTEyXdCJNTU1NtuoaCZrpusqm9Ldd33c62CdC29J3ZtrDdbxQMxjmKAsY5igLGOYoCxjmsxb0GgPuXKSMjI8OWAzZu3KguXbooMTFRoaGh+vTTTyVJb7zxhv744w8tX74818UcOXJElStX1m+//aaaNWtKurGkwO+//66MjAz5+/urQ4cOGjt2rHmW67hx47Ry5Urt3bvX3M/x48dVoUIF7d69W/Xq1ctynvDwcE2YMCHL9qioKIvZswAAAAAA5Ifk5GT17NlTCQkJcnNzK+hyAAAGsjlwlW48xp+YmChPT0/zthMnTpjXUM2N9PR0dezYUZcuXdLWrVvN2+fPn6/AwEAFBARo3759GjlypB566CFzsDtw4ECdPHlSa9euNR+TnJyskiVLKiYmRm3bts1yruxmuJYrV07/+9//Ct1fdPMuzbOp/YseL1rdNnHqVJv6dhs1yuq2qampWrdunVq1aiUHBwebzoOih3GOooBxjqKAcY6igHEOayUmJqpUqVIErgBwH7J5SQFJsre3twhbJSkoKChPhQwZMkT79++3CFulG4Fqplq1aql06dJq0aKFjh49qooVK+bqXI6OjuZ1aG/m4OBQ6P7nIqOYbXm5LddXLD093/q++ZjCds9x9zHOURQwzlEUMM5RFDDOYS3uMwDcv6xaBOjxxx/XTz/9dMd2ly9f1ttvv605c+bYVMTQoUO1atUqbdiwQWXLls2xbaNGjSTdWH5Akvz9/XXu3DmLNpmfb7fuKwAAAAAAAADkB6tmuD711FPq1q2b3N3d1aFDBzVo0EABAQEqUaKELl68qAMHDmjr1q2KiYlR+/bt9c4771h18oyMDL300ktasWKFNm7cqODg4Dsek7lWa+nSpSVJISEhmjRpks6fP29ezmDdunVyc3NT9erVraoDAAAAAAAAAIxgVeA6YMAA9e7dW19++aW++OILzZ8/XwkJCZIkk8mk6tWrq02bNtq5c6eqVatm9cmHDBmiqKgo/fe//5Wrq6vOnj0rSXJ3d5eTk5OOHj2qqKgotWvXTt7e3tq3b5+GDx+uRx55RLVr15YktW7dWtWrV1efPn00bdo0nT17VmPGjNGQIUOyXTYAAAAAAAAAAPKL1Wu4Ojo6qnfv3urdu7ckKSEhQf/++6+8vb1zvfbMvHk3FpRv1qyZxfaFCxeqX79+Kl68uNavX69Zs2bpypUrKleunLp166YxY8aY29rb22vVqlV68cUXFRISopIlSyo0NFQTJ07MVU0AAAAAAAAAkFu5emmWdGMWqru7e55OnpGR84Ly5cqV06ZNm+7YT2BgoGJiYvJUCwAAAAAAAADklVUvzQIAAAAAAAAA3BmBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABslV4Hrp0iV9/PHHGj16tC5cuCBJ2r17t06fPm1ocQAAAAAAAABQmBSz9YB9+/apZcuWcnd314kTJ/T888/Ly8tLy5cvV1xcnBYvXpwfdQIAAAAAAADAPc/mGa4jRoxQv379dPjwYZUoUcK8vV27dtq8ebOhxQEAAAAAAABAYWJz4Lpz504NGjQoy/YyZcro7NmzhhQFAAAAAAAAAIWRzYGro6OjEhMTs2z/888/5ePjY0hRAAAAAAAAAFAY2Ry4duzYURMnTlRqaqokyWQyKS4uTiNHjlS3bt0MLxAAAAAAAAAACgubA9fp06crKSlJvr6++vfff/Xoo4+qUqVKcnV11aRJk/KjRgAAAAAAAAAoFIrZeoC7u7vWrVunrVu3at++fUpKStKDDz6oli1b5kd9AAAAAAAAAFBo2By4ZmratKmaNm1qZC0AAAAAAAAAUKjlKnDduXOnNmzYoPPnzys9Pd1i34wZMwwpDAAAAAAAAAAKG5sD18mTJ2vMmDGqUqWK/Pz8ZDKZzPtu/j0AAAAAoHA5EXVC9rK/Y7sKoRXuQjUAABRONgeus2fP1qeffqp+/frlQzkAAAAAAAAAUHjZ2XyAnZ2aNGmSH7UAAAAAAAAAQKFmc+A6fPhwzZkzJz9qAQAAAAAAAIBCzeYlBcLCwtS+fXtVrFhR1atXl4ODg8X+5cuXG1YcAAAAAAAAABQmNgeuL7/8sjZs2KDmzZvL29ubF2UBAAAAAAAAwP+xOXBdtGiRvv76a7Vv3z4/6gEAAAAAAACAQsvmNVy9vLxUsWLF/KgFAAAAAAAAAAo1mwPX8PBwjR8/XsnJyflRDwAAAAAAAAAUWjYvKfDee+/p6NGj8vPzU1BQUJaXZu3evduw4gAAAIB7xYmoE7KX/R3bVQitcBeqAQAAwL3K5sC1c+fO+VAGAAAAAAAAABR+Ngeu48ePz486AAAAAAAAAKDQs3kNVwAAAAAAAABA9qya4erl5aU///xTpUqVkqenp0wm023bXrhwwbDiAAAAAAAAAKAwsSpwnTlzplxdXSVJs2bNys96AAAAAAAAAKDQsipwDQ0N1WOPPably5crNDQ0v2sCAAAAAAAAgELJ6jVcN27cqGvXruVnLQAAAAAAAABQqFk1wxUAACAnJ6JOyF72d2xXIbTCXagGAAAAAAqOTYHrgQMHdPbs2Rzb1K5dO08FAQAAAAAAAEBhZVPg2qJFC2VkZGTZbjKZlJGRIZPJpLS0NMOKAwAAAAAAAIDCxKbAdceOHfLx8cmvWgAAAAAAAACgULMpcC1fvrx8fX3zqxYAAAAAAAAAKNTsCroAAAAAAAAAALhfWB24PvrooypevHh+1gIAAAAAAAAAhZrVSwps2LAhP+sAAAAAAAAAgEKPJQUAAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGMTql2ZlGjFiRLbbTSaTSpQooUqVKqlTp07y8vLKc3EAAAAAAAAAUJjYHLju2bNHu3fvVlpamqpUqSJJ+vPPP2Vvb6+qVatq7ty5evXVV7V161ZVr17d8IIBAAAAAAAA4F5l85ICnTp1UsuWLXXmzBnt2rVLu3bt0qlTp9SqVSv16NFDp0+f1iOPPKLhw4fnR70AAAAAAAAAcM+yOXB95513FBERITc3N/M2d3d3hYeHa9q0aXJ2dta4ceO0a9cuQwsFAAAAAAAAgHudzYFrQkKCzp8/n2X733//rcTEREmSh4eHrl27lvfqAAAAAAAAAKAQydWSAs8++6xWrFihU6dO6dSpU1qxYoUGDBigzp07S5J+/vlnPfDAA0bXCgAAAAAAAAD3NJtfmvXRRx9p+PDh6t69u65fv36jk2LFFBoaqpkzZ0qSqlatqo8//tjYSgEAAAAAAADgHmdz4Ori4qIFCxZo5syZOnbsmCSpQoUKcnFxMbepW7euYQUCAAAAAAAAQGFhc+CaycXFRbVr1zayFgAAAAAAAAAo1GwOXK9cuaKpU6cqNjZW58+fV3p6usX+zFmvAAAAAAAAAFDU2By4Pvfcc9q0aZP69Omj0qVLy2Qy5UddAHDfOBF1Qvayv2O7CqEV7kI1AAAAAAAgP9kcuK5Zs0arV69WkyZN8qMeAAAAAAAAACi07Gw9wNPTU15eXvlRCwAAAAAAAAAUajYHrhERERo3bpySk5Pzox4AAAAAAAAAKLRsXlJg+vTpOnr0qPz8/BQUFCQHBweL/bt37zasOAAAAAC4n03d8z+r276Yj3UAAADj2By4du7cOR/KAAAAAADcr9ru22VT+zW16+dTJQAA5D+bA9fx48cbdvIpU6Zo+fLl+uOPP+Tk5KTGjRvr7bffVpUqVcxtrl69qldffVXR0dFKSUlRmzZtNHfuXPn5+ZnbxMXF6cUXX9SGDRvk4uKi0NBQTZkyRcWK2Xx5AAAAAAAAAJBrNq/haqRNmzZpyJAh+umnn7Ru3TqlpqaqdevWunLlirnN8OHD9e233+rLL7/Upk2bdObMGXXt2tW8Py0tTe3bt9e1a9e0bds2LVq0SJGRkRo3blxBXBIAAAAAAACAIsyqKaBeXl76888/VapUKXl6espkMt227YULF6w++XfffWfxOTIyUr6+vtq1a5ceeeQRJSQk6JNPPlFUVJQee+wxSdLChQtVrVo1/fTTT3r44Yf1/fff68CBA1q/fr38/PxUt25dRUREaOTIkQoPD1fx4sWznDclJUUpKSnmz4mJiZKk1NRUpaamWl3/vcB0/fZfi+zYcn3X7WzL423pO7NtYbvfKBiFfZynKc3wvnH/YZyjKGCcoyiwdZzbpV+3um1hHucO6ek2tS8K30dF4RoBoKgyZWRkZNyp0aJFi9S9e3c5Ojpq0aJFObYNDQ3NdTFHjhxR5cqV9dtvv6lmzZr64Ycf1KJFC128eFEeHh7mdoGBgRo2bJiGDx+ucePGaeXKldq7d695//Hjx1WhQgXt3r1b9erVy3Ke8PBwTZgwIcv2qKgoOTs757p+AAAAAACskZycrJ49eyohIUFubm4FXQ4AwEBWzXC9OUTNS6Cak/T0dA0bNkxNmjRRzZo1JUlnz55V8eLFLcJWSfLz89PZs2fNbW5ezzVzf+a+7IwePVojRowwf05MTFS5cuXUunXrQvcX3bxL82xq/6KH9e82TZw61aa+3UaNsrptamqq1q1bp1atWsnBwcGm86DoKezjvMqlKrKX/R3bB/UMsqkW3F8Y5ygKGOcoCmwd51fjnra67YCYBTb1fS+N826/77Wp/dc16trUvjDKfNISAHD/ydVbpY4ePaqFCxfq6NGjmj17tnx9fbVmzRqVL19eNWrUyFUhQ4YM0f79+7V169ZcHW8LR0dHOTo6Ztnu4OBQ6MK/jGJ3nKBswZbrK2bjYz+5uXeF8Z7j7ivs49z+//7Lj75x/2CcoyhgnKMosHWcp9tZ/0+ywjzOU21cDqEofB8VhWsEgKLK5pdmbdq0SbVq1dKOHTu0fPlyJSUlSZJ+/fVXjR8/PldFDB06VKtWrdKGDRtUtmxZ83Z/f39du3ZNly5dsmh/7tw5+fv7m9ucO3cuy/7MfQAAAAAAAABwt9gcuI4aNUpvvfWW1q1bZ/FCqscee0w//fSTTX1lZGRo6NChWrFihX744QcFBwdb7K9fv74cHBwUGxtr3nbo0CHFxcUpJCREkhQSEqLffvtN58+fN7dZt26d3NzcVL16dVsvDwAAAAAAAAByzeYlBX777TdFRUVl2e7r66v//e9/NvU1ZMgQRUVF6b///a9cXV3Na666u7vLyclJ7u7uGjBggEaMGCEvLy+5ubnppZdeUkhIiB5++GFJUuvWrVW9enX16dNH06ZN09mzZzVmzBgNGTIk22UDAAAAAAAAACC/2By4enh4KD4+Psts1D179qhMmTI29TVv3o0F5Zs1a2axfeHCherXr58kaebMmbKzs1O3bt2UkpKiNm3aaO7cuea29vb2WrVqlV588UWFhISoZMmSCg0N1cSJE229NAAAAOCua7tvl03t19Sun0+VAAAAwAg2B67du3fXyJEj9eWXX8pkMik9PV0//vijwsLC1LdvX5v6ysi484LyJUqU0Jw5czRnzpzbtgkMDFRMTIxN5wYAAAAAAAAAo9m8huvkyZNVtWpVlStXTklJSapevboeeeQRNW7cWGPGjMmPGgEAAAAAAACgULB5hmvx4sW1YMECjR07Vvv371dSUpLq1aunypUr50d9AAAAKKKm7rH+/QAv5mMdAAAAgC1sDlwzlS9fXuXLlzeyFgAAAAAAAAAo1GwOXJ999tkc93/66ae5LgYAAAAAAAAACjObA9eLFy9afE5NTdX+/ft16dIlPfbYY4YVBgAAAAAAAACFjc2B64oVK7JsS09P14svvqiKFSsaUhQAAAAAAAAAFEZ2hnRiZ6cRI0Zo5syZRnQHAAAAAAAAAIWSIYGrJB09elTXr183qjsAAAAAAAAAKHRsXlJgxIgRFp8zMjIUHx+v1atXKzQ01LDCAAAAAAAAAKCwsTlw3bNnj8VnOzs7+fj4aPr06Xr22WcNKwwAANze1D3/s7rti/lYBwAAAADAks2B64YNG/KjDgAAgCza7ttlU/s1tevnUyUAAAAAYJ1creF6/fp1rV+/Xh999JEuX74sSTpz5oySkpIMLQ4AAAAAAAAAChObZ7iePHlSjz/+uOLi4pSSkqJWrVrJ1dVVb7/9tlJSUvThhx/mR50AAAAAAFv8YbK+bZqTpM/zrRQAAIoSm2e4vvLKK2rQoIEuXrwoJycn8/YuXbooNjbW0OIAAAAAAAAAoDCxeYbrli1btG3bNhUvXtxie1BQkE6fPm1YYQAAAAAAAABQ2Ng8wzU9PV1paWlZtp86dUqurq6GFAUAAAAAAAAAhZHNgWvr1q01a9Ys82eTyaSkpCSNHz9e7dq1M7I2AAAAAAAAAChUbF5SYPr06WrTpo2qV6+uq1evqmfPnjp8+LBKlSqlzz9nkXUAAAAAAAAARZfNgWvZsmX166+/Kjo6Wvv27VNSUpIGDBigXr16WbxECwAAAAAAAACKGpsDV0kqVqyYevfubXQtAAAAAAAAAFCo2Ry4Ll68OMf9ffv2zXUxAAAAAAAAAFCY2Ry4vvLKKxafU1NTlZycrOLFi8vZ2ZnAFQAAAPe+P0zWt01zksS7CgAAAGAdO1sPuHjxosWvpKQkHTp0SE2bNuWlWQAAAAAAAACKtFyt4XqrypUra+rUqerdu7f++OMPI7pEEXEi6oTsZX/HdhVCK9yFagAAAAAAAIC8sXmG6+0UK1ZMZ86cMao7AAAAAAAAACh0bJ7hunLlSovPGRkZio+P1wcffKAmTZoYVhgA5NbUPf+zuu2L+VgHAAAAAAAoemwOXDt37mzx2WQyycfHR4899pimT59uVF0AAAAAAAAAUOjYHLimp6fnRx0AAAAAAAAAUOjleg3X//3vf0pMTDSyFgAAAAAAAAAo1Gya4Xrp0iW9+eab+uKLL3Tx4kVJko+Pj/r376+xY8fK2dk5X4oEAGTVdt8um9qvqV0/nyoBAAAAAACZrA5cL1y4oJCQEJ0+fVq9evVStWrVJEkHDhzQ+++/r3Xr1mnr1q3at2+ffvrpJ7388sv5VjQAAAAAAAAA3IusDlwnTpyo4sWL6+jRo/Lz88uyr3Xr1urTp4++//57vffee4YXCgAAAAAAAAD3OqsD12+++UYfffRRlrBVkvz9/TVt2jS1a9dO48ePV2hoqKFFAgCAu+gPk/Vt05wkfZ5vpQAAAABAYWP1S7Pi4+NVo0aN2+6vWbOm7OzsNH78eEMKAwAAAAAAAIDCxurAtVSpUjpx4sRt9x8/fly+vr5G1AQAAAAAAAAAhZLVgWubNm305ptv6tq1a1n2paSkaOzYsXr88ccNLQ4AAAAAAAAAChObXprVoEEDVa5cWUOGDFHVqlWVkZGhgwcPau7cuUpJSdHixYvzs1YAAAAAAAAAuKdZHbiWLVtW27dv1+DBgzV69GhlZGRIkkwmk1q1aqUPPvhA5cuXz7dCAQAAAAAAAOBeZ3XgKknBwcFas2aNLl68qMOHD0uSKlWqJC8vr3wpDgAAAAAAAAAKE5sC10yenp566KGHjK4FAAAAAAAAAAo1q1+aBQAAAAAAAADIGYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBAofKHyfq2aU6SPs+3UgAAAAAAwL2HGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMUaOC6efNmdejQQQEBATKZTPrmm28s9vfr108mk8ni1+OPP27R5sKFC+rVq5fc3Nzk4eGhAQMGKCkp6S5eBQAAAAAAAADcUKCB65UrV1SnTh3NmTPntm0ef/xxxcfHm399/vnnFvt79eql33//XevWrdOqVau0efNmDRw4ML9LBwAAAAAAAIAsihXkydu2bau2bdvm2MbR0VH+/v7Z7jt48KC+++477dy5Uw0aNJAkvf/++2rXrp3effddBQQEGF4zAAAAAAAAANxOgQau1ti4caN8fX3l6empxx57TG+99Za8vb0lSdu3b5eHh4c5bJWkli1bys7OTjt27FCXLl2y7TMlJUUpKSnmz4mJiZKk1NRUpaam5uPVGM903WRTe1uu77qdbROgbek7s22a0gzvG/cfW8e5Xfp1q9vaPM7TnKxvm36jbX6Nc4f0dJva8310b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqkwZGRkZBV2EJJlMJq1YsUKdO3c2b4uOjpazs7OCg4N19OhRvfHGG3JxcdH27dtlb2+vyZMna9GiRTp06JBFX76+vpowYYJefPHFbM8VHh6uCRMmZNkeFRUlZ2dnQ68LAAAAAIBbJScnq2fPnkpISJCbm1tBlwMAMNA9PcO1e/fu5t/XqlVLtWvXVsWKFbVx40a1aNEi1/2OHj1aI0aMMH9OTExUuXLl1Lp160L3F928S/Nsan817mmr2w6IWWBT326jRlndNjU1VevWrVOVS1VkL/s7tg/qGWRTLbi/3FPjvOsUq9umpjtp3bFP822cd/t9r03tv65R16b2uLsY59ljnN9fGOfZY5zfXxjn2WOcZ5X5pCUA4P5zTweut6pQoYJKlSqlI0eOqEWLFvL399f58+ct2ly/fl0XLly47bqv0o11YR0dHbNsd3BwkIODg+F156eMYrZNUE63s/5LXszGx35yc+/s/++//Ogb9497apzb/2tTeyn/xnmqjY8V8n10b2OcZ49xfn9hnGePcX5/YZxnj3GeVVG4RgAoqmz7W6+AnTp1Sv/8849Kly4tSQoJCdGlS5e0a9cuc5sffvhB6enpatSoUUGVCQAAAAAAAKCIKtAZrklJSTpy5Ij58/Hjx7V37155eXnJy8tLEyZMULdu3eTv76+jR4/q9ddfV6VKldSmTRtJUrVq1fT444/r+eef14cffqjU1FQNHTpU3bt3V0BAQEFdFgAAAAAAAIAiqkBnuP7yyy+qV6+e6tWrJ0kaMWKE6tWrp3Hjxsne3l779u1Tx44d9cADD2jAgAGqX7++tmzZYrEcwNKlS1W1alW1aNFC7dq1U9OmTTV//vyCuiQAAAAAAAAARViBznBt1qyZMjJuv8bR2rVr79iHl5eXoqKijCwLAAAAAAAAAHKlUL00C7BW23277tzoJmtq18+nSgAAAAAAAFCUFKqXZgEAAAAAAADAvYzAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMUqygC8B95A+T9W3TnCR9nm+lAAAAAAAAAAWBGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwSLGCLgAAAAAAUMgc/t7GA7zzpQwAAO5FzHAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQYoVdAEAAADAfeXw9zYe4J0vZQAAAKBgMMMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYpFhBFwAAAIqQw9/beIB3vpQBAAAAAPmFGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgxQry5Js3b9Y777yjXbt2KT4+XitWrFDnzp3N+zMyMjR+/HgtWLBAly5dUpMmTTRv3jxVrlzZ3ObChQt66aWX9O2338rOzk7dunXT7Nmz5eLiUgBXBAAAAADIq9QJr1rd1mH89HysBAAA2xXoDNcrV66oTp06mjNnTrb7p02bpvfee08ffvihduzYoZIlS6pNmza6evWquU2vXr30+++/a926dVq1apU2b96sgQMH3q1LAAAAAAAAAACzAp3h2rZtW7Vt2zbbfRkZGZo1a5bGjBmjTp06SZIWL14sPz8/ffPNN+revbsOHjyo7777Tjt37lSDBg0kSe+//77atWund999VwEBAXftWgAAAAAAAACgQAPXnBw/flxnz55Vy5Ytzdvc3d3VqFEjbd++Xd27d9f27dvl4eFhDlslqWXLlrKzs9OOHTvUpUuXbPtOSUlRSkqK+XNiYqIkKTU1Vampqfl0RfnDdN1kU3u79OtWt71uZ9sE6NQ0J+vbpt9om6Y069rb+HVxSE+3qX1h+7oXNYzz7DHO7y9FZpynZdhUC+P8/sI4zx7j/P7COM+ezePczt6GxoXze4LvZQC4f5kyMjJs+5syn5hMJos1XLdt26YmTZrozJkzKl26tLnd008/LZPJpC+++EKTJ0/WokWLdOjQIYu+fH19NWHCBL344ovZnis8PFwTJkzIsj0qKkrOzs7GXRQAAAAAANlITk5Wz549lZCQIDc3t4IuBwBgoHt2hmt+Gj16tEaMGGH+nJiYqHLlyql169aF7i+6eZfm2dT+atzTVrcdELPApr7duk6xum1qupPWHftUVS5Vkb3u/NProJ5BNtXS7fe9NrX/ukZdm9rj7mKcZ49xfn8pMuO80TGbaul21cum9ozzexvjPHuM8/sL4zx7to7z6G+/tLqtw6hJNvV9r8h80hIAcP+5ZwNXf39/SdK5c+csZrieO3dOdevWNbc5f/68xXHXr1/XhQsXzMdnx9HRUY6Ojlm2Ozg4yMHBwYDq756MYrZNUE63s/5LXszGx34c7P+1qb0k2f/ff3fs28avS6qNj1sVtq97UcM4zx7j/P5SZMa5vW2P2jLO7y+M8+wxzu8vjPPs2TzO061b2kAqvN8ThbVuAMCd2fa33l0UHBwsf39/xcbGmrclJiZqx44dCgkJkSSFhITo0qVL2rVrl7nNDz/8oPT0dDVq1Oiu1wwAAAAAAACgaCvQGa5JSUk6cuSI+fPx48e1d+9eeXl5qXz58ho2bJjeeustVa5cWcHBwRo7dqwCAgLM67xWq1ZNjz/+uJ5//nl9+OGHSk1N1dChQ9W9e3cFBAQU0FUBAAAAAAAAKKoKNHD95Zdf1Lx5c/PnzHVVQ0NDFRkZqddff11XrlzRwIEDdenSJTVt2lTfffedSpQoYT5m6dKlGjp0qFq0aCE7Ozt169ZN77333l2/FgAAAAAAAAAo0MC1WbNmysi4/RpHJpNJEydO1MSJE2/bxsvLS1FRUflRHgAAAHDPSZ3wqtVtHcZPz8dKAAAAkJ17dg1XAAAAAAAAAChsCnSGKwDgJoe/t/EA73wpAwAAAAAA5B4zXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABilW0AUAAAAYJXXCq1a3dRg/PR8rAQAAAFBUMcMVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMEixgi4AAHB3pE541eq2DuOn52MlAAAAAADcv5jhCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGKVbQBQBWOfy9jQd450sZAAAAAAAAQE6Y4QoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxyTweu4eHhMplMFr+qVq1q3n/16lUNGTJE3t7ecnFxUbdu3XTu3LkCrBgAAAAAAABAUXZPB66SVKNGDcXHx5t/bd261bxv+PDh+vbbb/Xll19q06ZNOnPmjLp27VqA1QIAAAAAAAAoyooVdAF3UqxYMfn7+2fZnpCQoE8++URRUVF67LHHJEkLFy5UtWrV9NNPP+nhhx++26UCAAAAAAAAKOLu+cD18OHDCggIUIkSJRQSEqIpU6aofPny2rVrl1JTU9WyZUtz26pVq6p8+fLavn17joFrSkqKUlJSzJ8TExMlSampqUpNTc2/i8kHpusmm9rbpV+3uu11O9smQKemOVnfNv1G2zSlWdl3hk21OKSn29S+sH3dixrGefZsHud29jY05nvibmOcZ49xfn9hnGePcX5/YZxnj3GeFf8GAYD7lykjI8O2vynvojVr1igpKUlVqlRRfHy8JkyYoNOnT2v//v369ttv1b9/f4vgVJIeeughNW/eXG+//fZt+w0PD9eECROybI+KipKzs7Ph1wEAAAAAwM2Sk5PVs2dPJSQkyM3NraDLAQAY6J4OXG916dIlBQYGasaMGXJycsp14JrdDNdy5crpf//7X6H7i27epXk2tb8a97TVbQfELLCpb7euU6xum5rupHXHPlWVS1Vkrzv/9Dqo0TGbaul21cum9l/XqGtTe9xdjPPs2TrOo7/90uq2DqMm2dQ38o5xnj3G+f2FcZ49xvn9hXGePcZ5VomJiSpVqhSBKwDch+75JQVu5uHhoQceeEBHjhxRq1atdO3aNV26dEkeHh7mNufOnct2zdebOTo6ytHRMct2BwcHOTg4GF12vsooZltenm5n/Ze8mI2P/TjY/2tTe0my/7//7ty3bY9mpdr4uFVh+7oXNYzz7Nk8ztOte0RQ4nuiIDDOs8c4v78wzrPHOL+/MM6zxzjPqrDWDQC4M9v+1itgSUlJOnr0qEqXLq369evLwcFBsbGx5v2HDh1SXFycQkJCCrBKAAAAAAAAAEXVPT3DNSwsTB06dFBgYKDOnDmj8ePHy97eXj169JC7u7sGDBigESNGyMvLS25ubnrppZcUEhKS4wuzAAAAAAAAACC/3NOB66lTp9SjRw/9888/8vHxUdOmTfXTTz/Jx8dHkjRz5kzZ2dmpW7duSklJUZs2bTR37twCrhoAAAAAAABAUXVPB67R0dE57i9RooTmzJmjOXPm3KWKAAAAAAAAAOD2CtUargAAAAAAAABwLyNwBQAAAAAAAACD3NNLCgB3S+qEV61u6zB+ej5WAgAAAAAAgMKMGa4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYJD7JnCdM2eOgoKCVKJECTVq1Eg///xzQZcEAAAAAAAAoIi5LwLXL774QiNGjND48eO1e/du1alTR23atNH58+cLujQAAAAAAAAARUixgi7ACDNmzNDzzz+v/v37S5I+/PBDrV69Wp9++qlGjRqVpX1KSopSUlLMnxMSEiRJFy5cUGpq6t0p2iApCSl3bnSTa4kXrW578do1m/q+nlDC6rap6SWUnJyshH8TZC/7O7b/JyHJplp01cGm5v9cu251W4d//rGtFuQZ4/w2GOf3Fcb5bTDO7yuM89tgnN9XGOe3wTjP4vLly5KkjIyMAq4EAGA0U0Yh/9P92rVrcnZ21ldffaXOnTubt4eGhurSpUv673//m+WY8PBwTZgw4S5WCQAAAABAVn/99ZfKli1b0GUAAAxU6Ge4/u9//1NaWpr8/Pwstvv5+emPP/7I9pjRo0drxIgR5s/p6em6cOGCvL29ZTKZ8rVe3JCYmKhy5crpr7/+kpubW0GXA+QLxjmKAsY5igLGOYoCxvndl5GRocuXLysgIKCgSwEAGKzQB6654ejoKEdHR4ttHh4eBVNMEefm5sb/0OG+xzhHUcA4R1HAOEdRwDi/u9zd3Qu6BABAPij0L80qVaqU7O3tde7cOYvt586dk7+/fwFVBQAAAAAAAKAoKvSBa/HixVW/fn3Fxsaat6Wnpys2NlYhISEFWBkAAAAAAACAoua+WFJgxIgRCg0NVYMGDfTQQw9p1qxZunLlivr371/QpeE2HB0dNX78+CxLOwD3E8Y5igLGOYoCxjmKAsY5AADGMWVkZGQUdBFG+OCDD/TOO+/o7Nmzqlu3rt577z01atSooMsCAAAAAAAAUITcN4ErAAAAAAAAABS0Qr+GKwAAAAAAAADcKwhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAADINd69CQAAAACWCFwBAECuOTo66uDBgwVdBgAgF7Zs2aLevXsrJCREp0+fliQtWbJEW7duLeDKAAAo3IoVdAHAX3/9pfHjx+vTTz8t6FKAPPn333+1a9cueXl5qXr16hb7rl69qmXLlqlv374FVB2QNyNGjMh2e1pamqZOnSpvb29J0owZM+5mWUC++OCDD/Tzzz+rXbt26t69u5YsWaIpU6YoPT1dXbt21cSJE1WsGP8bjcLt66+/Vp8+fdSrVy/t2bNHKSkpkqSEhARNnjxZMTExBVwhAACFlymDZwFRwH799Vc9+OCDSktLK+hSgFz7888/1bp1a8XFxclkMqlp06aKjo5W6dKlJUnnzp1TQEAA4xyFlp2dnerUqSMPDw+L7Zs2bVKDBg1UsmRJmUwm/fDDDwVTIGCQt956S9OmTVPr1q31448/atiwYXrnnXc0fPhw2dnZaebMmXrxxRc1YcKEgi4VyJN69epp+PDh6tu3r1xdXfXrr7+qQoUK2rNnj9q2bauzZ88WdIkAABRa/Gge+W7lypU57j927NhdqgTIPyNHjlTNmjX1yy+/6NKlSxo2bJiaNGmijRs3qnz58gVdHpBnkydP1vz58zV9+nQ99thj5u0ODg6KjIzMMqsbKKwiIyMVGRmprl276tdff1X9+vW1aNEi9erVS5JUtWpVvf766wSuKPQOHTqkRx75f+3dbUzV5ePH8c/3KBxvuBGGBmwiM1yEUQmHbmQITpJSCSIXMxTblE1dSitEfJCbmvM80FnTLVuhqGGiq2zZrEwtbzPBpJGmqaU2TmmIQ8CyI+f/wHX25weZ5he+cs77tZ0H5/refc7GA/hwnesa1WE8NDRUly9f7v5AAAD4EApXdLnc3FwZhnHTjVUMw+jGRID5Dhw4oC+++EIRERGKiIjQxx9/rFmzZiktLU27d+9W//79rY4I3JGysjKNGTNGkydPVnZ2tpYuXaqAgACrYwGmq6+vl8PhkCQ99NBDstlsevjhh73Hk5KSVF9fb1E6wDyRkZE6deqUYmNj243v27dPQ4cOtSYUAAA+gk2z0OWioqL0wQcfqK2trdPXkSNHrI4I3LGrV6+2W8/PMAy9+eabys7OVnp6uk6ePGlhOsAcKSkpqqmp0cWLF+VwOFRXV8c/zOBzIiMjdezYMUnSjz/+qOvXr3vfS9L333+vQYMGWRUPME1RUZGKi4t16NAhGYah+vp6VVZWqqSkRDNnzrQ6HgAAPRozXNHlkpOTVVNTo5ycnE6P/9vsV6AniI+PV3V1te6///5246tWrZIkPf3001bEAkwXFBSkdevWadOmTcrMzGRdYvicgoICFRYWKicnRzt37lRpaalKSkrU0NAgwzC0ZMkSTZw40eqYwB0rKytTW1ubxowZo9bWVo0aNUp2u10lJSWaPXu21fEAAOjR2DQLXW7v3r1qaWnRk08+2enxlpYWVVdXKz09vZuTAeZZunSp9u7d+487+s6aNUurV69WW1tbNycDus4vv/yimpoaZWZmsmwGfEZbW5ucTqcOHjyokSNHqqysTFVVVSotLVVra6uys7O1atUqfubhM65du6ZTp06publZCQkJCgoKsjoSAAA9HoUrAAAAAAAAAJiEJQUAAAAAwM+0tLTI6XRq586dunDhQodv4Zw5c8aiZAAA9HwUrgAAAADgZ6ZPn66vvvpKU6ZMUVRUFJsgAgBgIpYUAAAAAAA/M2DAAH3yySdKTU21OgoAAD7HZnUAAAAAAED3CgsLU3h4uNUxAADwSRSuAAAAAOBnFi9erAULFqi1tdXqKAAA+ByWFAAAAAAAPzNixAidPn1aHo9HsbGxCggIaHf8yJEjFiUDAKDnY9MsAAAAAPAzubm5VkcAAMBnMcMVAAAAAAAAAEzCGq4AAAAAAAAAYBKWFAAAAAAAPxAeHq6TJ08qIiJCYWFhMgzjH8+9dOlSNyYDAMC3ULgCAAAAgB9YsWKFgoODJUmvv/66tWEAAPBhrOEKAAAAAAAAACZhhisAAAAA+IGmpqZbPjckJKQLkwAA4NuY4QoAAAAAfsBms9103VZJ8ng8MgxD169f76ZUAAD4Hma4AgAAAIAf2L17t9URAADwC8xwBQAAAAA/kJeXp4qKCoWEhGj9+vXKz8+X3W63OhYAAD6HwhUAAAAA/EBgYKDOnj2rqKgo9erVSy6XS4MGDbI6FgAAPoclBQAAAADAD8THx2v+/PkaPXq0PB6PNm/e/I+bYxUWFnZzOgAAfAczXAEAAADAD+zfv1+vvPKKTp8+rUuXLik4OLjTTbQMw9ClS5csSAgAgG+gcAUAAAAAP2Oz2fTrr7+ypAAAAF3AZnUAAAAAAEDXy8vLU1NTkyRp7dq1Cg4OtjgRAAC+iRmuAAAAAOAH2DQLAIDuwaZZAAAAAOAH2DQLAIDuwQxXAAAAAPADBw4c0Msvv8ymWQAAdDEKVwAAAADwMzabTS6XS/fcc4/VUQAA8DkUrgAAAADgZ86ePauQkBCtWbNGx48flyQNHz5c06ZN+8dlBgAAwK2hcAUAAAAAP1NdXa2srCz17dtXjzzyiCTp8OHDunr1qj777DMlJydbnBAAgJ6LwhUAAAAA/ExaWpri4uL09ttvq3fvG3spu91uTZ8+XWfOnNGePXssTggAQM9F4QoAAAAAfqZv37769ttvFR8f32782LFjcjgcam1ttSgZAAA9n83qAAAAAACA7hUSEqJz5851GD9//ryCg4MtSAQAgO+gcAUAAAAAP5Ofn69p06apqqpK58+f1/nz57Vp0yZNnz5dkyZNsjoeAAA9Wm+rAwAAAAAAuteyZctkGIYKCwvldrslSQEBAZo5c6acTqfF6QAA6NlYwxUAAAAA/FRra6tOnz4tSbr33nvVr18/ixMBANDzUbgCAAAAAAAAgElYwxUAAAAAAAAATELhCgAAAAAAAAAmoXAFAAAAAAAAAJNQuAIAAAAAAACASShcAQDwQ4ZhaOvWrabd78svv5RhGLp8+bJp9wQAAACAnojCFQCAHuLgwYPq1auXxo8ff8f3crlceuqpp0xIdetiY2NlGIYMw1C/fv2UmJiod95557bvY3ZZDAAAAABmonAFAKCHKC8v1+zZs7Vnzx7V19ff9FyPxyO3291h/Nq1a5KkyMhI2e32Lsl5M4sWLZLLQDxxLgAABhpJREFU5VJdXZ0mT56soqIibd++vdtzAAAAAEBXoXAFAKAHaG5uVlVVlWbOnKnx48eroqKi3fG/v9K/fft2JScny263a9++fcrIyNCLL76ol156SREREcrKypLUfpboyJEjNW/evHb3u3jxogICArRnzx5J0oYNG+RwOBQcHKzIyEg9//zzunDhwm1/jr+vHzp0qObNm6fw8HDt2LHDe/zw4cN64oknFBERodDQUKWnp+vIkSPe47GxsZKkZ555RoZheN9L0kcffaSkpCT16dNHQ4cO1cKFCzstnQEAAACgK1G4AgDQA2zevFnx8fG67777NHnyZK1Zs0Yej6fDeWVlZXI6nTp+/LgefPBBSdK6desUGBio/fv3a/Xq1R2uKSgo0KZNm9rdr6qqStHR0UpLS5Mk/fXXX1q8eLFqa2u1detW/fzzz3rhhRf+8+dpa2vT+++/r8bGRgUGBnrHr1y5oqlTp2rfvn36+uuvNWzYMI0bN05XrlyRdKOQlaS1a9fK5XJ53+/du1eFhYUqLi7WsWPH9NZbb6miokJLliz5zxkBAAAA4L8wPJ39tQYAAO4qqampeu6551RcXCy3262oqCht2bJFGRkZkm7McB09erS2bt2qnJwc73UZGRlqampqN0tUujHD9cMPP1Rubq4uXryo6Oho7dq1y1uwjhw5UqNGjZLT6ew0T3V1tVJSUnTlyhUFBQV5n9/Y2KgBAwZ0ek1sbKxcLpcCAgL0559/yu12Kzw8XIcOHVJcXFyn17S1tWnAgAHauHGjJkyY0CH73zIzMzVmzBjNnz/fO/buu++qtLT0X5dfAAAAAAAzMcMVAIC73IkTJ/TNN99o0qRJkqTevXsrPz9f5eXlHc51OBwdxpKTk296/4EDB2rs2LGqrKyUJP300086ePCgCgoKvOfU1NQoOztbMTExCg4OVnp6uiTp3Llzt/VZ5s6dq6NHj2rXrl169NFHtWLFinZl62+//aaioiINGzZMoaGhCgkJUXNz878+p7a2VosWLVJQUJD3VVRUJJfLpdbW1tvKCAAAAAB3orfVAQAAwM2Vl5fL7XYrOjraO+bxeGS327Vq1SqFhoZ6x/v379/h+s7G/ldBQYHmzJmjlStXauPGjUpMTFRiYqIkqaWlRVlZWcrKylJlZaUGDhyoc+fOKSsry7sJ162KiIhQXFyc4uLitGXLFiUmJsrhcCghIUGSNHXqVDU0NOiNN97QkCFDZLfb9fjjj//rc5qbm7Vw4ULl5eV1ONanT5/byggAAAAAd4IZrgAA3MXcbrfWr1+v5cuX6+jRo95XbW2toqOj9d5775nynJycHP3xxx/69NNPtXHjxnazW3/44Qc1NDTI6XQqLS1N8fHx/2nDrP81ePBg5efnt1sGYP/+/ZozZ47GjRun4cOHy2636/fff293XUBAgK5fv95uLCkpSSdOnPCWuf//ZbPx6w4AAACA7sMMVwAA7mLbtm1TY2Ojpk2b1m4mqyQ9++yzKi8v14wZM+74Of3791dubq5effVVHT9+3Lt8gSTFxMQoMDBQK1eu1IwZM1RXV6fFixff8TMlqbi4WA888ICqq6vlcDg0bNgwbdiwQQ6HQ01NTZo7d6769u3b7prY2Fjt3LlTqampstvtCgsL04IFCzRhwgTFxMRo4sSJstlsqq2tVV1dnV577TVTsgIAAADArWDKBwAAd7Hy8nJlZmZ2KFulG4VrdXW1vvvuO1OeVVBQoNraWqWlpSkmJsY7PnDgQFVUVGjLli1KSEiQ0+nUsmXLTHlmQkKCxo4dqwULFki68XkbGxuVlJSkKVOmaM6cORo0aFC7a5YvX64dO3Zo8ODBGjFihCQpKytL27Zt0+eff66UlBQ99thjWrFihYYMGWJKTgAAAAC4VYbH4/FYHQIAAAAAAAAAfAEzXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMAmFKwAAAAAAAACYhMIVAAAAAAAAAExC4QoAAAAAAAAAJqFwBQAAAAAAAACTULgCAAAAAAAAgEkoXAEAAAAAAADAJBSuAAAAAAAAAGASClcAAAAAAAAAMMn/AST4flJEnzHJAAAAAElFTkSuQmCC",
+      "text/plain": [
+       "<Figure size 1200x800 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Create a list to store the queueing time data\n",
+    "qt_data = []\n",
+    "\n",
+    "# Iterate over the models, batch sizes, and arrival rates to calculate queueing time\n",
+    "for ssm in small_model_names:\n",
+    "    for batch_size in batch_sizes:\n",
+    "        for arrival_rate in arrival_rates:\n",
+    "            model_name = ssm.replace(\"/\", \"-\")\n",
+    "            filepath = f\"/usr/FlexFlow/inference/output/specinfer_llm_meta-llama-Llama-3.1-70B-Instruct_ssm_{model_name}_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "            if os.path.exists(filepath):\n",
+    "                qt = get_queueing_time(filepath)\n",
+    "                qt_data.append({\n",
+    "                    'Model': model_name,\n",
+    "                    'Batch Size': batch_size,\n",
+    "                    'Arrival Rate': arrival_rate,\n",
+    "                    'Queueing Time': qt\n",
+    "                })\n",
+    "# add incremental decoding entry\n",
+    "for batch_size in batch_sizes:\n",
+    "    for arrival_rate in arrival_rates:\n",
+    "        model_name = ssm.replace(\"/\", \"-\")\n",
+    "        filepath = f\"/usr/FlexFlow/inference/output/incr_dec_llm_meta-llama-Llama-3.1-70B-Instruct_bz_{batch_size}_rate_{arrival_rate}_dataset_sharegpt.csv\"\n",
+    "        if os.path.exists(filepath):\n",
+    "            qt = get_queueing_time(filepath)\n",
+    "            qt_data.append({\n",
+    "                'Model': \"Incr Dec (baseline)\",\n",
+    "                'Batch Size': batch_size,\n",
+    "                'Arrival Rate': arrival_rate,\n",
+    "                'Queueing Time': qt\n",
+    "            })\n",
+    "\n",
+    "# Convert the list to a DataFrame\n",
+    "qt_df = pd.DataFrame(qt_data)\n",
+    "print(qt_df.head())\n",
+    "\n",
+    "# Pivot the dataframe to have models and batch sizes as columns\n",
+    "pivot_df = qt_df.pivot_table(index='Arrival Rate', columns=['Model', 'Batch Size'], values='Queueing Time')\n",
+    "\n",
+    "# Plot the data\n",
+    "fig, ax = plt.subplots(figsize=(12, 8))\n",
+    "\n",
+    "colors = ['lightgreen', 'skyblue', 'lightcoral', 'gold', 'plum', 'peachpuff', 'mediumturquoise', 'salmon']\n",
+    "pivot_df.plot(kind='bar', ax=ax, color=colors)\n",
+    "\n",
+    "ax.set_title('Queueing Time vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n",
+    "ax.set_xlabel('Arrival Rate')\n",
+    "ax.set_ylabel('Queueing Time (sec)')\n",
+    "ax.grid(True)\n",
+    "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
+    "\n",
+    "# Save the plot as a PDF\n",
+    "plt.savefig('/usr/FlexFlow/wildchat/queueing_time_vs_arrival_rate.pdf')\n",
+    "\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/benchmarking/queueing_time_vs_arrival_rate.pdf b/benchmarking/queueing_time_vs_arrival_rate.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e77da10bad2cce12d587cb6ea92c4d3eada9af5d
GIT binary patch
literal 18042
zcmb_^2RxPEAAi}dQ3%;rR#|ttTq`?!XGPf~`?@48BP(RjR7kR;M3L+j8Idg&86{MN
zQvT0#)%RPuzyH_k|N8Sfp7WgNJm)jc=bX>`oacN9=&PuSA;i%Tf#Pv!Q8fesg+o1U
zPeWv7p)ljKUJg*0vJKwG&C>}A)3<SQ@PQ(M3WiX5d5D9DJy=odw-?krJ@8Nzp#scI
z&-S!~9UdyVeXHt^KW2ot!8<_F+XebIc)SD711bUDLSRPDHuf$aPEgFxr#MeLBL_V6
z1VC0r4PeE=9}k6TxC1OG{k$swylOzLeq#r*Qvi@}kb9g1z}+_cFhd6)PhXrJAP>kt
zKwrnf-o-}A(;uJ+2Y#_qQsM{{1}OoRKuL+Cq$I$s6`%}2LZBHRD0Z7qMGp^8;0^)4
z^Sf5S$G_C5?cm{rcZMQ<l2&tZ0|bY{)Z74Vs5sbp+B<;!``~d7HtrDr%$5^6o*Y=l
z+Rvsd2nrK))a3ZJ+WQ<`P^`VJb<p*$hNAYXq#cTGLVLdqdB*EVnCleW`cj{;ckiOm
zz3O)or^?&&x-Xeus5?=5@zRBn?^ALzFw@Rs6cbLH<9<2cqHpqW;qQL*=_EVpZnk_Q
z<lggFFRH7NPkr9%Y@UhH>3A#D>M^`Q&VIJ}<XyJ@6v_q0hg63!BXG);YwQ;L;^M}Y
zT5)acnyqzyc8P-&9x^4&lK0M;`ED&P$~45h&^@?xyEQ7S@>2Bq&t5fk4w9!k9>21P
z*N<b=Bhd&<2@~<s42#IsCDXKd{9c!pDlzL(j-IM!$94V&dv%V^T2!k-^Vb_>G8beu
z&xvzI9?K3lN%TkNmb?|FBJW!pPqe-hd+b~XZ=63qqC{Pm?YZvj`(9ZWe!jx88Z$8X
z(X8N6-kXlNFZHW|t8&yBNp3cDpmOYTBhBmslDD?9ioxYCHJxy>H7ERTrPz(w<>5Iw
zBX1w$r=nMqlAERo&XkQaSxuul@QPfAOpZT!N-&V-I1V~IKjmT^8(GgT9w->hoxqNW
zF=9=cRw(Ol>(7@LB#E`)5C|sIu#>uIpPxY5pv$9oD?pn}6Q|c1M5}RL+v02B=h-+;
zDh%pINo=E9DmmFTS%+0so}L>i6ZrzJm+Nexw37Ij9dV5VqQxIEv>QPRcVhG6FLqG`
zxyn4HpniVHi<BiQBY~I7@2fxiyd9=K(xA^pNZCTz_+oEsGOZ%Whx(<aXs=Ac5N^f2
zIF91R19H=p!Pn(8)Yj8z&|Vr*y|gZz_`nCMpXAo~hl~B8c?C(ga{KnO3eRC%($wWK
zr`3W&-UM|Eu?6purPB6)5^=*CHf6_WU|ixi-ft7nOMS!DrbJY!G&}WU83Wh5{WQ|Y
z?q3YcReENAJ6CL!f4Fm9iiJ^=hPJArurqET_yn(#Ce4LoB(>4uF;Y1v;gezUCm9d$
zR2iE&73S~;KO3m4+JJR0OI>&63ueuT_e}V)R<WQaW-h`%z`~N8;5bX$QhtR7!kjf1
z1N8YFkYd!+z6@#J=hDT|NytJ&#~4t!tbB9PBSB$_yz_lLvnA!pS6vrYKB56x>@<!z
z7o_lt!o7q0c;_@%Bz|fRE*0!tlETX->KI_tV9J)$I^!g2e@JJR8)let#WXH~_1f2r
z2ZxSLh|s2VQ?Lb_eF^D%o@}X5W)~ozf`W{n=A1TuEqD2cH*;0ebXEWfSO1dQLxWDK
z`SAD@CNlr63TGeHNivzd*%%dr2WFct7MC2TG9#~@d>Ni&AI#PNSuHxh=V6C>(LLIr
zAx<ybFUgM77}TUX?6^f@=fUR6ce)I!vlmCRnC%A+pKOm;W8yC#AU}dH6v62QbB%pd
zOFnEY<n5poI;!k^B_x?!_#@-O@Jq?(<8aX)#8$uDO~5iaWSl~TA%er@wVu<FS6W7g
z`wGRC8N;jl<aND5u2B2AEpbHVoW4GTZdz046wW^@V&8{om?<r*)pw~9jW!*YUb<cM
zc&zdcIC3pIa;}sQ8>5$;s4h2h#wH}Ij(|fmCc<FI%uQID!=CDZwfTo~H(?7QaDdpy
zXtvoJUG+P2j;hSfsI|a4>MJ#<_UvOHDCr$@zmo+aoVB%1Jy)ayx~_Z~!Tc&gEgGk|
z@3nn{*x_q0t5&(F(IOe2Y1mHP>|B%-Rg%9mzQR{Z(Ku-SrJAF*ZL&P6TmSoy%vwEp
z`)@flB_Gb*<wH6^c)BreejQog8lE771X<FK%IxX%c7LPTTGcdOu-=1f62CjVQKui1
zQ&X)m3n`VqceL8nYw-Js)%VsPXIoy@LJ>8C(`*@9U6%_qY;zlIX7&lHafhc_b$+9L
zCg9jOz>k2?Sn|;es__7Y-^hN|)E<vZrevCmRqiUK3;$JfP`wnd7p;;=|M;5iOy{#_
zUl%VP|NJ>wRXNI;kw!5WJM2uFT2(vpy;YpM+e@OOU+k3J_=UQYXX<YHGWwI0-ZW`D
zOG|nEj`TEdMOZIN?%w?iWTEH>7Zg6W;PewWqwEYxNZd6Tf<-fphdB)+3T`l;F347G
zy}l}S@r^4FOrkri+m507CWntDuDCvf=745#oKczi^gtdG+ia-S4Lx35U3T)tsEA!j
zhVZ^-gZaoixME7v_XUkJy~^6FEBabYEA3`4&V7i{XEzmLTH!piqE6$&W~Not=-6j;
z?di&XEu)o#tZNqpr1bkG2E32Ok`((@3_N%!>_#uwY^=BM0a`JZk$&i*VC{aC-<2Cv
zSGhxJcr7oTa}f!EV4`k}8s~+ZW#qIqTOpXD%No8>2Y_u*ba-n({%Au_wwmwq@mCe$
zaMd25Lgf{a+uCBKkZY=X?!x`bqB?yceQXXTL(g?f4d-4Ju(oH^7{fH?`wDfnKS-`D
zj=dXuxA>#7rtuKdy!P>#3C9-~{8t}ZT$CzWV$@((?mb$>dWN(382vS2-2KnS@|Cg`
zB<_p%hjP?^tnH(JZdX%slSE3i=R&EpgDgUFM%QHg0~F@Wnsv!H<M8H>;oGOVxsr6(
zfAoIR@4vr3&}lPcHhM<#6nFKhMGp^?VfU5QcRvcweS14~`unie<fR4iyEjvAR<E7e
zIwALl&sJuT3wiMB+gwdo_c{SvJ@mn@Ym%N@FKf^G42r@1MDML&lA@|uV<c%>Y+g6N
zeKYi}?M}F7=beDe%v1UK@vm8Vq%6Nqo`Z)Q73m*IXXlWMa>|&>E}yedoVu$(mWp}B
zZ`8mrjye7@B9h{m!gUUDb47{gaN1z@);l@OFSCmlZ-<Ow#4jFyh2u~REnz@?tUtkD
z)X0}%K`+U^-)Iaa#}^QBAu#M<5yt@xyLhVix21#_lySGti}vL-K84%y*|l#;7FNDu
z=xX}%Z}rq{adl@?u7*$U6__ks8lL`mku@}S1zZ2(Y{DbH`szIwY0?<tE=j5!3>FSC
z6~!){e*bNLWBJ|CH-UG451Ml~Cf~OmAE#aS`ckFazhdqzy>RezT3$BmHTliQbkaA_
zOOn2j<K!Z4ucbuNT*{riOgui*j&5>~(#w9t=9H}1_Rp$RWxIB52H(RdgjV=pJf+_)
zJja$qx0YtRaP;Pr3nSwbFjmjOp=<9$Ee;F3T;y=B(cOG8zOeqJ49aEt{fSm~wsk1w
zZIP~uVN}}Ts<RZ#-za*5C~v5R+&SMS7^~sV*Css7-Z;W?BwF#{ew5mchQTisMwad-
zdP7mZZ;sp#y<Y`OT$`pEy@a{Vrmbu=m}%5eE}Y4uyv5fYbPFOPN^ND6*D`}TJ~4CG
z`+;YKPTdMmO!4!y53;;vZ#$<N_eRjtREX1FAxWnQpfGKcb2JoSUH)w2yWH95EzA=T
zW3(}s{4Rww%Hgi=`@+VP4Uh}n#-Jkhn|XX5I><>^d1hhj%*_&65QP5(jz8fz0cC>7
z6N!OieqnS1JOv9+K&}2M1A+Np5EY96VCjFrQ;Fyrx_UU3%DKFU4C_*XYmqqzzWi{H
z5Mv#en~(hLW^I2xoUdYv<zAkbk+z|3{u$cSw<Wn`<^-O7M|T8>v#;#)US+PlMw(#2
zA0Mug+L`S1_SAWKT2D?}BC`Cxw)lynkr?!U^C)NdD4rBS+gSxksXt{X_vDec_1aDD
zdN~;cLW^OK;+ls6mgA_qKxgdcu!EDvbjn(m`={!hcN7KJpgtYm9n7{VpB^&uvxqqe
z4muTveDbC{&HQAU3B}aUtI^DeGmJe{9D6rpXq9vPl;r#-I~xz)O+P>@p(x9^=?Sia
zQ6}Y`Rmeg}-nHlNQ2nI~fww<~6%XX<UgBD<;}dPneW|WW$;(pL-^q1PG?#N=?9oNi
z*H#~b&GbrS7xufnGzrjR?waW$(`Iv*DGq#KZr5l`z0C5AcfcZ|i&eTLVq<!|uK|7h
zEc>A*!_bfqa<?Qb=hzvwnl?0A$!6-7zZSQ<U)tK^<?K{VM4`l%CA(5+TmyqLFO|xY
zNx48^pn_b7J#M{j%T*!K(w@tDtw1Iy&LAAs<n*Q^q`w^P<;fiFN_CfSg-VAcvA($>
z!=ILB?;V?XJMqcmDaY?5muAG=7^<~XeBqKRLFYai^r<BFXrsi-Tg&VV+)9+?hw&Z`
z7BuN*vaXpjUAMYsFY1n+kvmwDH&85z8yS+5?i|oRY4PoHw{yjJp&?ie;v`S#0W8%{
zVUggP>@NeC*oA?k?xByv^~);uFovbfKxWiZAV<P1733q5JIR?k_ui)sM4P$LYpc?m
zQkb&usf(sFAwv51Md(k%#q5T(goNof>AXC(KS(<`yU;tn4K!zIRiIF3%e2{(+mPX7
z9e{g?eE57%C-eR;B20km&A;Tw!gn!vu6BY0TzN^mf$Fsp$v5NU`#OFMg`E#**mt7+
zAvFsv&#lqWWKz%Iav@)9tCOV6N-S*l(_=EMQJRh{bs2&MOwakfU#euw=xAQwSJ6NA
zDs+}~b)IX12#4TS!CxFAB_#fvMF|5f4;2K}*z5sI>s*C({@L`?1FGk?+})p7QM2&a
zC5{Rh9eR7a@^*^mqmI<8zLrRxi)~0(d$t6TW;YGD#I@cM#)JKtZ(dnj&KK`7PcKvM
znP*#72^DRdPIx*IT#+c-;8;wd?s2Z$Ex6*eQfp0cgw}{6L*}D<>Pp$E{nO>2xF`8o
zRKy<I=E!YDFRv=J4QnO0KaO{ur118;JTM=_VqaUY8ldJcGukf5+4RtJgmQy!pTXmM
z*w7y`dzPhxPY-<PZt=T1ba~R}QV7k#@~`%{^54;anM}{PZ@wPF@9_9UL^PW(Ra5Pk
z^Bz&Dor>?)+t0Ca&wr+*e`WWah#tW$mB00fLjJcN<>It(WW2P>Yne*N_FQ@`dtLKc
z&g_q|_QiC!xWf8ROx~PKPM;&Fayd7PZ5;KP9*2i2zp$3cdSCCPp!<ql9Dl^?*63xQ
z^VgfMK{6hOx4At~FTVa(oA(`DP{VCbqaD-jjGM|FFEoCs?ONOV<{Iuy%~RLWl*Zcd
zr(dO}jIl++uJIU#bX8Z51~u-5vYuN-&YkAM<C4)jR!qrF*IE|MCpz9-S-NXE^+L3v
zYptoOj+4i9Votbi`03>l!^D~A*wwo!eP6vQqCx{+A3G<g?&VtqJ$`$zd(<f4yU{rR
z*YBh&4W?W~6b5cO{iQGzdKUv!!cgY<HASlTbw+a}H_Tslp3UsYUH93-kjOBLmMM`u
zC!tLDkK`>hC|wuOgP#;yK2s|jbF-}`&SKd(m**kIed@c}eL+ZynrP@q+Y=svY5&uk
z93*nAE{)OJYV`c1-gSesJpqYF!Y2dK?z%hz8vAm{Yfdy>h0!3@0}PuIu5Xr=lAEw*
zQ~TKI@bld<$(z|2Mp?Ep+41LRkn_(otrY|_xU|~M77Oe{a_X(VX}MeV(P9R%daqES
z71O}!CWLx@K$YwbYZAYtH>(a;OJA3nPff4B;q;T5@WjNEYpIF(^&-5PEf$RkWj*s5
z`#Xx%#wD41P{l(};JW>4)Mrjt*NS{u{oKVLJ;v#0qM6r#rC)SN#0J~nl@WAnoZA2S
zrCe})nq*y+Sg=SccF6u~IgcvGe8`)xnMFtLGYf1gC(D+<Vc)tr`EBF#<`2r~!x#C8
z^bWWO_*d^>F#m1MC=}WluHP#`!sr+#_{Z!?n7F&Ny8=b+jEwkG_QnCbc_ID#*|PR}
zQK(!3=W#>V1NB5WA%c-4rT&`}ovsGDKz%CGUKt}e$*F9fHAHL3;-v@HwvQ`d{0G=h
zzhn05f8<uZkGjHR?|uA({kn9yGZsSf=iz=m_-EGPLAMTnU+%t+lf8SCb%F?2L|_pL
zF8SYV$t7rc0I*1TO+Hk7fBn~y8=rFzSEs$aG$tf4e}!{|MfG$_wYIqtcN{wL@}>D@
zuk!1kWZZfj9bZ}Gq(;6?n(bw_UCcQ@(xql)F4XewvZHj>oB@`V5~EXivq<#zgt$Nb
zkl>9IUl@))?#<#JkPxSq#GBr7ID)n-{C4U2xy@+gtJB(6ZiYqYcw|qi9G{z`JQjJ#
zE|qn5N|AbAv(eciBKeFesp8_Nil^tO7sp<nZofxakk&18#Ifu>-ca0wHJq2b{Zsz(
z+grn~N<HgkA_6Pw!(k|#(4uqe%s~5_xF0WL$h2Zhoq0dMy6Z70ZvEIuTGiHG7gy{O
zbzFB%|3Lqvrdrzlt8S{wM||R6ygSv@P2-y4#^SH}m7M$5grF9^$;eo~?ox^DNbeet
z#Sor{i0+9XC?t9pkW@8Z2S?6Jb$RVZ4s5w_+ElJ5*U(&hv5xN9V}22f&>7Nx5~gFz
zjLHj<$xJ-R^RPMO`6{H$vh+y0m)=7U%O-~=l+B4(GQyoQCza1s8Xmo#UAV-KeJO4z
zfp@UK8IYYp(-jv{%3zKcN%d9ePg*p5&F*$0_`s@WvC#D_^c=&}<uggH=Y5&<?v_s*
zKjXD{R>H)7rOeG=aiD{r;Q?KgP>K()!A2oRk+isVqQIM#!11hP$MXS(8VvA|^Ti#`
zT_l*^#2P2o)(kCf)vKoHv@#hDiRHkxH<eG;Fy67BS=U3)Om2zLUl<~hzqCi|hZV(6
z<)MfmGURUjl!SqXM>CwZ<{R^7{hq<a$3iVx;a|3f44)pLQV^(oC}3;HM*>5vV<X1b
z6C`d4UUb5lZi?!K9&0RlrEwrA_4P@=;7*QKhokf;p2w*s-MOuawC$=GIavneu)uz4
zcnyW`u;Xat6W-VN`3#?_rSuOmt~?hedumiUIUmgEUi8)HTQQCMw^1ftzjN<2-ZDkZ
zrWDsZOfe_6m{G_XJkmm3h<oRjEtLKE@XZi6`O^_9sfV3jMK&}X?xkON^ug)Pab9Pu
zi97Cg`D&VN{dIIYItf%8D%4Ju=tY(vmwIZS41L?D%WKR)L}5g5q$KLU2RaIa0dOQ7
zO%|`FMghrFFuB6Zs$SpQuELv2!%t-k;7AGu_8ck7Qo(Q$(k=>@(0s~r=(`xPm~3GN
z(FiwiLcG(?dY1@8L@*_07ZV~1Q=tzqBx%|!fd<eu5(zMeG*CxO17n90<@3lo7;Bvp
zxi`*+ioby!_t4tHr(PO*Prn3f$!6)Y$2*77i3l&m&?9O$PFxS00HB`Ev!U|v;|IqS
z>h9dB#0+m-s!mAPja%mJ=x5?e_fO~JD|2icv#^IczB1Bosryo~kEw9&75AZ}C>oov
zc(1|YDUTahS-1R>d(0@8FzZ*$PwpRX)E+ctaS|URw{yvC#&&o_TQQZ#_?Fx;GCKy(
zroD`>Y)XRq80W5ZTt&{zNyQ7@h+6tGChCX1NdKM&6T{0qyDl6Mj0wqEIJ-o`j#J#S
znpo#nt(g;ma!|7j8rUvozP)^^)(_A9;4q_)Qih^96YsNm;k^AB`o;HeWf>ZH70BP2
zE53Q@?I}w!BKjhR64BWIwkA1!nX4+iwC}N@ux0*Fd|upYMPiOilcl5?NA$HXR-*WO
z8G??{B9iV-36x}&2GvC=ee`(qKBN&^q-+T74%iogU&zD6Gf8u0qK?`~+Fh+zMNnkU
z<MZSw1+m{6A(wqP92QK}g(3?NU44hTdgO-ixtsnhq958ZCxjB@rlGG54I?{l-%NUX
z)ct7^MB1^BOqB7_D|;p#sm>Y=!)Uf7<U8&>ze5*_W!rB_&)MsA7xSf$^y`hKuXB1;
zof@^*O>V<;cdchubcbM>tJkI?F_bznqjL*uOfJGO!x}D&>lb|)lP3<&9%M{)y2LhV
zzqLX6Bi!N;5e*W9jo4kRSL6(}JX#b8>y@nM%h!6&8Xli`{LRx<*|0$PaU`M@e&K1{
zqqkaCjNYLgRVPbN$mX3FKVgx6HlijfMnJxDJav(`(h4tLF2!~16AkjZuuA^3!oKWv
zrIb+0INjxJn#K#*A5*ZvhM_XLhp*+3ie>yWN0eGU4A?Rl6&Y-1<C*hw^>3OtP8uw}
zUqh0g_KdzwW5>BdT|utouHH=YPGOJALXvSw+*K!jraI;cCtFVI*}i*wOWBN7QqC>3
zMHF=BM?Y3+me<qydN@T}%%s)Bi0#y7QuM%iJh^ZI`cu-U-sAUsY0M(eGJe5ss;|!u
zy77iddklVxe9MDsK{9q*YhkUkQ>Tu-8Z-B0DCK(eup1x5lRW69UGe^6`>Ng#{Fucv
z-^Vzr9EG<!ipw?hrE2BP%}Dv{F-HP&M9bJS9;=}mZ1yt@t~IrjvVX@t-jBZHdZ@ze
z(6tA)Dlu2weB^4$2DO)3u!BuZjc!ZV2ea>?P{O>yqt#)j-z)gtfAZO@^5#+0T#h@D
zJ|zM+fe*j`Jt}l!73oP4RHRF1LvJcT75sZlj_(`WI;A!2*H6RU5$I+>5hM`qAI|Z*
zZ*1SBDu&r<D)W@#5nJ9gJ+GT^BhHv3XFl1*26`u*7QN%_e^RSkr-@bU43pUHlGH>t
z&xebtYsZq^<LOj;#AuX;+!8jBL6qkYrXBO0a=G0$@x|@#m(4xhZfFf60ujNi$lVNc
z9RrOkq=1v1W&WzbdouCy9p!Z8*{z|c^r2WX>bnVvDQRXj`w*9TMCI1wd3_|mb6m_B
zsWA^o=or~bn>}P?ux^+yZmVP!!O@e=uq>DD%4l9*`w=U8E8XV_B~R!>GF0|ny9*k~
zNBJ|!yiB^ZzMS+Hxs4B9ZJXX$^vTAIdwf%KbP&8u7i+IHelRcs`kv(8LD3`(h6!oj
zP5ObZgU-TwsDQ&DDp(|GkQ$$3(p=c}>i*22pQN(br&swGK7MsJNwp5`(^@t2KNTvU
zR*>$un4lE!T1Vzk7dIpqDI+HG;9T=NkD#wdI}Mk=cdV^*-QD+W(=wBY28rNn>~0`l
zOantbl!|Q2Sdkh68PyX^1VBp6e&l)US^CAx<TRCM)p*F01y3@YQEJ+a>&Gh8zQ&1l
zjFa#461p_nYn++Vf5s(V<M@&BT*k9o^iG0t`Dgp=uCLM9ed8mt(TgI2M0d4@RRty^
zaPy?j$SX+wb!yz_ib8M^aN?Twi)XK>t=;@Pjq>cDmOT>S;~z@WyKqRkIBye;<3E(Q
zu0c)immJ&Xs#P4b5TAeG=IF<%@)fnb5oU|k#lG9matxk5ZO69n&%VK2eec`={m62F
zrF!*J(lcJp&Mo~QQHKc}L_qOG@0kUL$$ea<!6LynWv=2O9~_VaTVKOjESD~<*|v!X
zvJ}rgX`tVee}}1pyq#@y5p%?O&++OgjmBG_0zCY3+-@D8-{9PXqKD(^BLq%xJ?VS7
zC*^|GoNCOK?;;8Y<_eqbFZ>!_B0AK)SY{}$)_Tddq<d*ewUQ`YseN-WU|C>z24-3s
zk+DqeTWt?}+<0SjW>zVY>P*QV`jhreB}KRLQ$Ebg27hr<#i>OPL`g~WCRjYj2ETcp
z#ceTVDOvu;ov-|n>Et5M>r9yCm-nn%3zwUB+7Oxu)J5!Or&Tqs&6^xpqb(JZ2*3j&
ztHo`f_r3pdH*BR}G2oyQt*u^TnWd?>4ZbP1N(@DJ=@{E$>sO@R;}>R7p*d38_{gHZ
z+^>EuY=tY+FKkP1<ygX`4>5KnOcg2cotM7XmRMPd`i?)zv%tH=v?VM&>tVB3yl_!r
z!;t6WK<tRYyNv8uVG&pE)1#)Btr#@N<i+nGbth^`Ene3q7s-t-9p(0u{zODrA~02A
z7Xt=`K>-VPc)2kgi7|j<&?L4@p$C-3v#5UUwJH?i9%eoDu?8;SJ}3D>dL$9*L@=r3
zE;heV$SMGnmZA4DhOq$K1&$PiecmSpk&l$bS=DKqpMrb66#|*T5l(?=Q1|^UP!^T)
z^hh=puFSphov$=W_Xv!!bP(Z(2rfnMVs%cGpOpP3;;h=DlwBl!DPE96HyV;D_B?W;
z6Cy&E7`&6%1vXcW2Q~qrR4QvXg82eBr)3j09T1PUFm&Da7{Ao~@0T_C@)$*D51fTx
z==k)^zH<Hy)<lMb{DA7=w2tF*<s?NiM$PH2k*D9=IZQw7m!W9k=(fTZ*n8iK_BHku
z_R{W2UNxB5@H0=kaE#soGF<G#>GtZWd(%N7wQD7d^lOn&Jt3oGEgzsG%alSf7m|xa
zyso=_>C=MKlYLiWN}G;ScuyPD`pjr$nsH%j%5OHv+Vd=f`pEwFc;16t!G6~*x84kH
z)NXl)$@Dy6E9G2zb0%%jukDm=0<+KFsPQBDv##P6Lu}G76H#r`dEdgiA6ci?glI2q
z93AZW`h%4Fh~rlxsw9H*5SU%8yHw+37)gQs)N{JoeCvFq@;#2oBbYT$yJ0uO`hGKL
zNDWi)z-VyfTyS`$zy$eH37M1K<~+tjf^nM6H_#+^$~npK!lM?KWc){GjE@LS2@{ti
zeL&JJwygcq%r8IWOylwvyEl}*R!^<g*-l`uMFw>|<)V7_&S-oCBa>5I-SH#0ZM=Bs
zqR7H=8y9n}PU@=bHI53@<3`xgBDteH5a};OWF>-skceHFjhz0Y7&71+n^h>gS$)8V
z0w<Ozt-P%j^^Grw`dCyAg(cDxj=+4A1k8<IN}o?@rav3@*25knAgl^YEE8Dkg?zja
zIdfob;zi1RWpt`D_96{RxR8UZUkj7rCCpTvHQt2!O6QIzH3{b~KLPE{wH~U3;@b8d
zq-{uNiaJEAsY9_<ii^E@GKxdK>7(+2_yqbHi7m75pM2=<8pR65dw4#1b277lo|J(z
zA>%Qhm07tsBZG^CqE1h}g;-!s)yW?m6@1mS3ST0dy)z82X4koT-R^<C6*3woq9P)Q
zN@_RO@Xwc0YqE=iU*rvv8ei7Y?HBNPC^R8w$wi%h!~@l&OmTx&G|b97Xe9+sV|h&f
z>h*<NHZPvMBa2zhGyJ@?FX_fY@td&sdr6eo9f`0;1gan;cJq-(4@=(!ryctaxd;vP
zHLHA?cxW!}$EA|FEG{yJEdTa<x9rI25kVSloRO&&Je%kk1<{ZUz5t~bxTqIKBPd+a
zM8j16M5Z?H1M-vV>w^bmIdtrUZ?r{lJxuJFBV95)QSLjp_rw0<{8smA?aE84z2t1j
zBw7NPXI9~MkqO6c9@l!mmz-(+`NpM1k0rkgd#zjGgfG*7e#{17>yDjZ#BRRfNyK*2
z0pckyWdzfZ_4eG8)w_}vj^A<>xI<2x2Ja9WVF`(kR@Sy&NV9t_V60an&(Bv?t%2QN
zpf5?L=d4YO%FNYzl|28%LAJ^tTQWd?J+6Ji+#RjL_DnYH)|~}r-LEj8#QbD72m1D6
zqY2&ir@_o=tOK~oo`)s4Y$onXk)|A(mI&N6HkbA9OSzI(r^fx^FLrhx&{I3)+l_cX
zw^Pt?$^^AAg_QV6C2)ghPjlDXVCtFQH+Mor==Qv>pi3h{nuyOvV0K|Ss&QCGVBJIJ
zQyVO#ZW+8*-1(s4>sA5f;IW)}5;SKui_X)41J3gE<!ag0y{>bjnh7cmED2otckCi$
z4y$Xc+a;)1a)<C`nBBKZOE(t2Z}_^xrKPcM#NtJ}N{m;`X5>7D@uuE8%cNzUkh=+p
zuLq4E%Y;O0te;dJw?KUOp#m>w`$FxhDoB1gvb<pANso0{*BWF<s_iqKcCh4oB7zcu
z2f&wt|6bsU8EWBLptNH?5UVWII_0$NJigk$cB?WWQB{rJdIj+|EK>8lIkf85U~^SW
zmqO_4yG320J?dla(h;3;Q4bTuLeHs)P9vISn}e5%K5}z!6nAypr)-6doY^PxW#F;{
z?^Fxhs10+Q4fC{FnJ;hQIe(+vrUL%qsi-aa79TM(vGZ!Zfv2W#-o5&2BZ0+bEzP~Z
zG@wjdrZBgR(@oFkq~IwFr$^G<J(L|^N{1N?Ye)`+K0<Z-yguA-qE^A;hfzOm>E?O$
zyP0NhsE4K66~4He1&3s_ayz;NA#Y+cxpBgzHt#Y@@i>m--gj<vNnDGnLTNa@f1+NV
z-gwqZ>i!Q}BgY44g?ex~0|it|<xX0?z1Y$lwJGM#_-jm$b1(O_>gwq}F-5)S#WGVD
z*jku2xEx5gr5<BEWn^HK5y1B2+o#2kG8cdB74<pqL!?`Xpd`s%m>&v-F#=E<oP;qi
zFO0}X9?`BKQ9hcu6i#tv8aueVty`|Oqb-HKITQh{1~Ft>1`{a}Zis-V-8ezQKn!d;
zpj?XH!!<){NTg+ASOT>h?-8p-PX-*hSj)`0THnSq>y4Zb_1XFmc-@1^PA=3q=uvRC
zoVY9Lb!Hx3w#tuH1u0im&E<^)3{CcRyb3RF?c^QL5?v0~5Dw$jk4j^Y7jy8a3LL}G
z-Mg1;bo**q)Gc`ImkKv#?o&A@4*4b=4QUpJjZal`wcj4Bd6^US<WlZl1T_(9h@b@&
zu-5tSQ$vPW4*)H^{|>nT4fGgW8iA_EZ`RQjj2s~K#v3NdSQz_thKCwISio;oB#$ym
zPM>iIZKm!hV*Fqjrk?-Ug!jc#I`VFngcDi2;qAuh+-dKQ;G<2q@6fbuRF>a;+s+ls
zZN|)(tx}I{X`W+G>0T^zJaI9_r9RgGa;C_jSeqHSA;*N&hbwE@XT|POMAW9IT!C<z
zM6z5|{Frqt=E9eQ#%9pI51bla(UJP`fqp@w>a^H%qJ9QEE2zT=rnylL)p5ljfrK<{
zOErFFa=o`a>&I}kWSz9|{0R$lyXW#8ACg){k@Ep&EBMcYZ6P%1rJT*#7qpCMhnR#`
z%92rRSWdcuiGP)HBHc4Y8}DcO8%=D~nqOrP)Wr%gaV51+($i&MI>mh}IurAfWG3gR
zJ2NHyoBK^C9|bXMvbrX$?Jdxr3!2}$`|-h(p&=fw4}q;jI)?~OLGH$#h#6?$NENAS
zdPG((JKmYJOBD&9-WsBMN=ZQ%rGU41lNEgvDr)-pZG4e)(<BNJJV_pYQDn&7Y+At>
zs$r)uk?r>}V1;|wJB_+4aj=Zmv3TER^pWwome&Y2r>Y@`f&B|scjeL*JBlrv1(;a1
zU)b<D(H~7midXo^NkV7U%u;RQJWSY8j91a2>b#+a*K)hb6d9vS)nYKX1K)1LV)0?e
zWAYITK4R{tdUT{*dl$@#KTRibTX~BMvBHWuS5`5ktOhQ{pA^*VzkBRk^|_Zg>8~~0
z&C)-0?pepDMq}%iYeOCim=|v*A4g1mi@rNMyRk`n%KAhE5gif(RFb>!7CLbn^vZ;z
z6pgI;mr?bzvUP>k&P#PA%9OrAo6}G)S=nB1esR40T=Z3uYn)T{_n~%g`lPlx9?CZ(
z6en`T%u@Z@M!Fb%8j%Q+O0|v=*_`@)ZI)Dw_NhEi>Y<0MVmC}an<uXHRLM-lR#!Nl
zlFO-&T%c+x+iM#5y}B((t8wFG|8%ObfE`o#r=WL-3^6j<`yUuD;0mwYpSULC#k{iF
zr@Z)q`~jcrBoVQR04>Q~Y;2<*pbdc)JZ2ALo}(HhPjKd+vr-Pu_7KOnUTo$WN6gIj
zun9WO=<}w9af;g+`DE=?P;Je#W-mM%E&>9S<T%Eqoh$)ZWCu2q+{M1ET(k^*AOQW=
zG8>OMU*xA$iU@jn>GqL|sP8NMV=aBw7V#ui^&Qjl1^u`lewp+8uklWsa#%blefm6u
zHYy{!j&5>7XNV?Gij~ND0N@PQ4xW&RUFb=|080-n3>Vgze<<)r!~FGf4nO-*_bj^4
zguB5qtRjgw{gYv0KJQy2`G9os%<A!>uG8q*lnODQz=xHOVI>Lmk&Y1|4K8U9_h!iM
zRUYcgH&ABf?j5*obhvDsO#Vuh$G!=>2SKToyt9!|+rw<4iq8sEnONL=gmNY7fF;t)
z)DIKwMz>p24hB)|HId>Rnj2b9meUjL!!siXiX=J@QZuO@lHItP_k!uY1^PLXaiEo<
z_%!KbneTAjJ$snEJ4x(sY9`&EZuTr+xldvG^~c_u$Ba{U_8FjxfAzB+Jw^x#gDKhg
zIQ-1%7#OISh^RQ6wlVQFvhnZ%eFx45uk37t173z{+x*M}2f-jP@EsKgA3L0j7v2*G
zMSyPF5DGK$wZ#)+(?A-4FV(ei2O_+b{%(5veWmTFE-?fgiGYfsfpt7W64=f~AOSB7
zM09~9bZqcA7eeTlI2;c4G4T6O&I$qpnbQX%(ttoO&;bJBVyYfM+!h$G_HUfruObLc
z#l_Ll0f;sO!^=)U!4KR%UN&|><QpKL7Z`@-=HQ6`nZ>y{Ipd+=2?9G$cXu1$xcONx
zX9o}9sRPc%(;kWjmf`^pI8O-7(*uwZf5sE|<BW3v@{T|R8}Qe~&jAV^n)7k+1K)EX
zGzf0|*m=5ndO$J2BSk<tQ1Cmg3RDsWfoVX&fus$E=|H7`@1+J%7^qVq$PNmg543<{
zf!!*A6I-AOD9j#c5}*Qw0TF=CP?!r4>jpIB28FppVIBZiPz2B`Fn|%DCjp^+p)fx{
zw}8w6z*%D;t_^1DVvlzQGzS8{-$UdGo%!pS&%eg<TXz%s=701p&~a`yPCkHkch$>^
zK6YUA9~KFG?*u7;H)2TeJCCxBm-@EO;1HPEuPhK-2t+-)*eQBAxj6ug!;J6_?j~SH
z$Ht!^5g-s6`Fr5uFF<<&ih;xbe<aBNYda_ax}pI)Mgw!G6iNz;L17?RV67=B1&1Pm
zX&nW0Gnga*KEdZ;eH1Xa5w0jnu>SuoL4G|)0d>Iyw(%=T0AJt{a8LnKXdnR%VWpsw
z0J6sb<3Iu+E(s<HBov92fB*@wPYhtbKqJ7wlmx6@QVL)I4&=cr_#6c!AgWUmcpq#N
zD}@EF+hyRDAPh<p0$xETK%u~d0Spd&R{{`xyA7Z&K$nmQ)_+LgAQzY<B(a1m7Vz>I
zpd5oh0s?N+MFVstfLQ~D1>ZwMC4j3Gpe*o8NI+01s0>gE;1#4#=rmA)01tpdAs|f&
z;2nZeelZ7L3HAe0LW4JOP(>iyplIL~NQC|X$!#Zy#Lqtar>X4>z~FWhNI+uXH3^U_
zfOGJ5&<uWc*RLzUE#wy?gci_}KqVk>0AwGugMZowYk`FXV**+N@Be89WC{+45T5;P
z0E@vwe<jcmwrv2kgFh2N*MMDsHt|~`*aK)2zY@V#e%b_R55FgnFTy?0251|<5-2$a
zE&=o*z$<72L=sjC=o8Q`0EhK^`ehTpuAowYeuogQVE(s6=<Dt0V4Z&wXcu6wg4ch8
zEq-4&DFP0NFc$)oGN43)M^k|!KnMoR!x~V+!F3>~4F!6ZP^JR~CR9Ss019+6kb?kI
zHu9G@gaRE-cxntq68sZ5i6ei~uz@0f(y#^ex?Rr>N;rxIw6g6ZfUY9EzwLqmWfJb}
z0lVJLIY0pc31!<eB_JN*&Ji#dFbCQ8Lcl24E(RSF&^?43Ct&n!=bQmM`kCAIalnaN
zLKPP%Fq0CRaf70Nl5_`sBH^hA6#bK&CorRJmw~<z_@+X5{;x*`EQQbtq2#Ay{Ym?$
zHzjPi0jk9TV`}?tA7Hrr%n|&j#835Zdu-q<7J&vH^sw7Gg6{?9tnE9%9RqV0A@2(X
zoGu~f2XsD|g8)8|0Am4)pbrP^hfv|);CYbPFBhl=47_b3|2_xF|9Y;7_#JHg6HX6P
zRs<@+>}-G?4$$2yBL8PKz}Np<4fQ{(!No!FU;{uPDWJvQTpY;VPj3EY5!A$vLQo|;
z3PC;p5@y@qZ~sRDn3jt@2$X<<3#yM0W$z34XM#LG!<sO^zpM;I%7bw5Fo4^<+&uAa
zF1AoVlsHx#DGGJQ<Gp;OVKDc99*KM6oQ^<X0DiOgwF3h0e}Bu%-VthRW9JG``Psq|
z01<%AyLfu2*Z}{n;IN7`5{?8M0kH3kfg>g07Dqr-^OG%qoP#3-oD(5{%lr8Qcr*!M
zfH*?`DFZ<x_yhI$Sq6Lw212d=E<=LW_E$Y5;6eW`LxThQuX<>3tNiaWz@_|I2HXK~
z@b5AdIOf2gfB1u=;NY11vkZ;|fZtzbC_puTl_AiW9W)SVG&nZ?Xb%B6>>WCgkOU{h
zKk5Oz>}U^;LhjH393?@do)jD~oWI(`g45-nWoY;g{(u9D;6(gaJqd7*`KwF{oHPC`
zlR#pCA@EljKx9Xm1URq!RSyeJuYZ;S=hb(}0hrki8W_o)@&e9x$Geh1Fa1qJ5)G^n
z{-P_11t<7Fd67cwv<)frPI&=9w?i*jI5=zn*&Y@Npo725&|t{oU-cw*j1M>#1009_
zvmOXu{%#Lg`hl+JkM<C75TX2C1|rtK%dqerV+(=UAsYgL1iiwaG!SS2t^Qpmh22pP
z2?yQfpYI|OJ8T?Wv+j@$DY-+Z2p}9_hioVi5B^01h2Ak{fWvR#QteM(09*U5Jv`3F
z#mxan_#ZJwE&&d}{0A%{Jw3rGo8SO7JRAXM2Ts|91)mSz28Sn1l?c#4Ap!!bdTNmW
E1NdqxP5=M^

literal 0
HcmV?d00001

diff --git a/benchmarking/throughput_vs_tpot.pdf b/benchmarking/throughput_vs_tpot.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d17ec837758d25c127c21785e653e3386e33c13a
GIT binary patch
literal 28243
zcmd_TbzD`=^8ieV($XMxY3Uo>3ztwDNu@)&k%kK*iqaw7A)vI<B_bWtC?E>b3MwcF
z3J53y?>UIi*WmX)e%{af-y@&(?3uG?cV>2WXLe@KUd{`03IZ@eVM5N5DM)cWAq)bA
zxL8~wl#qZ3X<u=*h6u@;W6T|0Y#~Ay%x$ebAaJ07CgjW+LThI$u%PHqFDSS;V<3ow
z0z&%dEiPGGVjv>lZ{@u)idq<Rj5S2~JK%yj24n5+3_*goghE<&=2rI3wh+|eQ+F3j
zEo%(qBG9ay0)UFOHwGf4<OHA~b9j|KyedJAe?kZLCjlVdAolLo0CwNe7t*x$aPf4v
z1n2?r2ijM&wz4;uaq$LPgaUssVFX-ISQLUlA_bA62oWUEh>$Em9#Dq|1pOUJX=i5_
z;0^}9@H0!m$3J+eYVB-`v4g-4+f}f41gM4xDL4YGkh8XQv9bn{_rSPYn>!JDXFN8Z
zjGj`6_bo4fnTzLWyRbVaV&CL2Ak|*tgiW;U&0HfXpi({k)KFl&_1bQhS=>NR5r0*c
zHivGlqvvV*NYTi~%oAKLEiL=I9n-t#&o(aaeO{P$+IzQRUs!+R<yLv-@}B?f4NTDv
zcFT=vrEeQ`TBCO6?0es!yW^tVM#62<ToJctR=rle9tgR9p)A`re^710F_s3&$ft=B
z&x}8(Y&+b+@zjJcMO$dnw41Z({cJsM@PvJwMA>tNbE%tOns)|QEHCfKBoNf^AdLJk
zvbDo_uEq0Sq0gD#9ZEVKbXxO*AfB@$jk}81+12aI*BMq>v6&i<w+31%WmZ;rt?sUH
z4((fRG3@a*Fc26^6P4E_Ph47kMUTF=Rd@A^&E=idBCV>c>t#1YUG|!H9JU7P7N%$G
zMQ+UOPTrc1X);fdt(iaR8G4c@KX{~9v?J5vsctL{^^^9zJleiOs+^h`{yD2RHlos%
zk<#z9>yq*xiwf{$boX+QV@^ETMzr@j)2g!)xH!eve&kiTWp1b`N~Q&sQ4H<rxY5C;
z?p*EqlrTi;gC#6dy<Inv=GkM5yx})FNCg~q+Z5OYf3h&uaiP-nU_FD0$fxn`S29Yd
zbb0CXN7M!8UO$M)KOL;0!JzX`qXfQ!59u=XMg_OfrjN8>k9hq3FEj`eFy%b;Fr4|;
zZ90Q#k+^!Tes}HNjjRNJtC{MZm==lI_b_6g+!Komu#eWm9O7jAmTIFXmXsx_r;cTj
zTnJcKQYcT{YTIj}`>^Tbdy>u~UXM7P-@!k`VM11G5ls>$0v*9&oSGe!qC_#g-t<XO
z47=(e5y;hVc;{96<`9(+Z*EJgJ9~S6kdxH8C;dStX9A<*8M;ZBGh$6RJ8?q^F23Z!
zeE7tJe2kS>Lop#9-W2!X5-Dqw+xBT&Xz+2$FE)_`w@%L0;|CF{d7MD=<Rz%bHQA76
zmv%D`Tx@4xx^ld~itAcx0XB1)Eh&>Db*>xFMA-91MzM=zXAODGb%eX%-l-MGZp(}@
z-_aZ)mNCSCb-zl`AXPjtcE*P|r}K6)SKlM@zILvlx-#r+nGr^rE=%(F36*g37}3Dk
zGx~aQSqx9_mYfKOtesFS_PJm9`P-YNl>ObW`B&HX);%N?rK4{9e*L<1m11vg*{t$2
zW(RZQaz1qmMcEBuI?v4RpeUlsz0HntKb8uboBGif9%H+h8m_Kb>x|1{UUUhKn<iV8
z9|BHv;f2qX=bdA{bT;V0Bj3TsNn4-V8&sv2JxW6r$Itfm3eNlY+THr}qVN?D=0Umj
z)^6cFG%G_z_-E{DO5IOgJT5S7`_5B0_<Azgmb4PN+E}WXBL;j!!@7IwPxIqfm_b?E
z&o5_vr7n6Pj9--MYHK&5Z+(xr<h}Z=xs*pUL7=O!-=4XGH%roFZjNY>iP>u<LM+Q;
zd6@RxN@;^P#ulaJ<aedXXefE^1$t7fyeQkg#V*RuT$x>{GLW+FZ%3HZ|FR`MdUKwl
zVcXAcO+F*bqnfcW!BJ!^-CrU#nEHOgnMwTEHzy0z10y*T$+`?9?#({Ei7>lWVhNw9
z7}di<Z@%&`bn5ZO2oamLHSC2pbGHZd*_{jsd*)$en=3Uai?<<e7!(f=WA~<mC^B+J
zuNhpvb>Rufpf7gN@!;Vpt4Y^S%%~1&-I@q8h1?WuD@<Sz59C-$zxNW$7va1uXJV}j
zo-OGj+ua)yexf;>EXB{x7-l6<r>4JgIRAc$r41UjiFLV!X2TD@xUK1<Hu24s^HvWA
z9oN)~SK#UrH6*-Ie0B3n$EpiU{q3x+^<w%Po`W;jAMTg!Y^+RogvM+!-5zi@rIYbg
zF2Nf2HMCASc}G1#Ku+z_qurP3GeXGfSWR|ayQM{4RrRNp_Nq#TnerM@r7K>!u{QDK
zrTiWHuswy5%3F4QZ#D!JJ*r+u8)W5^H3WSQlkxsk3wx71;#)VQs$88ozSr70-hYiR
zM1guxypOhNib=3Cq*F(F$D*SgsvkF~Z=NswAlrBM-Rc(EH`rF7`dNfY{kL;AGM#q*
z^7wINbfea4dlk9*WxaYP$yA$WHdbC1+ue)~{3(`aC<qIDswgg38d_?|Yew$&uH>g0
z=jJKOd*Bxs*@b4Hp6u>b-d3tAd}>y`NS5ERyD7%?FuN3+E3IsZShn)^T3HS5D|=j<
zT$AI~r?N{p1n$>`#;P+{ExfE_GQ^|JTeEfu?t#OY?YeBbPm6ll9S<Rr%d5?QUiCg(
z!auYmr&}12m&VlTVXj+Um@4`Dd?O8utVrVSY4546nz^ymSLq2&t1UA(b~m<P%}+lr
z4t)^g;xZ5gTk~uZc_Ki*P=DuUs{aQKvW!R8$F1gyOHKNB#B9__JCp|4X=E7kD({84
z3|Ky0v?8rf%P|;2>uhib2S-K7=~F))?yX9Y&UhK3BG0WA9=5GL;fJQgCB7!mpno>W
zg0t+VU`Q&RuO)eYDhdBL_esWkMo!U7?IBYHsZCYy(@W+d#dly$^s|n#pZKfVzO`A;
zzCe^c>iEJ$bk)11<+W9dPvidD=G*L5E!jbeuL@0#Uq-qMg>JXZ_@dFTjBBFIn@F;T
zwa7xJ1^WB<y58`ux+q`ec!21>l)PRSBAgf2*0_l0{jzK1%!fPNnvj=+1sNYto=8m&
z6yk1NzA;W~PO*8s&UTpA%_H15n3r4PYwj|AaFxjBb1JO7jE|5^2`g`JXT@lP8L@7W
zu?%rpd&L#>)Fp$@7qVJqb%!m}>7Ncn&pm_|k84>m_wh2rC*5pXv}+RR%k1B!zULCW
zJkVd;{Lqqj*^3>@)9{8vK#Vh?tdU;O$!)xy$B3JU$dQ`-`b2}fI@`UN@G8AVk)p6^
zKdtOVuh-eyG^J$uyG=LC?pC;bqWGk(8bQcF8|3p4kBjxqAZDDLdJ};%n0r)b8%vsH
zO{`!rOkmK@lS-7SwU~mhx*Np6SA4yf7r*MJo^Jo0$6Qn~tdsq>4L14DPS1;qw#>gG
z95HpPn%sRiKfSQBv(vDWV!OY)`7Ud7lT%AlTXd~R3GKWZ6&>JpH*(swrqVOHnjtrg
zl72o(Q77inQ^tEuhNC!q6eVXP5%Q&M3ALj^{Ja^Z1Z{gv#%;XiYVQ5kzHd)ARI_O^
zSKcKrm2YB8K(XJ?b7kgPxR<LyAEn&?9MK(|!^f7Cv6zq|-}%quQ%ytD6{NLZ51E(h
zv@w!-{Pn@LPebBOA3xR20}KbYV1ec^4f848=2_MIx)EhzRQ17X%(7Q{<<38~$xEhs
z!X7*mN+7O3Tc|g=B(En&IL$VseUZ3Tbt3W^U;*R}?NmJKXweLQL%v&g38=o}1(ha#
z@lEVhrHgTf+M*GOxTqNgdz{f$+h?-QbXug-GFMB+!N<4a0OKPjv7;yEgQf8B4uaS5
zyiW97+{t~f&wazl6o)_?-F378l%ZJBp+2%8hs;-Bx;{#)(a-}^bbqC~zyb6A&ZY-L
zp?Bt+MQw*p8?~{;XVTB#qSUFM$K~ohPD<8s$~6srp3H#mia5pHZAipXJi*mXdXq!M
zJyM5V{W@|KIf_v;NxC03DXNorQvKHZQTInhwZr8d@sFYy=aX(ESeEJp={O}=K38tP
zw>@}z0ITnjK>xnrqtBCsSeb2x?c<45)Z=KQY<7R=Y!s&tOokP&qPq)svI+}2X>EVw
z>;w%Xb@gW*iR7_wqCSrwD}G2+(#8t&8pTG;Y))!87h?%;teD5|ycaK7etB<%DgZ^e
zRP%1h0N2g;QCOKKu4o#R20PlO=VS<*RJVB~4qgA9`%&bj_k84)iqutZ-fvc~Gf6y)
zV7?VjRk1eRg5w~wJpWuS)tN)mS2@Kyl<9pzuwQ~XcK@0qLo%0nOoov1vv^mjq%aHO
zKpw)vyQiczSS_7>sWXIBLPg7iWKt&dCV7ZMckYum=E=F661)&s)&3Ha<Tox@Y}He1
zJ|+sG?ep#`8!$^fZbU~-QLAFNO5t+Vi`6W6s;qLfbuMiEdWrjt-4$l@C#Ti|S}Db7
z331d~zXi2zMY^}DyvP33O1fvUwY>MBeCxdFS9`LAnUxIH07dUBT2f)5Cp;vN+fcdT
zb@3Ia1+m}PyV&4e9IO<=rD=mmsCCDSAhbo}xZIs-_j%*iZ7sN-%3?}>@1<Ppa^cMf
z6=^l}_c(*s&zY-s%<?$gC2YuKHYK|&CF|G<FEFc4tNCIU4Lnx#xPYR-MKsYtQcV5M
z7IXuG+|+KdNqlU*C-3n&z(iKODaKMzL^ZS{m<>F<>2pk#Z|aow<I|6MZYhR|G1b!Q
zW9>wcYUz_@zYub?E7wpxV=!^^X->OA9@}TJK%GKmLX~MR!)x++cg4-n8fBQ&L$ox?
ziMEsSQu^7yC>b(N!frk;9+~5NayQpT!TNl0zJg(6LI2hy{l(nQvCB8L#&fkwYrFwb
z)()mIo#j!tj#0n=bPj2pt3_>i&9{({?dnER8cq2nhTWp!IR??L@a5i0UBkvAC-)MW
z2$z?1xmDaPr|dtam-QA~9=<TaTKjYk91BRpYq;H_agyj!<p{m|YSE8J=1eHY7E$?~
zLV!ryeY$IxF{}*_Y9Rl-9BvaS=V)hXy*L|Dc=;*rRo|<=hS#e4znVGKs(syi-n6pm
z$#4HjYT=n(@jFR<{)olr#a7-UF2^56QEzR`+#J3tc4};LPrz92PC=(UyoRJwNk5I>
zxrLDLOd7OmE|8vU96RliG#&?Q4;@od{%IyWow1SZ+l`+*Ek3iUPhEs8sM6UjSQ)*y
zrE#oG%+Bc9Bf$yuyd`MiX{~&%Dq_YpW<XRW)8<Bjl-o^Qv6OHHu8H>>C3Q7^4Cnjl
zX1?)WFja_9j37A&mqSfL{7P4{qt(f2&oo0;GrcC8A!o;}?HZDts-HyLO5TO%ac*jm
z4r4!5VLPU*#X~RY;)_C<usS&K^S>0Gb7d)J6g13I#HtR-pg)`K;ZMI6O&SpC0Gt0R
zHsGBnLMwxvt+}yQ8_t-f)CHp}FLj?+yjiYDHRSM~?pU9PNLo$WX?;!6^Hg%&e$i_6
zx8){_m&asz?R})#&6nRs!|eGq0nv_A-ay?=+2ujnW@uc=kk=>Y89VP%3wd5uL0@>q
zbNzH%p&zG=J3i&BV?06B8aWC48ErC!ANf>nxq%a?cAndFxtooMCn1vRtR<m|?R|9T
zHTvS$7Utd}(o%E;8P}r5+j$KqRXn<uq`I#JAz!gU7eCjowc2wzhdOAy!7(E3H!6vy
zV4AGWSeH=eEg{gWdND$G1L}rw6E}+BrG1&BM_r0MA?MkZJ=>iYO#Mnzd&SR5<qHY6
zVDrnyUCyL+*I`tgI*~#lreCr5_JCmC83Cs9EMt9-&nP<^djG~bM^&L`iJf#oB+bbt
z`m=ml6se7u9s{5#6P-1prK_DDI0<^C1R?2F!^O%0=Dy`O6I);FdW2*bIb%wa&0n-4
zvM779?ibJ%z2_R_<q08?u(bDI*0+q#E^u^T678;N)|1zTxZJ;(K&<1LYk8le13&9i
zjnrldvj2jP$Q9IW>o5(1g!Tf15F-EGT7KNoO9Z8y&qu|lZi_DN4vl)`N6&P8SSOBF
zPkaF%j1|ZYwB4}EjI`mm)2){{gQ5$4o^W&YWG>ywT0v!GR%`K#-h$Zx&x=*8N@69m
z(fntFDK<4;l=BKK3LCHyI#Th-JXh`w=?%(od~wTvhl~JEhtX8|_!*1pK((rwLWaa7
zI{f8FX}6XjSKOLe=>@1Vt4nt%jVa!w$P6YA_0KCk4NcC=ru?wz8%jb+Iu5;mmd6`P
z>a|UN>1wKL>r0B#M7!7kXqWdpj*0cRdTn2;f&z0=pc&!X3Sm;7FMPa5Voq)##fam5
z`&bFz<O-S&8Hm4W<wxii_lik)BD3f|^kv{Or&Zet;zpf(rfqKK5&htd_`-$kHiu=@
zM^T$@mZfFUH{4Iy#|%`myhQUoSy!>Jf<4?z8v~ovXRxue-*qS)L(vpl8^2}3Cl58N
zuSvfws(Ve6*&lv8wTd*F<K>(Uv$??=oS0ZPE<EmV+;g^8EYbQ!I-lT=l+EIKl_<EJ
zy8)?_Fu?@yd0ECKw||Xv*=8EnE}o+v*dyM0`ao$SdNiQ@Sjn|JvLlV2wP%xa(o9V$
z`DE9reB<xrT~4c`FSSl0G7U{s4q6-%{w#D%gmSrf&icNq#ua1IhpJQ2V>x}t8Y5{>
zwPbjyo3>Y!-_$!H^K?%(>4~+QxCtQ79|>!&sB8#^HHTT4QVOYj!^z5Zdy|;e`C;~w
zu{Js;B_H=fLWJO#8i{L}dmL=OEg>QO4hhQ+XQGvmh2!0yb=dn&^a{e%G%CBQU0&(8
z)sRKsr}*G$ooh*5<V(~|Smo<llH^d%$G}uv@!_JV?%Iby+sBw40?%9*WV<ir0%4yo
zrYy<rY__v|S^GnmIY!%Q#o1FzTlH1F)s_=L4a#%AQx3IjE8NebnxCg`sb%ohZ4E!v
zeZL!2-}~^&8q<fx;Eo!eo$EYxv*%iFh0VO<_m$5gO|4|TG+n;Bcu`!AX>(aTGu>Ha
z_p33AUyRcerT#w7aGmpux+#@PpLsOoQfXoJ7LugoGxZ&3%UN=CNb@9eZAMd(@WxGf
zF&hSK`saHH?<aQG!@eTFY&ptty}T;>G4^@b`P0J~&74B6Q&Wp7-Wb=i<kagCZ}o0Y
zMV$7QXV*@={tc3aVtweZ+ot<Os(X*&gqQ_V+a_9<>s6Y)F8<>IEjNkF@}0OeXY7ck
zCqkd)b1XVYE_ysVTm5#dZAa$ROy4=h((Q`<uNks~kvo}D<S$Gkg(ax{)71>Nd!Z#{
z^Hvw{yku%sc|#<vJ~A<D?s|vPjX&#l@E5J~=hQf^B~g~eHOY0bxOc|rl+|%B4_c&O
z=?@4lcQF2{&g^R%;_l2>`SH^=nk<&jrxwL)-S?Y(EL4p%L>$hNq-?R+&d=Vizh1tx
zG6ERxWUDjNEFw+&+xdaE4eOFu#mpZT`j2d|3?aRlHJ;FsHkdZ~>>6*qf4;tg-*7#c
zK<@*0udR?ti)g*mK2I!UX%>ek+Ut`X_fYY-5O}l4%eIgdmzJrf)$4m72X_ir#dd^@
z`IC7X_Hnm2M&5%y=0n%$p||<Kj|%!$kq9*WhYR|^0Sf{zLX3aF5TM`rHzzC_1~^{-
z#Q}?qY#?cco{+m*P)EKk>bDi1L$$f*6ed7BCHXFV-O<b{GnBn%o~Ek6RZCS<{qAMr
zOZg&9;)|S*cZGX>1W#>{xqYUty@eaE!4VfKm(rhP`^NOz8DbYki$7^W{OmeE^b^BT
z!vBt=q~^moTo`fx1H$B1Q%%X{hi+zDx0zcd#bGcNaxCdBXALyNStri^m~StvZIxap
zZ{2ZPtIv6lfA1E=qt~sM+9G+aj)H?mz?y5+wkUATjpPz_^G8Yq<p`_NBMNuT7`Bp_
z^1!jrj8mo}@4lU)XTdmL@KK2`zN6jI>|R45p8Vc8a5=Ex*3<Whk+NmK{7)ez&+^o7
zFnw-e=WoxOIVXRdm8NB+pQ(yJkMY^$!|S-Oj6VeEpD&eIK50Lr>!U(F@OA)CmEK9b
z#P7a=WxF=PN1Dg1&kVx`XvIpyzPz3qZWGqIa*C}(GdS>rWG>QZ@f3wh#}}n8ytgeM
zzn1hk-Pp%+wX>~1f}uw_-v5fBv2EnCtS97Fbt^gj{G=p%t=zX;_8p`WjI5Z<w(g1h
z$7+NkI&9~914k-_U0tXn9Zr<9Z=6uWNoal4mhMeVgk5MJXDK+VldMyiRF)o<HP&P)
z{oFnUN#Zo&zg8M^wjDW>`|;E=v&`|z(->!KL!vZ&35N{vf!u+G>*|x2CFx2Fo|TBW
zkB>=;^*_5{Vz?9Z#I9zSdrYVSX2KFog+B2ovBF@~;Wq(C{)K=es)Ti*7b<G7C_<9o
z624VPCgce*k~$NX)PIb!AG?~^PgvicR8^i-4`1&TR!by_?h#G@9Ah~YT-4t*jYR5w
zZ@QpB;g_bZ?d(Zy-uM~NbV=c#qt7tM%4<vaF!OP*gV#O9>Zd+Aa0C)yMCCX7(a^sb
zJkkB&0GC}=Z9DNw3ui}LhpcyREaaL`8`;I4IszJEmfVTpBwUw(N^VaxV-s9z85(-4
z*OTJ35z00+E$LkMD4%k;&B$eot0`xa)r?HO3|_$f{Elh)2n@kU*l!rZVaR_+5qUwS
z3>S7{a)GK@Dqo7?#6nt;&#jpoJJ7ok<fAWHJ}sq+hNSRsQ*7%tT9>?lYF+Er6v*>3
zTaxm8Wz>myO&DzwrnX%3(x~>KQeGKEmQ9&`(p@&?LJf}?=M9(ii}gGYxYg2~NBUZy
z^~YAThtVnHt6UhlZ`Ypl;of%UT4ak@1nu>1Qa;Sos}U{o&ypRsXTnsPx;F2Qr^(d|
z^~8>(s}QIOj}Nc*P(6_l$%}mgk=8T{aIlWueV3ApO96#F;Ft2!%gNxh?|#~X@P4|$
zVs4}zbp(IFNZ@b$fr<Q07sjekbU=t@w=!fDvG)DeYewD$Ox?KHpN0vfvg{X<)59z$
zg?4u(Ms3+yo(~c*dV@Kou0<ac(O_eIr{5et;?kJ@uvObU?)h@gSGvaR^%9%VavcOL
zNF<Holo{`3ce2l)3q?On#>YD9nGa>2%BzMB#C?-IspT=`5f=B2-PGN|d?@WLEv-N?
zU~Q=mFPj%R3J|}KlD^#c`I%4II+T2M9d}mxjMWh&gVE04NQNWe|1J`eu`2F(tcN7u
zcqNgk{5WS}Z?b14%`vv9b&b-Ek<xZO>_i^pw-R%k3zUta!LrZI#P2M%+DfUvJSB+X
zam}3w^0=1Saf>j$F0|Y6{<)IOH>#}fp<GIii%LDHCwA`h8B;~tGpY_vU30hGFE{i#
zOeD8Ah4y_Fo!3Se^E$+#$kXH<7}^w=mn#Nyi|zUic&9`n)r={VI&O8Y7)<xh-CQj<
znt#sUHn7$4poNh|Z+elp`$b>SxMsrJr|8e+$-`e=Ya)VuUMXJXI_K(H4AIFSeKMit
zv#T}5@pTt>qfL+L2!?_2;NKXA5dMp~2B|6j^p^CAr53G4oGgQx{wo>1dD|ZQC>(KW
z{t6kKr#Q#cyu(?GG|IL)&qGbPKVEK<h`Qa~5Nr5RJCCIf<ut#mU(H3BtiT^U-rdZ?
z`P%!^HwGL@TKo1$RRvNGT(_3dg+ZSLp3qr8VJCGKP9?IOV+|KOqJ@ay=X^9f;xoTh
zlpWKh%_i`$RO4VT)Gc`X<psj1i_((gZu&8fca&TA0?6&VEEh^R$>5CVKhJfRKlo($
z7WTQSNU96f#^}h6cts_TH%FVuA>u}>#?(1HpzqNzbV2iVb3<rCg2`4&!rfLr){IWW
zc9`sWgSS?N(gfP28CZytv1X|HhyuaoOZ82Bo1fPQI3gz*y>yie+R&sc)(PkUt8#HJ
z$M*S?>obx8aj7CL5dr~xDd;h)uazwF4DSNxzGf8jR8w<)lTDJSoIACDdv@o``nSE~
zk*BY-AJIEU7+EOPze_U$A*>C(FoeXRunFP%MG+k6paInZKYmLsEskXDSQH!?IC7Hy
z4z?@4+zn7AX*y7~9)Zabwv>qIzhj~{&_?2S;RNoGxE2)0G@E4$))lyN<Gz_iV~r38
z75%06)UG2B9qY*mYMimFG51fZljOZM<UVr^>NSXYY!>RDdwTcdlT3Gs^0TzlM__e?
zL4tsa{5x8b@hZ-Mg(tgpCRp%f>(}wD^}N&dsWUewxjEn6WE`iFzm#0BYM{j&E1VE?
z<K0Ks%FH!!$3YvLmxehh;cpTbhNvx8a;}XJC>R@XcfJp@5qq$xfu=o<QY*S$%%49k
z=uJAtm348G{A}aU9p-09K>`tsUamEduw~KCji*<?MZ#ZRQZ;teEWXMjVIrrqxOiMK
zoXs+Yc41zc;GJ^2onctgWqDlbm9?6_s{|{PGnaa*j^9gtBF<w|v4qhSbfyhuW$szK
z`|(Zg3kR9O?Fv55jdL$T5boS7c3p3u^~}ZY%|zj;#FW{wuD>jI9u+ie)Dn}ouu^v~
zv5(ME-?~6G^01?c_~d6td08HhxaaRpJDw0ZBs<c0D}O!4oIA~>LaIAHc~^b4RAPK+
zi^XsZ!}2HZkw@4^aN)nOuI1y@+>f!I2-?cZ5&Bs4T2FE?Pt!nkrG?~iBL|;h@LSvw
z97@HH6tc_VNt7(`YeI|gYY*V!AH~MgT+i1z8+BNBAj~hm6zA<1H<7(ut9dpvyJ+<k
zdPYzaiLti2?US8OG!PqBMs9%NOYxK%NnFu<b;|K#0M%#Z67I}9!i(g6A1^05T=S$n
zUtalI`!TEG<5J2~H!B>yrJn)Y6z-Ema3_1PYJ4eTC>9emOW>T_@SD1mWOL0&Q;8fJ
zc&((@ZU6^0l+a*H+m)`uEFY~WoLV8Sg#75YHCNlbg>s8or`-;|JiE_F>N|#W<_4C^
zo-zIr%saxu`zt+#)KGGM1SM|Rq5js2HM-Ks-FYW;bAL>;kLrXJXKfv)g(W+V5NsPA
zHnkm(%;man>#p~W|9r4wd+AFhD*u#MCSC#k3|-b|Nf9iKDY{Scx)O+c<WZ6m<nR}q
zC&i!*_?|CpCc>LpUsbbfK2}H`8Kc;E%8S>hRXh7GfWoQxtH(|Wk<-ourMlPE_eyUl
z!xoZDTCL})6FT+rB{d$Zz<guhJ7#ldH=e#7=y>K*m|V(f+n3>OZKsDwmmhwxozr2p
zGoCJVvb?LH+&$7lqNWyq;)@)CZLRPM&EAc{rslC7GIdsM@*@~_gf%LH`1gU1K%oGW
z7%Gexr=WmOSRkc)la=;d>rjszYYGv^2@Aj^#+N#kBYM1yE0hm+0N+0N?(s^<`zV2^
zY+iEyFh_7gEbO5zKLVj6j8D{GOo#|n%>@7<5xpU#FtAX7!vW;r8s}iIfw6NO;qlNc
z0Bx2WjvZ@$f}`z}d!3n~N2?g5>Ov{nh~CH^%Poh(_6S^#F!bPvzft1n(eZ%Q(|;xS
z3{;11QmUn}uom^=%Z>W@H1*hzti2<YOljU}?Ccdb-IIn^5Sy1;s+}#HHDr`UTQ8Z}
z5+jJrL*iUVOXi)kqG|WNk_PpUucEeZ8kn4X(XKkGM`J5EdCby2;}N>oIntQ2GRm{G
zP)lDCnoS&psqILFcxdNs^hU$qE{eu+XGN@TPV#%9uahnjp`uu+7q)qQ0#JcD%U4!$
zPPt3(8&7XD%Qq}?LKp~WMl~!}GTsE4HhE!~@1LgdkV%&oq-1^kj<?`s`h}9}+&h{Y
zuJ_IqE|%QB@y66h;0S&lWhDxu|6Q6SFNjCWu@Wz#gM~hFtg*W?D-;XZtj?C<rt@4-
zy<Usp7$WyqB!(rH&vTaEDf4fMkon|1w-ne8DVEiQJn<n5!z>q|;wZ(KG7x9YMJ%IR
z<ze_4?=S_D$GOlu?Sw%d4A#rK=eWa**rMMfqIt4-uio~i;s4Nsy2u?b`5N*{Q!~6b
z|8`>ES*N~4LNS|RJbsFYFRdumMEe_*G$ZK~;qRFXyx4q8Bzkhi7Om8tl(46bkDQ-O
z+h%lqU^-!?p45$D9@x&Pc@l`GuHTvuM;%v-npj-kqO|7~(rjQd%)IVNku*)WKu3{c
zdxL(~YX8gey--88BRF`J*@*s&rI)0pigTwFxb!l*^Yr7b!7G|N(~UbW4zil}cpJlE
zT~Ob?*oSXaj49lLdmosTU6d%eCV0^>?MhffL=@+l+NqQk)>>nXV5KON;u;Y=lUMHU
z<D%j0ZJFfY<FV==vx(Y$(R=eker;nFBz3PO;nEcxZ+T?8oHgjvDWu8G7viYz=3TgL
z&_1iNva|(1cF83&pU9GNgP`V^jMKSCIPayf<dze)OJk#LIVfAGr)@15%@&5Mu*>MR
z<&v*1cZc13ayPP3?$Md^YF|$$s|x6LIcw3IeoKseb`5ik_nz=t;@Xf-^$?MM_!Wvx
z^tW@{3!{##A!5#>YvFHL5S?&}CuS;Wv+R_4#g~%?p5$dr59^*_{8^GlT~%wAuJ?RC
zzlWJLeC*liej-Qejhghw2GTNx%14jjXELLBd~*0JPNg?0All4Nl8<h6^x&S_b#FW=
zT<E}7qtABhzJ*-WO-B#OCcIJA)lT$i2W7kCYUXHm6#~J_8Zc2Ga%oA*tGapJwf6Q|
z#A1%q5q)}uu?a)`g|(>`BTWh{r*T)W1mBi|NO=$H>X1$Do2tC<8X;ot^>fs~_vZ}t
z4rO>XJV`bykD|7n&oI^Gv0zO-?|K`m#TdnNdCfA$&n@v1f1#bXiOLhT4qAcBlmhvs
zDGBs0bt@@bib+m!B=Um-L^5NJ@n7Kn$FI?)DtgY_=XXzUI+kyK!+PQ<taJoHN7$_J
zzZvFg8cH{D0VTUY{Z)$9B%!fTHcfV6f2@x*7>!3z9-ok$s!v1)3u57y+>T@Q5ZPt8
zo-^KH;1l0FzMs0lrlqm1c~{Uv#yE^&Fq{0NWVQo^L1oh?G=FZINAq!(;5s}+Hnyd&
z68z!aw@Iv&>cpOmq=tFzbq*FCbB4nbQB%%43O3eUK_oF&GE;PZVUQ)9Dmwl|6p9jV
z@C5e*NiT_^+1Nb>jfeog#8CoFj_#wPftS^9N4-R31=e2P_5JkKPB+CYcv$7LzPD-c
znbdn}UMul3KCjfo*#?*i^WfqFeD|+DdhhK2^=!Z9$KBqoZKiUv$KQ-Hj^N-C_BHx%
zOuVQz@>a+RynStH0z$%x^IQplkrH(he$8xwbS3B*QSB84mSag=Ce-@Jl`W?(#7H&0
zau?{GI!5NoePd!sJ0p4IvVEMA4$o8`#g%<hTdvr<SB5P!w}>ov*d_kN5_N<n`d4dM
zd0;Y<1N=ZOTCV=*UZupY*A#(^fQz?OpFe&{VCLxEuT@~xSMiXOontKVyf2$<Nx?T^
zcMi6KZ6$(ZUP&?C4k{&4%W-$9ZcluguiQ{57^gP;yfU2sI7j1gUk|$HWOf#Hebv=x
zq~jlbjLy}sCO&3m?BBoO&u=~LPRJ>Jap>|g`79YzSpZ*vd4+>u;0J5?v;D83G)Akw
zTNd4del#Tu&26MucMB<Nj`h$_te|-8uypDpl-hIGe4M><9CLNvePP5xkV4&C!#FQ8
zH4o2VCHopL%17PY<&)Adkowm1+^cN{)_cyC<}H47ldD8$nyZRv7mn1;ra3yFkDM>w
z3h9-FrGF&wthW+sY|ol_yC9Qr;&Le#sfkrbX>sn|<PYy80yb^s-4!CAMTm;A#v49G
z2h2Ua!)!QjBvLu&#9sMOZ+3;{Rfdq!<`S*Sa?qnAdglmJ7xp(!WBJ%_w`0H>ZM6u8
z6Y59!xukoYY-z7NWMf3yhfapr;(U9Bk)E45rX%Ko0D|O(BK=C&SGZ;4bA1T+A}(=U
zc=2%FSFcX`qK((jEy{9pjD*D4C<fx^i>27FiB)wc)Yc+)F(xd_tgDp!yu1s}=GcNo
zD^g!HSw20B8P|B9o*l!>=fHevLNCaeTzT@0U?E(6x(V0tRZ~*2<izS(W-qa|BZxi1
zOhx|1fI*-Tz=9oGsSSmrG@vM991F@|Dp|ohCw{n%rHb6^?(}(>0T<9^dyJ0sAUgAF
z@CJ2+O)Byi&MyS~0br9>2xC)(&;afN8~g*8b@RROhcbe+=ZGKmfo|&>&WwOCTfbC@
z)5%T<ja+3~IK3QG26kNkOJ!Uv&PkfyBe-#dT`Kw)M!Vzyn!*oy;@D7@);IQ`9=0tv
z#t83yUFa}AbFt4IR?)!vw$fK-ef1d1Rjv8wT^SSvcfFO>lIw|lp4BuzYqopYu=G~t
zdHC7KgS44s4NUY$cn(lUn1I56;W#*wRmmrH1!-0Jj}PDsC%Y1|Wwj;YV?DGb;XZ;R
zN7zkp<X<cs<l_MUB;<tLR#pJJ-?!Hi3Ch;6hx;g!Csru0l)Jk@%IpOc{0mf9puW9p
zkF9FoT}JDQGaRFmKb_jEvsj5!EUxt^%_02KlBM<Qx)E{wPKGDO=zCUfxsjgQp1iKA
zgGrw?roVU@B>F0nS`)q~@nCd(+2_<j$E|RybcJ*)9CDspOR@6<Wc=fC?kL}+Vm{YQ
z$IW3CC@J2q3}x!;2&pAv|E|Yc8?Pyr=jXi^{LNghke?er*%QY~#}wd|X|z8#`lV^#
zEku0qK7ASE>fGhjQLk=Oi+E}e?1(9zy9*A2hGX<%GYN?9*9AKvPac}3Gz6-yemOfj
z_;nALnaAeq5v)AQTmeSFzeXVCW5p?O0ax!;^=$TScHA?AHgF!)mW$;JN6pp|eMn#f
zWx%tEfbhkD&|1#vW2>cjwwB-Cp`4Kvukk$nbo1uz5;c7vcG@RlKG~r>_=rj=nQTaX
z5_9SAoD`#;`H=J4y<p|Z7g^WOrk`6ZrqQ>;{d@bEPCS0EHT4B0o>O1nyO-BJRWf#+
zZ&}CO-aw_F;6Y}CjTFI@7J8ys@+=FX*ya(m9%aeE{z7adFFcIG10tEm!Kd_FeLmc?
zrHN1}*xxw!Ws_}~MqZN_nHJ9@<eA86pfw|QE%_1Ym5?{iRwzzhd7*>~&aENBPrl)A
zskWw{Cs)e~r`Vyd6Crqu7??&>Q0cC`l=XL}IuKt;oG`e$q0E&RAwzj)W3>?XZZbOJ
zwlvBJHezKp{QWZbnA;{13}-q%$x_9|lfFgn>+i04kd$l1aK|~jG|!o2+#|&$XN*s8
zWH;8Y6r>=xM@p*=wi*ifH9RocW2j-TCzjd_f8>^~8J*qY;F>=u^oCpO#Sttz!c_U2
zxkmn<c(fVbb*{}b8j0;QEhHy7o$I)#C5@N}(s-N^9kTdY#QY(~ZvGp|P$DD63(=X&
zx#rKC-{VDnF3?<GB}>d&E}08i!p4y~W%DO$K*;3}A3GfJ7t2kx3%yQ2@9i`YBB8B|
zj1A1kBP6PJZ^-r$#HZM)%w?}7UQF(@-eO0)^wz(T4OLIFUvnOMTwb}CON`^Se?j|X
zDQBhffYDRuN?|3T@ea|b6o>oK3-_)D8RU9@oPROG<Nr<)-eyzXgoYMNJ|T@&uj?v*
zKO&yQfg4^QyDpPYxWxR7Gbe^`GF3BEUEmE}%I(Fg*P^i`J7=P%hP(S|BNgro+#SA@
zBuWOQ)EZ85D~|I|q{NrCRn!wE%J1p(kwLacp|jJPI$X_g`>TWU@=G-v$*i##af~O#
z=7p65qOsa%Pe*?KieL22MC1s*9c4KHZpVKe;gaXkq#aP=$z8(h5I;}-+L;8l#n`<Y
zrHgl%@W}6Y_f+Lt9wUYMD|Iu5r_``~6ONMN4@_tGk?Dl;yP}l*L#1_<^v+z&P-VS;
z%;enmD3t_*npHqncNkM$LhmB(s^-N?&qeGHCv`ZCtBEZuOY2=F&GC?(KGbhNLoLJO
z6>saPEMXs`+<yAy#)|W*moK(iC-h+7#IM>K(9C~`1?+Dk%*dF25`a9})${-&yrID=
ziSsw_gkts`I17&vr$T$V$7urNB4t(0mQyWXacZA0J;TBNpk4`m^4<jzyz_Rd#E6VM
zm6u8Hnyn=sc%w_79m|aEnKp0|mZN_x5t3WDOs)P^$Rpuy61_EPPl?vFdQV>fbt>&M
z_u0X^Qul01=32gv9P!RD_gnP#+q<iIBF3gu-q7bqbl(xy$ls86=phq^9*<WDkxGsW
zmjQitSdRwY1Q5La*<TmPM}qaLh9vcfrjIZ@;KF~Q8`Lx!N`bh*l4LNAC=eECz9Q)D
z-D$m#%FDpJCnrm#)Z3v|Z`Eh+tE_M?T9C}?l!{}$goun*=*@t84&m<;i_;1QwD{GA
z)@KaL-qO9isw47jb|cILCr%?k$K0PZR=mz^d)$G`QG<vL7IFdMXWvffQAYv8qmR6A
zOP9z?tp0qES8qqC(E0^$g*P;l?QB)LU7=|`qxTZeahVrWNew=%YA05r@GS@UjV|A-
z%EV$+#iWHt>Nk>fkFXMxn55&ExQ<4pLMj>9^T|w((~neKmS^^SK=-NnVpn3r^Fdz?
z-?zpeSWdJtaBpA5?qe`EIf8dbSVk~liS@6ePCgb*0k{G1JemTy&lv@5mGs|l`?`Pc
z_^4veI~-xgdK$GpA1b>u?<y6t>xUc``IY14+GygLE)-gZiJv~Fdd@QbTrG1Td%AwL
zacY`2Z?)#D8vD-nmT|-9J#ta5QQyMf;cI_8|K2F^qgr5jJnSo-cB6P;q|&yD{FEW=
z!=4<pl75rGMV{+eP<Z9N@#aCZkby12G12aI64d~ar6UME!WIJpXa7#9fToIjCxm#?
zgV6X6V*B`OiFcTK@236Q_yl<cQnL-%n~-qjYX*=9xucIBL=8v<zbY>t2p&8)*&`O#
z9~)5@FA#iHj{i06k;J2b)#6Xg%wI|ddaI9j35{PS<J)`|WX(F?Nk3su-EB_&TED`R
zwdkt1R$j+Fju-P0`)4{m1n>l|DGd3UzP?=^{qjpZjr~@t(@2?5g{pW_UInA$c@GmV
zQ$yQ_V$6fbdpFBokZU&JPz66kJn?vSdPG;DhQ<qZ?vjzCOZ2Y3@=&m|k-|;(*xUEm
zB<|$(4sa39#bhwM^WvJnPcOr`Gw8S#W(^>3MLa+#*(^1mt4wRZVk}y{N33OY{|fh@
zd(N|aCsr$MRfdMpWm!$h26mWRl#O{ogI((9)tmJYORQ*Wf_oN*dTsVpX%+;N6sB4l
zTIoLYdpm0@pTw{4Ve@-j^Ejegj<D<gX43)!fzkpNF;E<ef`X7EVxIg1{`~AjvB`{r
zvP70pCkw|sGaCzhD+34&QV%YvQqw7MkHF>#%L@Vji_w4td<(|_-@+hIomBqE^H0``
ziXxf;M<-<ywJOwXo=3>wsY>%R&dL0a*B4o~I4fmlVm_n~Xm)PVR8tKKlq(LBKM)|q
z)1Z{pPZOeMdp5YSD0bIX`L@5_i3d~+je$y7ftR+AGv^~97Z#+t8Z6+NxB(n5JnY?0
zgu8Qv-^gpsn&mFd%ifSQNb8?XP{|)+Qd3{V?(FMSx?`;L<xX0vvRr)1l5eHVMo4SC
z#2H0cuir8%+$rZ6*8+1s=fEQ>v5Ib-Mw|5KI*VrQ%%MDydPLQCnH|?IQYPgwmCn<T
zLudM-S{B)_DvmvSYhfL*@@5`c-X8WrBf+@CbtU=n<rBkpdki#vFD<F#{S*r;DM;)^
zTf<tjOhh|P)+y1Xtznj#>HNf&c3g!%(c!H_S=IUGUER!vpV!H>-(3qmq9cy5@BVIO
z4FrI&WQ<kVk6A({r+PG5J7fEMp5YA~s1sKMx<H*buJls&sX2^g#AU)*J3*vJ1Up5K
zFya2j{yxxUejQY@K+Q~VaW(&>OY|rs2k{rGN<Kz~6c5-_x0R6--P+Bv;0Ave?6LpB
zFVmUQQZiWE|6xG3q@V+CCN&EyeeI_Q_mZRK4bEu$XzG&nz6>qt>Svw0!~Zcri8qAx
zLPYARI00+t2Y!<%lB%jCt^DYUh+JsQW{o2?vuVynHqZF8fsc5FrsnIJdh$mbW^y8$
zZ{%Ua2#%oU2%820?4<iw<$=^hI|HWf(k`Jd#LszhbsTao?pq7Vy>TjBH;iVYxS_UJ
ze`v7w{bkITnxqK|k=K{4gC7y}7E^rC3^{kVQJ3}kY8t%!0n!$)M>D_ub>3^Y-hi_m
z`GrK?UurAM-}ErWFzZvZXUnz1J0C5cN`A6ZVRP|%vVCiecTfi3s6e;=F-?YP(GNGb
zvabkK;fFP)CEp}u(haA%F8%3_VwCSDowhz?_yePoYh?I^I6p7{iF3s0tNdOXEE|Z^
zFv`UV2Kgyzf6n++bZ0$gV|II}^3L9iNRbvX-gg%b4J@CYVfc{P#SeeyqrZV!AMFk#
z5?;;uw(y*oLfASgzUz4D1Ue)qO+(lFfouZFV^}xqI%!r1J%RF9396PDPD-Z4o>@|o
z>>H-cxse&D8JxE{XPu~zlg?Flm^}2SR;G1`-@?A9y6FFIzx>nv=CLsrrVoByM|92+
z1{wTsEHVKNC3jru6AgoW8$mXOvz94*p|AJHPV^neCy9{47|z{^yba;kYkU(|Y}YZ1
zfCbDR3%$-a=A{2xN*khNc>$U2^~q<0`Gs35!9c=j1+h&D*|$iZsm0D$FnZeuW7f}3
zE*qCirb+jf7(L>oq*Z-x&TdP3HVrOV;~^;mSy0eVF^_fDJ%yl%77jkg8eDWM?+KnX
zMP!*m6v~}yCtoNA6QUD!7q;vn;AA?eCh9P>tY5PBI+5AfO^};bsDyFjGYXeh!@guq
z>RjutGuda4s)Si@m4zoXBlA}u_pY^Tn|0nAbCyCqpPOwAo8O5nUs(9^4cFA{V%QNp
zJi@YsiTs7ppcbn{DtoZOtey655TbQKqNS+bZndRU_PD42x7QF?35g*$4nd67Vq`Sm
zEynrQYKZ0Bu;_ko-I+%)>FFE+{S>e6@c{~tb~p^DR-t!XBBzzC+vo&^RSHXUD}>EB
z=8NvSLBhu11M%sY`WhQk$(+{k<rAG1*m{1u_1*p|?O#krUZ?PKT2h9t`M+n=M2Tmg
zysy3NUUaj1`WBxn^~Seh*_98+?z2nG9zpgIMj>!!$G;A1#C>5+z^{qIqA0LYAS~c|
z`_E=rYdb4K8(`)&NaxYl$D-%5d3%91HH1;nQp@8Gwv>EVf!V2|v!Q&Tjh_@tv3dkb
zN0@ISe-W*cj1(vJ16EI48SRR8*Ex>Mg!#|h$mgkv*xlfm>>M^TjKg`*+WY#<y%G08
z4)JRzZ?V4CV=%m5*7r1>I3hi=g=BVGZH%Zul=g^SQldxLE`PTpRRc{5_@tJ%sQ08e
zB89xq=bV1L*YY@WSeLoYD5NHlHEm5Z;V$bNtusDpf*JKwV*{6j7m{lPJpAfv8-+^a
zTf=R_0^96U>#)-$uw}=F?`p`>G7mk=*E(G>g?HxW181^n%lrN*wX6%_5R23F{L+u_
z$y3re4RYrpTLACOOv;Dp9<BVYWIBI*Y+X^tvBj~ENs{NehB4IeXT`{VIs!_0Hi<9M
z1<xs$427S<DV}wamt4Ya6yJraV_{Lc_2XFGR!*#b{m7+qqZ(iD>mK%PMeXE2`wa23
z7=1te=HN&hAsKTI>%*LyhK8IjpPcn2b6rm@b7v1w$lN_JvUcX~z<OCo)%-9IM<NM@
zz<1=VJuKbrT`?~15Wso|Pl^%J^0dGloUH@40q9b7b0^@08kyg#{{Ftq_fu&EU{E*=
zA|MR75Md%fkP8L}v@mce4cLU5ImX@o;OH7bC=^uKz~4VPV?rSixeLIFIvx-pUV3oc
zjl45(^bC03&i~H&`2ivnlC!t5u?9}d0guPI2m!aJdbpZf0w?zX^tpn^=r~&2U=FkH
z_O^By2)M7x(#6Ti93o_Q#nsN*8F*^#Ztr3Ryy|G~VMi$BW9{w&5pr<`=)_zGau_>z
zYanj}oWcYAZ|?=<JpdAfJgmLIx2+GV1wGG}E{-nF5WpWUBn_|#f)WMrgMeFWlpx@6
zQiTYqK|}y&)p_8EABZS`E688qz#s^?8Qc(p2At*qRu(`-5Fr2^uuX^%aGH@FM97{H
z25!c4fCxE4gq$EkK)V2Dt`H$Nh!B7k(6s=So)94~fZ0IBKET%{z*#*)diGWrJAjp-
zy88LppM&20u~+qf((_aQA9U9L(b+&xI-1)8)x!U(+oe4$!Q+h3aDa5M1@J}y4u->J
z&0WuZXCIVMNdHF`;1_U=lf9+1v#p~wfVhwr#@b02%&3`rA5a8=A%x*SouqU?A4twc
z2nq`Q|05Cpf2{`r*s{WaU<(85d{N-D859CVhz2}uB7ofs2j+PM(AQu>0(gSY!SV>;
zi-dzK@FfsX{{Jlz{&<c6%7O{3<3~aQ8<k*CkO88?Kmsa4i$X+z6(0&17)YRT5ilX)
z5I7o12qZvMQ4k~&s00|PBES(+BEYPRf&zK)3O)yB85D3#l?d=YSSMN(4P3v&!0Q1q
z2oXZ?3L*g#1tt^>aqun@ApCnBKv|&OgFGO6aA3bHn2^9|0Iz63?V|uV3I+!V{NAoG
z&@K{~Mu3l}4xS-_t0=%M@OqHoU^&zQ6TmCj{z0dK3<Pih7)l7Xi3Hv`V9F2V!0Unh
zfGr7wH&Bp8Ale{l;1x&*{Q)-jJrN=g`|O{pzGnagzgGbVC<b0bf>;5VgRg^P@T0qa
zT!AhFJkZ~{bWnq^2v7+4R18EPl!Jfj2TOs#17QMc0`LE+1w;x8B|La`SOFS^hWtpN
zAbghrP!4`g2fPO40+flL(t$jHGVvoF$jYHifb#Hj0`WSy2kHQ2<3|E1M?sN59|E|7
zGH^sf0~6nYTmYY!pngg}Wa7saWC|b?ga=nJ|5G~X>+jFOGXEq{F2G&|uLo{oa83Z^
z;eXCyX~@CXcEH3d0|AIQP;qjAjD6230o?wcQ-uIK_Ya<`L4c{|Aa@=D_&tFfAuxLb
zTI2wx2@wVr@xeqce4wzv`5Xv;9>5GC00R$l<`7`F%t6iq=s7S)2+Z=pis1nEU2y=y
zdUywFC*=1z8&Cwm+WY`+4e;$S_g&>2Do7h(9DRo$C?`M@9TY#%SIEOwfz#>^DYFAO
z`MtoQ`a>NwODJRyu<rW<2MDmK;GhmiP@5g(oFKq%`GXvw;SQT~0fg&23{;kY`~lAi
z0i|>hr2$y-KLrfP=)t=Oknei<z-JE@`mTHdHF;3N4VZ|&*W(UMRo`JAp!z&``d#IU
z9JUW?VPN+7{sh$BpmzO!cVGk@xEcVAFu=s|9sXUF0}^;p3e@gEVCx|72?55*LCy;T
zOaliwA7EkxbB89ufl2y{76l&uP@)RJnE(D7pmz_y00k@j@mw1Av;FeV34=ni(m)|0
zOLM??1}bc6`2Sf9*o6AOVu=5<7*r6nM9cxJN))K^CnX7DcZkjZPz34w0|>JB2i|Jn
zWC`2@J*5WBPKP{FgF}F!`yC1xd568D2AFz>U89DC0CUs<@6=Eb*kQM*i9i5{-}l!5
z4RF{gYJd_r>=8BK`&eKmJ!}pJfgj3(8Zg*_ne`BkfWUvW2p|Oi(IkKo{7@Ry0F(f8
z<FGuSDG!I78h{iqHy*;#puq^{|HqF5T7S^~-7xxbI^_RMVFed=zy$kY?21AJ02K`U
z0T%9$BNP!Zz%u%m&v%_-rAdPz#PVB`@<Jb#ROGFfo;uW}sm|>_z;%;SofaNymbk95
z7ynT`zjd2~Kt;X!wa-ZG(|XlNy0_1(PIuqY5Ey;pDSCHs$kh@PQAt(sY;@5(_trbq
zOv2f(6$Y#dXBkP$gRauZavAZ!@Iz;bgeE)o$+JwJwX)<yUAHup-TiY>anQa0UFv@t
zA%_Qw{+Igxr>WpSNd3=q-hXf0gVW}Jcie-D;y+9MfkOW8QV$vf|6S@q^XI=yJ!okD
zcd7q*b@HFncn6D#|8C#^r)|B11>=9W@Bh=b-hor#Kil^QZjk>j_5W#G@4(gdpY8iU
z4c+fQ5hJ8xZv{q<fxrdm2t4@Z6i*;NdEkvbbmNNtJY&iNzkUM7$4>)+Y*$AYjHA5;
z#0w#a7KHOd>@XNt4>2Jjr+*#^y13i&0PzBM7b{Oo;Kx*ce#_O$24Z1u=>W8HSOX6b
z5e2KacX5^jescwKT22fOg#&>wU@tog3Wq`sdBCXdAzI$<);5Gdj0jB#g!~Tw0q%Mv
z;D)w=`~w4hv*167^C1k5f`P90Utv(-?3zDeU|8X|aw2G8vHTlM6kJ074g<p{zrcVS
zFqZHe4BXN78w@E5dU1a%2Z#OvLy7_s@n6fqVNf9Q^E(Uy24;S#4-T;N_jchhQ83W*
zYdN68-`j-)k?TL&0GRqm8*ouD4)SY#2pH(m{tbozV;H}|z{uS1Fi|iX@oPC@*dP1@
zHs=FdE`BeE0s~RMy^Dta(FPI*M1+4UhZF`6t^D;}BnpfW{RTsW9{*opFj4d$Jb;Oc
zfN`eZ$^j*Rg8||WMgo2*2StkhK?f8C1qbi1<$$2z@3;Wc2gY@ND<|?t|3ZO)=^yop
zK!K?EZ|{o0fkQffhath86~DG2A_|5Ve}##{!SLR1FvOqr0V4TF8=`11aPsTBXgC<m
z{0)W#!<E0mP~c?t8|=^V27|)>kUtny_>b`c4!7TB2$-$F2+J>Rz+l3E&<lg1z_8G-
z<={|oE5~mz_@6QaeDn3YoWkJ9KgI$K4(?+3y*^Pe%<*d*2mrx9U<fd3_**&DA9E)R
zAqp-iek~^q`=if+pXvHz-T+$tV@$(@MWBE11quCQeue>C4Sw%SVD$bm$HI=7Ct)aH
zbp4JqFy;KgQ<U%@^BN3=`m@hPpkVOxS00EUz~J+5FckEUHh_%=fAqNs8r&Q9%e!zC
zc&gJsFpRsoy`#1J!H>Ub+51=ndJx!8<>CUW?t}22lCupE-32)GJ-+9GF?Yuts7yEt
PA&Ma6<di?JK=}UvS(gM`

literal 0
HcmV?d00001

diff --git a/benchmarking/ttft_vs_arrival_rate.pdf b/benchmarking/ttft_vs_arrival_rate.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6238a38e1420c222c92aa462202943a218f04aca
GIT binary patch
literal 16898
zcmb_^2RxPEAAeT3Mz)l3Wo6&ZHOd~@BdZjyYm|B2gqD$2Hd#ehN=B3@m6;WxsWQq4
zO)3$I|9Niuek=F)|9bsj|Lb*n&U2nKKI=K3_c_n`Bw?<jCy!LXz$D5B;iYvjBpd<v
zbvXc2Q-dp72m2A>irUU(XD?rOxT3kUJ23!`0u?ObJ9fZ`KCVzj<=-0U`TCIIXi5b|
zJ2RI9L;@ME^z&9Xh^%i#b|w?yn4bma&SWx?<O9b-w=hL34`)|TA9pxz^(o1hU_~Uu
z_kdw_^Z+TuATnIh&>N7TwR+WFy&A$De<KIEmH~)vh<XwcQ1_F3MN49U?;#Qa@B`5g
z#y26ldOB<Q27wU~;192?tbj!0P*^w?t*n4n#zI*~APm4E=q3P;|4FB&kB=|7LqctS
zmkRXwml%zSKJH`>IC6DpJx?#dI9yTB3rItUNbq$fLi7iaNknIFSkSrFJtn?`HGD%~
zZ2dgxgtA|X3_ZV6{-D{u2x<Q64tYdLK$YE1@NOV?UPFBbkHo%H=zDQ2iL~@FJxM&%
z57eVGxB})Ly?ytC^!58|h9AAEB%6wKGo3!OnfV0LBJ=X`!ti~~FIAgI4rqtZ8*i6o
z+gtwNj76@r@956{zUsOqgARGTYP4o<UcZFsi>n(PX~)9aCAW^AByQAxM>CpYpIkM_
zsqHzVNLDMl9%g^}x{Kb!cHSQmJIs6Ho+NCYz7TafZ*gbQ;_FX7hmBMY7^=BD?9SgH
zQ4bTRNntFGOqCQZv0}J@7=E}wD{ARc{PyCu3$CU%O|Ay89&5zlRLmiDS+C`wv^nEC
z8DXN_wYpPM+9~Zifz2_iPpUJFPs#`jxs=M}vKoHOv<NTdsH$*}yf2c-aE!&Gm2knW
zj?XM=^qRS*180Ejd|9RTSh6O|h?Ab*$jvVfO@#@c=%wSn+Nh+CJs(a<a7Z9-GJTWj
zw_8q#>&dp87Q0x7LzAD=dG6k%mq)+%v0k5M-<e=vr(GReVVDz`_=%>CWzKsa<N7qY
zP^b5Z#_2x|ty&oK#E5t|Mm*HRCFNz>v-9}UUELUWU&O31xROR9>&odcR>P6ZSseX1
zE0YQ$`*_q~8*>1sSGH6WzavJ~;h}}m!<ggR^h({251K|5^&1CWye&T*n!ix!=_-=5
z;S`@LDV*L=<j1gsY?7)I5Y3j62!3UT$O9i3gv9SSHL0eiXAFxVa3|E_XO!+XZri^{
zGrGf$Zu^BC?T>&s(90&%JciUmG}-wI*KQ-82_XW_)wX3@leT-3Lrq}ax7(P^0=yox
z((<2*48eACJmcc|X0x$uf%Z-h>(;Pvx9fy@(^IU5)0t5hG+4?AC-3wKZha8Ou4!K}
zx5V1Iuw6plmHDA$^FH?kf@oNEUPaX*-3(D?;$DQgSWuA?J=BZ8&K-KUW&=g$6k*|t
z2fISbL{AD`I^p}}{Ehk+mkPE`xd$zOq|7!HN(fySFHh3gV{UQpMGm@%S3LZxT6x+{
zNV#TWs^;6$(h7f&wW_bOrf)XLF1QanyE9uaFVBAac3mOUsL`WEC#%R0{GrN)ZXC+i
zX}1-gpo)m0XBeVVbZ2=X0nIkC#S!u=o}hRFjAXW>leMj%jU+JCr=<sM;_tjLdd6Tv
zxA?eZd57jaEWF${g4NSNQhkm#PAFi4fRAV_cPri}hAO)5u4g~8+d@4m0Y9yG!=g)A
zS<n-&`J$*lZP>2J7pfeuqj}wY=}q18y(H7EL9bIH=61C`L%I*gaL#%*jl8Rq({>1u
z%sJU(aBZ_{pKsnnTTguj0>`l@IonR9e)oy6NwO&P6V=~hl7!0B?=9D_E3YK<)>}w0
zJu!Ti^;qizoFS7|Ozxp>N0s|<2T2=K`|8Knfn4^t6Vm0nZz*+?B`@(_efEUcF)A&L
z$Qz<u^a`dr_kL?|xJ!=P^;?HyQ0Ac<+v8=oR8(po?$0fq&6SHg+^^E0ZM&d8kdRJ(
zoKj6<V9?GlIndjiMB8OC9E+LPe8O~gXRcmjp64Yx>pNVo+b?VQH6F;C?^}9+DcG2!
zoh7aBdq~y5zV~QoCu7ku$o|E?TsjILj7vKYX9)ka>tyxW>7fmTFYRc)9d=>1af!F{
zBJv6%c55p;pWf*-mFLmZLtOGwc;jmxdUV(Dto%oMt)2EHL;aW2wO)njlMQE64|}o)
zA2pqx`8JE%W~D9g3^gm}$f4Pgta!48gJs4{hMvd%X)sO0!IpEUR_uH_TCw==y5>!9
zzIv;e`TYo6O3aP1T9mr89KUqbWVn7Ca$s&uKqW0RHeWI}dE>+GX_GiLcH}_K+^<3l
zVHx>)CFG*=U9ThhN%`8bS(^`}Y!=IlOFhCpmZ;rblfm|@YLy=9f~t!+>zk8@t-6oa
z-*J4ovkb9wzk#3!XY@%e4i9!*0;k=?k$I=d#EaI-={~`qgI2z}nVi^p!LpKmQ0+>>
zm(6o3`qz84Js88Y-aJtLA-27!WSOR^Fh$ekb8n(yEW%%6E+2t{M{|Tnz4_Kyh#{xR
z@gy}?-g;%(Qqycwest+l^<C_#J(sl2FT>w+9JrjXLVBAlIhFj$q;d>JUz*2t-fktq
zDDh#qKcQMLIHFQ}swC>|REd6ep1yQ@(Wf>~@4Uw^OcGTT)Exxcu9GbJLpTDN)Ss9O
zsed)#o^9mIG4#H}j;e1xyPfMmh~F_7Hu$P6a~p5mQqM#G`0nLJ{-c^N&wJ$#pC292
zaaiD(n_mj)3bS#l^8a-I>7&X%(o5dFo4q}p!PzI~r*uO64nGOtNRmbdiAh{i|Ek?>
zcz3Uv(BPKGA8!#tg~G&Qjvp)VG%Wvg+*#t&NJi>O|6><w&x^b<o3E{vox&{!f1NRY
zd33qbVdsOh(VJ#RCxRD}>u;?b`AXj#>t(-0L>SMcaYTL3jTf@eG`F?&JzivMdyy<U
zMguF<ofnuGU%DTRJ{#%cGV-l;v_JadWP3w;ud;nl&+fE)euG;`#j0K==V;LJdyaL+
z?XFNY-)!c6w}clX+%#eQ!b#0Xur5HUL^`N^)Zqr~@y(GG9&0)_ZAquixu<OW7*Bn?
z5b<tnMzE;gbN$dttdrmAqz<hh9jmSpJvx_}pkji~rm9VS9Ab5v&V?f+f%#Ybu3py9
z3(Vbu{9yT(ZXUh@{DCi|Enmc6tvEhBB>L@`Ston+%%j0RU)RrG<)_~=E}jeJ!g~(w
zZmio-&)Ch>5j`gxMUNugP5Hu}SbsKmUaDb-oMvsQ#OBEr-&~ujZ6{7F(mlBz&6ho=
zs;gY=Ii*|ND?F#!?yEJ_bJ=`%Wi#KneUtI&JE7M|XZiak4)@heCYN%0U*ON$ap_5P
zg)_q|>_Iwfg9wM9P-D6~p;t$Qt>S*nkL^FNG{32<^~s`Jz%_H03*XK<#@2l3V4HC3
znvP_6lld06GcqciS2#^C)3b^nVd3+IRwhuQH&=>XuhZ(dSn4FtVh4k@Q1NJ)K8&!9
zHAX;8(W$hdiO2TrlP~vOa)R!8HNE}NHL9Mad21)TQYQu@aX)%gur&NDlDrae;1!c|
zq(lX$o5Op*;y&+XA=_J)UW;7=+pp~+T>Xl6?9H&1li_)U)p*woS4{HE<M+c3j%i#T
z*E+hhd~R6_S@Su+|JgnLm)H<?xwJbf-Unbu7-VPh5r{`Fitx(8iI+&_T{X4iB3tTj
zEI<f%6_u@GT?&MQU>ypBLH`246zm5T?twf0QwE{lzwjR(2{7M(U_Wf!Ew)Aklg^Ri
z8=T*iL+4`)c;=S9W90b<)!)Q^@p5v_i59CK<GWhyXJu?@dg(ChfeT7Ps#6m8e_%R8
z6t>K6^8d<PbCxc}LOeNIC!_0(`|JHjcd+^jx=`Weci_GXQBgR|f77ULc{7;~$=X#5
zOK;q7sa}86-)a85aHF~^5^2P_L37^60>5>qw?tRs(lesF;duIdp7-avg0~w>&cXva
z{X2PG(m&te66cdAN<MYJ9P!zo?Er863^$tlk*MKaE|O*9wz9-45&d5U2lp$zS=u5X
zLiREbF-j@Tvu>#;RdcDPzjcgwA5nbv;al{hiua)xK1G#1E;2nK^tC}uuDNK~K$lUJ
zui;Ub&{er2!N&tPkJF7feh9ZSD_48Z;W=y*V#M1$(M@kG;H_E~dflGTY|T8wcVG0e
zLrgcnN_ot}_~3&k%<kYV+gdCmBR;4XV)ssM;WBDjFl?irXqZ_n>+n9Yvcb>8y^e}N
zsbT(g2{fsRQ(KftXWFJpA~aM(z0;NStzqS$M(SQyA*cBg)vzRsXmpGFXlKNuDvX~m
zZ`?tqD`K-uCbX%Icbl?<SXpQ;IwunpUhYoceeq01c6@IC9fIZ)&kQV^_lvO4<%v6+
zvBQNkTiy$6F;;CS`w$&ivh36jo>T2E?4CSsI&fHhb9wROG9}XUesz_u$L9MSzMt&z
zsQw|<uXqc&Pb88D&$O0V6tv*^%fOXz|7|>ItkQ0Td1dtmuHvW(*o0m>Ogd_>#*UaX
zT@2h^G}pR~yF=x)ja<`Xt3;ECF3wSRQMjtmd)w?bYHaZ`Veg(5tYEFcY{2phFi@fC
z_od&eMPt|T(xI<nR%wb#r$Q5JD8|l{-B_31cBAc<4GnQ@HdKa&)^>l11Gk=`u~^$x
zaTM$QAr7L4C%fD0P<y^Mjg1BZ=bSebHDw1lg^+HbZam!3#mmu6g$QWn{TC5<#DD8V
zJ)sMlbhM|9o0vwdXun(U-qg9=A9XaOY15vL8_ayHB84v^&(QgXS4kama@<G9tHmea
zIzFJve|nc2UqiNJ3HL*B|6!ecRg+ygo2nlTyo#Kp`}#)cJrxR}wf|ofqR^;ykeFmd
zq8DY=o<FChzv0A)TF$Qf1(VAI9Urs2k}fxX=JprlcK;H?R3x}m=Imz9eLFf*`-zik
z-n&M34bxX!6v)zkg)dG99L;Gt3(LL{-R^bWpe*OLvFKZbq@mZ8VF#|qgEV$-@Ur!=
z@xeQ7qi0EnZ{0iiBE9)e^u0ypF>8FO%)w+FXO`~4txbBu?<;!?OFg>mWd@bvOdPq-
zw481IXg}0BdTRR0-mxcgP2KY?wGDzIwnI}g?a%I=d~TUK@eu#@O8SFEzv|PGAtU-n
zBn|uymBM#lc-r&AD&&XNp!nhsy4faMAu0lcc1Qja7#gz<!@^psK0K?*^sd2biZ<7N
zxGVTvXVJHS6&$T9uUw@T?L%6|te{xY%N7;iB+L-|q-GA^QHwv{ek;jg#=1!42F`oz
zhut+vSh}8E<n#7=5sC4j14~<J)%iV}<Bav##p(PTo=*0Lq)JD>48?exibxo4Dqy&^
zrzJs=1!WLo*^-j8R9V4b!=KL_Krj&(yJ%B9vG5GNw~d=1ekq$l{0;YfNjRrx8)33c
zViQWx?CWUjmD*1Z6UeVuFKe{nngqS1&?7v$^rQS~;!6JfCPJ+by6pmP^_g3a*WZdx
zP2D%2k$S07R`gt}Lo-s_%znc4q9(I-`MC|~vi^F6=_5Vn!w2f_$j*KJ(k&i0AQ)(~
ztGEfz{*jo94|lzyD(Tfc#_?rXJv=#6so}JIxNHW#-*vG{M0e|(h|$G!rP9}UC6=_$
zs8x+_Svmjm`@)x{WyZMe$Hk~<2imOqOFOvrOoC{PHNw0POUvaJCHco&|L-J7u(A?Q
zr%3|aP7#kd1oCM7HtL*!<{!(0JdIQ+p@KJ*l>hs^GU;w&3pHn=>r=Ht(C*I{nMbxo
zd^~a8$>nynqBxJhfw#PVk8XO^ZDOwWp}9uB&SA<{G~pn%<0vAqmwewTI;?Q}kC~nv
zlG>G>{6kczq5?6{2&MlfOFhNN2S5z%`5lo89F2?5bH5aAuge@hF(4)J=9J)bKHUT9
zb;kBq!bzCalPBKH_*Lb6R`u$2b9?1bkP-VjZL*Ko<zvCo=iPda_EN2HPr9kpPFdjj
z8F40;&zH(w7*Ysg@0ZNoGsn5}c3+<GW2^$R64|zpD2*Xp{(j=&k)=4)s{_W4UY4at
zMAY`_?4Fup)Q{ap$l#wG(`0_LtJ%XL=FDMTI?a!ttM46Q{x~pvpyMiINoJ3#v|Hsn
zvZaC#f3&D@$LC8kuM3|Y)aw0KDJwB+@GJ^VlKSY;Hu1P)G--J_p57?2!b9}St1CWF
z6`XEcspz`6nv%*qPwzIJH|KeDv*ive$5$_1ZRvpICvW$+^spRE_u>oMwa6e`I3#Jr
zZu5NLlIe80+Vj475r=-V2o>2=K_dXQ{<rLPlTAnrqD&{}a|;w_E|1%)_ZC^&8-HwI
zyMJ3;)**6&?h!4w{tTD)``9zwBB-N^Q>df0DAgI2=UIMcH+=TC5L?jBdtRx^bgAys
zK3rqDGbjJ@^cMWEf+dzrbUh!EpUu*p6jQ-zPnOL%r12>2qvgmJuRY;BUw4&B<>X<e
zIPc9IPCIz?5VzTts&VW4q7L`Vxwo9E^a|2^+$qj^o$a(#dVr|K!sV@{Dhf`i5~H)B
zgL!A%j)qtoav~y*mUVh`)8hJ4Z@KfgWg7|WCfH&!D^(4#GokaNHTCm2|HLo+-y#pc
zT#;oz)=#_R#0H~f$Bk=+ho*vV*5#+L7KT1|5v;eq^DZ@Rc>3|SRBK-J+)BUYJsu_v
ziJBV{E(9@JMdUYp%;2{aY@y_FcarUroLQuPbNMSnp0JFOeSzU!Tib{`+0i1mGi-W_
z+EQ6NbaCoxoTz6K94d%g8xK8mdl6eNI&w|S^1fdBqkgX0hcfi{tZH7q3Fq=IT@3hM
z#^U|`1-EJ7k++7gxnm~N%NmJeys52r8`Ujt8X=D*z4gkM%D=t+e1zAI12H-o+udKq
zHZ^VUV}F11gZt=iQ4hzVi{6Awdb`>mHL#hOq%bY$FuT`aKJqP}=)F_l|9z9Gs5K`Q
zfl+~8O6dPKHZ%?g@DBn*pRA|15mu~Wb4rxops}w*M>K;)oXG{?9{~Iclo=}|qh;y3
zH+n{1VyuF{jhBzlm*JF)@q#AAiyiz|s1QUAkga1vMB}Q>0YOT(eOL@Yg(wst4rO70
z90%_XBRb%wQ#jr!J(ecPnMu593+aZFLqMYn{Hl35ey_k@&kg<sT<%n8p@Pd$==C^p
zGkgj_Jzc?(I}p1!4`?)8yjX*Kws4{@CCfBvMzr%0w@_A4mY7(jTl;{6E8Ojsm2qps
zT=gdI%k!^<x22tCagIv%ds;T;lbgW55_qQ9j&U0I?UemKj%UrrPi^_!6$Tgxp6Bl3
zJAL9DxvSz2m0z^7(?{g9o+Q__q`?EMi)K3$P!m(i$x^wer{@Ob0`bS$-?8B0MR_N`
z$%KUCA`0FIPt$H8X|6a9eG}HbH6;Px%FOrF!sX++*C+Si2_y?&-_8}Fm9443Eqecr
zOfg5cdD*qXJWC6|k{uVP%Fds7y??Jf6?suZi5UEUTa&uEYJ!d^>pOg;;*9ubF+X9w
zQhB%Omlbr`(&om;YtZ6-oMHN`$h0eC66JXnVGXCXKKYEki)e<IYFom4LN>*a-xuSO
zxmASDp?5kf5fU18ksHswAs4GNO5(pa!%hZlCBC;Ykcz#$E#WOXK{{9F$oU{Xxepz<
zJyI#^<M0tn%h=8f=hN=(^uCt{Q*nDhFUNKBl`FT2a@Q?G%Q%5F)LY@=z-`CM)H(`P
zrd&;W%EYpsKQbH0`X=aCyZ?o&>6v!2aQC-!)jbh--n#j*SRA8C{EMmg^W2^?ik7#8
z9CD5y;yN?5d2%yXhWiPDm#!-djLXpu+o(v88f?U4*C9bmBcE0crh!SGy?GC3&h`dd
z?jE}R-S?ojWr@t~SY#XG*uA8iuZ<kJ{3APS_f_msD?X~Q#~~{?=GN(Wi5)eA86QP!
z9LWk*%0l{|Sx`ALI+yNWevto7D?O4i$#f>4rTG|sc}y|1slSr##)vvfvr>FQTC2^+
zLLi$<lhb)JnfFqW`FZ>1mlhx2&7&9&_{LpeAqdVgS2JjN8{DORtFb}neVTQ7Qi8iU
zcLVQ`yNjUH<b$g;6#~{e=||qT$CUJ3io30IcZZqD;`Vf7d7CyLD}nt>X>pH_k{M)5
zFrU*t_wByc$6^;7%r%E!GWa(6)Jrr<#pmhg*w-THRuor{lM&u2KVwY))qwpW&I+NM
zH+slnB4?iZ8CSnM-tpCJnLOZd|Ilp`Q-Q{76U~`h>=k-dckiNh<ityd6v$O>$-b?J
zZgS?}d^+FKLAT`x={5)E;=ygzcH7QgchQMI<rScQhyJPYbSwU83wN{Ebk5WKt7x>0
zX!wh|r~~gb0<YD7@vAw%6FpVnO+}|vz$W-Y`QJJ<Nz`PgLo(4#2S=XQfNKQx+U(vm
zu(IFiS>PiU;m%Mmi;ZCt(LvE$M;;7pdZ~-!bssyo-%{E|G}FxQJi<yaUi$E7LSm?Y
z+5x$X9zpwzdQ4jQ<qvbqUntK=74W_BF=Jl;jCV4dZm&FxR=-!u0xFF0=;loQLt~y7
z+K1-6uFNfM=<&iBQelV+X2q`upu{(EHo}?cSFAOeVXzlwlBvMO#&e*KI!&^FJjuXP
z6Ran~a7J<;uN~tq!k~Gg#+?z8eCHs;CO@eYFZ!&{r9V3CnQXXQda#Hqc!k|vGU-zA
z147O`3*oz%8Wk$3U|8gOc6D@<+Wi>-hMm4lD}e}weJyMMvgzIOm8jWAnjxFDSY6DT
zEBD&^JCj=yYvs{wC-enAwk@Ivx1ZR-rKadulVeLC6fFj}3S6EYf8tV6Sg==7WgAy_
z%2=s}*in_M?Wr|2=pW>LBJV|~xmRRlCViY~6fS?%Sg;iN^f>Xk#oO%sL>bwG!Uta1
zo^<5gHLyeBBFc2=4xPiuoinBCFQ#`22daFg!Ymcgh+T(D&^R<$&>*U;5h$Dm0*9e>
z;g00dR>))ewXdUbnRFxXUVsz0K>HUAo7quoN5@b>T}tcN96+OLfzPhQ&~QcZf#sCj
zM#`R2eM#(1Ed_o9*1Pv;uCazzOPmXjaSzRedvmnH`E;tXVg+=B&e0@yz1l^$L1KWf
zlL|dlKn-Rcs~4*Dq;uFP@Egl9cGEsc_k(TAZ921Y!%cTKDJsKKfi5WQI<SjwGT0=6
zGwICdhKq$RjjN^ZA|h|D;MjUxae*0^ew^GTR?H<g$rFq?*7^CqYt5U(cpKHN3_QBq
zGdp)rRneBJTHVb$7<=FyfjEBSk?O|Qtv!zT5?BAixI@;5Wc-YK&wRBQS_rgHJEqT0
zggq+@5cGO=&%0%_l-}9$kL>fYa5E_@{niig=QE5_@yE`T%KGJa%{?$eu+#t0;?5jD
zt?`aEtnI$l>^Rr^v9Z9(Fel$&PJ`zh9m%4bg~9`K_O6URUAVL2AEnxRU7$j6dh~GS
z)4=xqE-AbLG^YopFHIg)aOf9M8BRsFj~9QB>bdEZaVx@jdSU0&-o<4)VQIHTDx#!<
zypXtc;8NWrRW3TP-*m(@U+kM0-Hu*2lr(PMm+;KXvhk4}JmMC2_~RGhu~XsEH4;M%
z)8+K;grzq)A1v26{h?5sqA`y&%VRfNJ<pIodEf$MS<1PE+}K0X3I*~wRCtF!6pWLK
zKlUtW`+eR2wZUZwe>OI(^PUjX{kK+w3pmw+y1LHgqV~bE{^PRmcRPF98+9?)=G=1A
zU>>x>zbIATDFRcOqrxjS)P!7z*{GY}jHd_t`HqoW>>5Kpl(_SqHY#43HCUM2_JB{<
zQU;rqBCU8RO~T%4SmkK?UH0Ip*FLT|2^n3*)Jlo@KG>&Yu@gM=Lr>DLX=5@x@W)xu
zGMBdsJu<>&`^j+E<qfu=7umeYbT-k#ReRulMNa)Sa8moG&8$sX+^4s(?lRf9Qb9^Q
zzwh+c9W9@<d6HAuC$KAaKRySrU9n1(O7`)sAKiDZgq@C4FeUr8n4?{l0vD$zR@0=n
z(Lp}+R_(s!t<_?6tQvE%cm1<16Y?7l`d#Q%d@W`5jEab;z#`@KSi?VGWt`}bOU~`E
zNNXN$VB?VRxgj;AzE_AjOWFtBqP;PfRW8cWKWsJ~!LnE1JR#?Oq4SgaxAgH}i!HxQ
zZ%WI3Up5-`j)qoiiyIZvsCjqndNyy(@a!!J)`1_e<M7Z!b~VGP+oqD1Pn1vP3DI-r
z1$A64Brvce!wlO6V>7BnmN4-eauL~LAzG~nIX|3XShS{%q3w=6=Zr<KGwd_?_LN6$
ztBGrPZhMT-jnvL5x@pTjRfnc%K5*<7cf7_*s4B1XQ+KAvwubOdd_@pqQ}oa8HhM?H
z!2Rvv!ikSQ(}BlmoLUi-uV8<EI0Mt>nw>J_diLhAiCt`fJ?-i2a2EQ$-m7Y6r}Co7
zD+eVmGO%VMI;EcTMI^^*8#}$vB#cN{o0abn7ptu^#B-FGE76;I7_*|!6&bxc^QNAt
zRvUybf6S1R)G=i5jnNUfuNGB!@jb8UqGCYmr85FV_Kq^EA=8d~;k=prk4Z0kZ<Ler
zxrJ+FTMATLV@PKOJiq;zE>d#bKNy5~LPdR4P{MlHTU~Bq5R55$Q5xyVv0Bgqc*EW9
z*Wt_)zb}3xWZ5>1RI_DL8JddEM&j0CIl4)BE`ST^1MY-N8SD+8FYCJAw762j_*B2(
z4J}5nj?d&?2#?2(H&uH1b$th?<aVX#H1VYfnO`KtsBSkfHXx)J)Cfn2W!qhI%*?Wu
zxn?<1?b+Jg@Z8}^hfcg-{8H?jjn+$MZ}+Cnm_%GjK`w5#zO5P&XZUTO?w|wm!?F&d
zN??xJS67nZWNcN*^ZH(=sP1`KzjFH*Hsf%mcWW61Z^IfJLZjD#PUJ0(NUd<zfdH6e
z9{L;OxY`?XUC^DCnv_&sJ$9#A<m;%|T}SQVwS`ab*2Z^hM2=i3?T+j<80b)m=}J0%
zBSk*)h>qMi@~+z5@afV|!omw>-JRDM+Z3N4-XuHs_#{zutX1HJGjF>y@3>v%A<@f6
zf~<;KO2nUyonG0|8X!+Ee^jq8bpQDID+#X_QusXQGrb>Ggj5=<UM{K>^fC+BC%NCj
z{icd=FJtFi#WPOJTeLipH_<%-BikR@=vDIt;tUS#_3};lVYjO<(r2&UDY2yUCEL{U
ziaNU`VWWxXgh?`V&Tq3T$fT{i{V(QrW6z$hMH{-kt2e01Y7TZ(zP8M2<#s(-s+Uyo
zxP)oC%H6214_}dcC*9tIe3tum(aGL6Q!~?gTl70oJTG&Ji-T>GCr_3O^8nX=D+{aa
z5P{|IpFe(5J-$pM7jQIyidv|kB&Btj9~zCb0;ml^%T-()MP(CQt~*Rlj3p^uP(ho8
zfbe$lDspmj+30EyN5bnM44IkDO-F?qD&T27PLT2<hPLo%gRhH16LgkTdZvaY(ChIY
ziAL=7;PdYMxq^hoc9BVc)SJkFl@FmgKHLQLNb9hh;rZ$c2kCNnMMMQ^KGl|_C+OPm
zunw`b+0^+ex~#2BbTChDCfraaO4R&x=9Xl6qEBt;0FLeI)iYKX5-Lv@A`<7Sy?BN9
z7wp+~C}n5FT^Yr}u{xoS3r}wi7o4s?QAC4eros&sw15Tx{J&2qSmJ#ET6p&Zb_^cs
zGcf%eZjijxz*h2{ht8jDnWpMs9oQ8eX?^`Yd7=8u3ofPc!^FtD%$=oNA1tE`F5R{f
zeKMVex>AdEr|+=5&^%r=?%x@{v*p4?miC33sw=NMgc61Ac*XK{8d0rxr?#Z`e5`ca
zb3EO%F)`@mIoYT3?RE^7TZfcCoSM%MmcP0&=1x}nDVUH=EZ=d>PkH+B$L2O$+rb}v
z5H$3Qi#1OU4Geo>z=}U27ib|ei{6gpo_euWcTh7-A|(^wT1TFJ`K_-iZ~0lAQiF=j
zn>`Npgoit}en@MRL%j*Hn<amF+8)7znJ!qGe8S3wA;zb)F_yo;M-^mQ*aX#Tr?TBg
zwu^pY&utN4-nFR4(~u~^EtJ;rlASI8#D3wzxO2E++KGal-n@+Lqt{yY-3;U1#eXnm
zo~FcjD(ub5l~32}`};+NK7_VWQ4STHf?AI|k+(1;(P=W>>Xn^6>2~oYAwxELe5If1
z9^*!~(;8%l(Y(0xa5>xCuaiqXT3(`&;V&7YkIVLZ+l^~j!wm`M*!;jxA+y5I{4<%m
zQ=e9{x|MBOijy9kY8^oexYza*A9K8SyaLYUbe8SCE5Xfg{KQ$zoqcB(N})PHT?sy^
zXP4oe<YTi1&6R+OG!Tuve72~EUXv@XLN6Xi;`x3-F_9d#JN^>#eSp08{$3O1gMIJq
z%07>$2|M~LNbxI{3C@1S(eYb&mVMSRX#C-`>1)8%)R#d<lRbQo#*W<Y{M>Bq)Oxnx
zM+5g{^yTfCvF~wLCMOq`==MA9iJ>AxYJf^<9p1tu$&g)}@~OF*|I$fx<D^={<vNe)
zhH`DjLt#tfa6dJ*K7Vlqvg=e_g6vtrvBqm~!sr9#mChSG?jkjZ3gqoF0^6T=a|JY`
zkhC>=ozK+@8aK7?W#V$p5UFp3Z*xpsu=!%2I@?>TI+R#f?Y3XNpfUD6Q)?xSZRn4>
z_AsO7g?*34Gh`$P+|i%I-fpwRspfNBw|-B$eCpcJSy?~c*`){CA3rc$7gKvlg>5Q8
zOKBY&f9UHNOR$2+fpZyddaz>2iGR*z5ItOBZm<0W?6alq>^2BUx=omiW=07r5Uc|7
zXf$-&ik-Gx-We?m0hBXIT+>uYqJoXqb5uk<PL(|rK)?BO&H5h4#Tm6?!iG;=kgh)c
zV^(~i^?{Q^GHq>R=lG72N2FeH)uS9|MaONoI$W=~_b{9Fbaq?=+sh%7ewJcoek$Kf
zu~a}43b_tBVJ-0NU}5-vo_AS8JWesltYG{7<%aul4{U^+_C{5wiDrGaOuZ!f+G<Bg
zmcqHZ!T#<8n91~N`GC+HHMbSZQyOF4Vj`M6GjGsjtI=rpKe%L}%`e>d_=45;%0c=a
zr)qsR4H2$~Wz>jH#=>2;3&?5SFVW@Z^X`=@!Zv^<(s0Iyp$@AHZRwlCHqzKA3-(X-
z&zw;=lYBttMLjOXc5P<n*4?JIkWl=D`<(;kA&Tp98)w-8y4$Kh5T+Y8aQk=Bx}M*a
zc5VEwZ`JIzjkb%+H0SlL)7R<@h~i(Y^)n2F65FAu<s3j<&6!wO=-9~W5Dz%p9I|ru
z34nYDDS)i);Y<RJ6^)%&^C&D9rU<psAqEggo_=Is5*!J+A4|BR)gc!$CCCIC1Nc%?
zXKxUyq4jsyKig~l49}29B2Y-UJO-@ekxF305Qzd_7=&g(Lzp;|NuHD#4g~}P0w&<^
zpPVC15hBhUgqDDa4QPG<aUZ%qAOHgjJoz`^<yR3*QODEGjR?X<pg55|aOj{)fS)q~
zgs=eq{GfOeFQOZHHB0hz_aMWeFRTP#Z*OO~qDQcw2hj&SC6YXSUEvt86b~Vid|`^d
zK7dE^VPEi%2Z;#sZXgr|js?~iKn#Rh5Gh?jTRsF|FJB)x4m{EXVu9m8kq(@KuHZ^&
zn4%#ZdXbFbiY9PnKpZ4c5Gn(Q4*fX5@nEY8kl_M)fh)R#Uco>hPz4a^0ax?{InbFG
zT+th@=mThiBY{p7Nq`1G7F_WVTrm(x7IYc{PBnt?6h&K4SF#6?8w7)Xk947^^4B4V
ze@*2#Wm7csKQs%}=H=`j0A#x^9cu;<pdcPR3VaQMMu2YQQP3{9wzHqXPq`srigv%U
zAVdfRFL@F)ecZi>fN(`CGSS-x%9uC@Q8)q&VNk!v5B-99d*C<(;{Qj2{lE5u2EY{q
ztQZ4UbINFCI1Y`2;lV0MNf`l0fk_<=^b92|pc8ry)klL<XOt^i39A2pOR!(h(V#Ar
zpgw*jEciZ&L_h*i#()Gm!Yjj-faT%9TYv?FD?tg1f}`+Q7)ZcIalnK@C*ZYI0=BNC
z3<yAgJamPgqd@|}SxTTi)F)mU53WDUpeuzLv=R)uLPS7JK?w&84r+@9jQ{Kd)CJ>G
z@?iCc0>@mSgjK>*u6W?%aiAQBL;(hWj*9`~V!@n&#zXBea4fhg17Sf|N&>N6kT4)2
zKv!sdiqaqf0vdooVbGXZ(1s$EU&KLIiv2(%VW1lXBqE4xh&AX65=9@-a6c0ayQ;H)
zy84*`1pe#-1vm!Hun<*%a;Q0E2EWwx>k6oa{UU_Y14ao{0!Ne~@*z9;r+=sxR7f!<
z&=Yk3PcINr2n3AsY_$VC4iEp8AVc_R1CSm3nJBUbb^+PMZ;4_LkWKtb6kA!f3CJFP
zPY_*{d(a1D8^037ISzpZIs~*rHb5oe!K6pA3*fGBzolO`@#_i+1?U|{xkCBh5=GZP
zpF?&2NswJYT7|Cv9_#)cRB8hELzxS~#0-Q;@n<@4B!n!$M63s=9FYY%LpabZrOX%(
zCR9q!1P+u8axgGwqkj29I8Zp{sWlu$@lMbzj#?eV8ID>V!v)CoXFUR(a!>{I^3zWM
zRZ-gibU;9ulsi{o*FSSaIAD-c_H(8L>{0IAfVn_9*iTOc-U3Ro2OQ|;XYQvfL$8kO
z2`uU711~r@j7X{C4F}UGr4JuCW_3JY$Q@FiLM{)pI@-UE6SK-CrDWBAQpSe;<}@ko
zNpSG}pOOmz*1DRbI7_fB`dLnK%-|)Z+>t@pCX|D%x>)cL>t{Ley1>y=@`vDH%A(`~
z!R+xf7Xk-fmr@3~YM^||&A-8}P^Z6qogR3@es=Zmb7+`f&oz<1V~T&mUlp}AK_x|k
zGuX(0{HiAEe^vuWME<RY{-4zl3J_9o21rL4^!S@EgQ#1j=3f#ap4SvY?5`<=WciDk
zpN{=!1hAr!rz?bqz#9sQiV_uj2smO2KdX@vz-9g0SQ|vPLa1*$fM<SQzGN>?7kD6A
z0k43PgL{z4egP_qir)V`Qt&0YOT!caOmjU%0D-c<xAJp!gS$8r4uVludyodMAL`!I
z*GC76F5a%Af<mCcj1RV#aR?Lw>mUummsPTYNJKXnG@-zNZ(IEV*M$Xdf*bswG6)+&
zKe*3o84?5m0}=gIhJ-Bf?=mnY{auEJ-tfQbp`o|o?=o<9^v^PI2e9AYWoYRAf`0$d
zhd?Q>85aSV2e$loJ>VDqEJLEPU|RpX46U?=Uz9R5d;QTL5`gf(>mg8h5UTcPJp`Hx
zA7Hy_Z9N<`$NVt{0u8x`zsvAY)XksuFo?Bs!2knTQx8j}Eq;xT5Lg6m4SfJbtSy6F
z#9y=k7r%xN914QUf6@#*%-VVw{Ms^Q0HywFtAvF7#h?8tVexD0DX*2Y5*{oi{%WgC
zMOMma$fx`{hB5{~zrV|{U?KK*8GbE&c;p)UL*Oycto`R0cpNw;{CAl$<n8{b2M`W4
zv;S3wTx+`s%o?48^$+B~|7we0qYorlGX1R&BoO`IGC(46Yh;Q<;@8*)SX8aCaj=91
z5c@A$P*})Y|4AP>l=vIXWRkO|7m-ByzZ$GOLx^Br1B*akUuY_(I59&XH(!9FC^ujd
Y3?Ms`$doA%iN)fTVG<I$W_qyy1EW(3`2YX_

literal 0
HcmV?d00001

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 92d386c84..9b5230ffd 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -211,6 +211,7 @@ struct InferenceResult {
                       BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
   InferenceResult() : num_token_ids(0), num_gumbel_logits(0) {}
   InferenceResult(InferenceResult const &other);
+  friend std::ostream &operator<<(std::ostream &os, InferenceResult const &ir);
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index e1f3f1904..795f615d1 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -152,6 +152,7 @@ class FFConfig {
   Legion::Runtime *lg_hlr;
   Legion::IndexSpaceT<1> all_gpu_task_is;
   // Legion::FieldSpace field_space;
+  bool log_instance_creation;
   bool benchmarking, profiling, perform_fusion;
   bool inference_debugging;
   size_t simulator_work_space_size;
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
index 4b1120887..a866e52cb 100644
--- a/include/flexflow/inference.h
+++ b/include/flexflow/inference.h
@@ -68,6 +68,7 @@ struct GenerationResult {
   std::vector<TokenId> output_tokens;
   double slo_ratio;
   double emission_time_ms;
+  int decoding_steps;
 };
 
 // Contains the configuration for how to emit requests to the server,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 09a8dafc7..825c8e995 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -253,9 +253,7 @@ enum TaskIDs {
   RM_PREPARE_NEXT_BATCH_SPEC_TASK_ID,
   RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
   RM_BACKGROUND_SERVING_TASK_ID,
-  LOAD_FLOAT_WEIGHT_TASK_ID,
-  LOAD_HALF_WEIGHT_TASK_ID,
-  LOAD_QUANT_WEIGHT_TASK_ID,
+  LOAD_WEIGHT_TASK_ID,
   // Custom tasks
   CUSTOM_GPU_TASK_ID_FIRST,
   CUSTOM_GPU_TASK_ID_1,
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h
index 625cc9ee2..e4e077e78 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -43,6 +43,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _position_bias,
                                 bool allocate_weights,
                                 bool _streaming_cache,
+                                int _tensor_parallelism_degree,
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 ParallelTensor const _input,
@@ -63,6 +64,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
                                 bool _position_bias,
                                 bool allocate_weights,
                                 bool _streaming_cache,
+                                int _tensor_parallelism_degree,
                                 char const *name);
   SpecIncMultiHeadSelfAttention(FFModel &model,
                                 SpecIncMultiHeadSelfAttention const &other,
diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
index 87f509831..75cb576dc 100644
--- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
+++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -9,7 +9,8 @@ namespace FlexFlow {
 
 struct SpecIncMultiHeadSelfAttentionParams {
   LayerID layer_guid;
-  int embed_dim, num_q_heads, num_kv_heads, kdim, vdim;
+  int embed_dim, num_q_heads, num_kv_heads, kdim, vdim,
+      tensor_parallelism_degree;
   float dropout, scaling_factor;
   bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
       position_bias;
diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h
index 4917df73c..35f0c8542 100644
--- a/include/flexflow/optimizer.h
+++ b/include/flexflow/optimizer.h
@@ -61,8 +61,8 @@ class SGDOptimizer : public Optimizer {
                        std::vector<Legion::PhysicalRegion> const &regions,
                        Legion::Context ctx,
                        Legion::Runtime *runtime);
-  static void nccl_update_task_gpu(Context ctx,
-                                   Runtime *runtime,
+  static void nccl_update_task_gpu(Legion::Context ctx,
+                                   Legion::Runtime *runtime,
                                    SGDOptimizer const *op,
                                    OpMeta const *meta,
                                    float const *w_grad_ptr,
@@ -106,8 +106,8 @@ class AdamOptimizer : public Optimizer {
                        std::vector<Legion::PhysicalRegion> const &regions,
                        Legion::Context ctx,
                        Legion::Runtime *runtime);
-  static void nccl_update_task_gpu(Context ctx,
-                                   Runtime *runtime,
+  static void nccl_update_task_gpu(Legion::Context ctx,
+                                   Legion::Runtime *runtime,
                                    AdamOptimizer const *op,
                                    OpMeta const *meta,
                                    float const *w_grad_ptr,
diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
index b8af8e833..3436fc2a6 100644
--- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
+++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h
@@ -31,15 +31,15 @@ class AllReduceMeta : public OpMeta {
 namespace Kernels {
 namespace AllReduce {
 
-void inference_kernel_wrapper(Context ctx,
-                              Runtime *runtime,
+void inference_kernel_wrapper(Legion::Context ctx,
+                              Legion::Runtime *runtime,
                               AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output);
 
-void forward_kernel_wrapper(Context ctx,
-                            Runtime *runtime,
+void forward_kernel_wrapper(Legion::Context ctx,
+                            Legion::Runtime *runtime,
                             AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index b76291129..f86b234a0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -148,7 +148,6 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
-  // TokenTree speculative_token_tree;
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
@@ -231,6 +230,50 @@ struct Request {
   }
 };
 
+struct NewProfileInfo {
+  long long timestamp;
+  BatchConfig::RequestGuid request_guid;
+  int request_step_idx;
+  int num_speculated_tokens;
+  int num_accepted_tokens;
+  double speculation_score;
+  int num_generated_tokens;
+  long long speculation_start_timestamp;
+  long long speculation_end_timestamp;
+};
+struct RequestProfileInfo {
+  int llm_prefilling_steps = 0;
+  int ssm_prefilling_steps = 0;
+  int llm_decoding_steps = 0;
+  int ssm_decoding_steps = 0;
+  long long start_time = 0, start_decoding_time = 0, finish_time = 0;
+  long long speculation_start_timestamp;
+  long long speculation_end_timestamp;
+  std::vector<int> speculated_size_per_step;
+  std::vector<int> accepted_tokens_per_step;
+  std::vector<int> generated_tokens_per_step__;
+};
+struct ProfileInfo {
+  // For SpecInfer: One step is comprised of one ssm speculation phase + a
+  // single llm verification phase (forward pass + verification) For Incr
+  // Decoding: One step is one LLM decoding phase
+  long long llm_step_start = 0, ssm_step_start = 0;
+  // Times for each LLM verification phase (in ms)
+  std::vector<double> llm_step_times;
+  // Number of requests in batch at each step
+  std::vector<int> requests_per_step;
+  // Times for each SSM speculation phase (in ms)
+  std::vector<double> ssm_step_times;
+  // Number of requests getting decoded at each step
+  std::vector<int> ssm_steps;
+  std::vector<double> tree_operation_step_times;
+  // Number of generated tokens at each step
+  std::vector<int> generated_tokens_per_step;
+  // To calculate the E2E time of serving
+  long long server_start_time = 0;
+  long long server_end_time = 0;
+};
+
 class RequestManager {
 public:
   enum State {
@@ -283,6 +326,8 @@ class RequestManager {
   void set_max_tree_depth(int max_tree_depth);
   int get_max_tree_width();
   void set_max_tree_width(int max_tree_width);
+  int get_expansion_degree();
+  void set_expansion_degree(int expansion_degree_);
   void set_speculative_sampling(bool speculative_sampling);
   void set_baseline_latency(double baseline_latency_ms);
   double get_baseline_latency();
@@ -309,7 +354,7 @@ class RequestManager {
   int register_ssm_model(FFModel *model);
   void register_tokenizer(ModelType model_type,
                           int bos_token_id,
-                          int eos_token_id,
+                          std::vector<int> eos_token_ids,
                           std::string const &path);
   std::vector<int32_t> tokenize(std::string const &text);
   void register_output_filepath(std::string const &);
@@ -329,6 +374,7 @@ class RequestManager {
   static void terminate_background_server_at_exit();
   // Methods to check and mark request completion
   void trigger_request_completion_future(RequestGuid const &guid);
+  bool is_eos_token(TokenId token_id);
   static void background_serving_task(
       Legion::Task const *task,
       std::vector<Legion::PhysicalRegion> const &regions,
@@ -366,6 +412,12 @@ class RequestManager {
   int get_num_active_requests();
   int get_empty_request_index();
 
+  std::unordered_map<RequestGuid, RequestProfileInfo> get_requests_profiling();
+  std::unordered_map<RequestGuid, GenerationResult>
+      get_request_generation_results();
+  ProfileInfo get_profiling_info();
+  std::vector<NewProfileInfo> get_new_profiling_info();
+
   // Comparters
   struct SharedTokenTreeNodePtrRequestGuidWeightedLess {
     bool operator()(
@@ -393,6 +445,7 @@ class RequestManager {
   int max_tree_depth;
   int max_tree_width;
   int k;
+  int expansion_degree = 3;
   // Profile based latency
   double baseline_latency_ms = 43;
   double ssm_spec_latency_ms = 17;
@@ -416,9 +469,9 @@ class RequestManager {
   bool verbose;
   ModelType model_type;
   int bos_token_id;
-  int eos_token_id;
+  std::vector<int> eos_token_ids;
   bool old_llama_tokenizer = false;
-  std::string output_filepath;
+  std::string output_filepath, csv_filepath;
   std::queue<Request> pending_request_queue;
   std::unordered_map<RequestGuid, Request> all_requests;
   std::unordered_map<RequestGuid, GenerationResult> request_generation_results;
@@ -457,34 +510,9 @@ class RequestManager {
   // TODO: maintain this field
   size_t num_processed_requests;
 
-  struct RequestProfileInfo {
-    int llm_prefilling_steps = 0;
-    int ssm_prefilling_steps = 0;
-    int llm_decoding_steps = 0;
-    int ssm_decoding_steps = 0;
-    long long start_time = 0, start_decoding_time = 0, finish_time = 0;
-  };
-  struct ProfileInfo {
-    // For SpecInfer: One step is comprised of one ssm speculation phase + a
-    // single llm verification phase (forward pass + verification) For Incr
-    // Decoding: One step is one LLM decoding phase
-    long long llm_step_start = 0, ssm_step_start = 0;
-    // Times for each LLM verification phase (in ms)
-    std::vector<double> llm_step_times;
-    // Number of requests in batch at each step
-    std::vector<int> requests_per_step;
-    // Times for each SSM speculation phase (in ms)
-    std::vector<double> ssm_step_times;
-    // Number of requests getting decoded at each step
-    std::vector<int> ssm_steps;
-    // Number of generated tokens at each step
-    std::vector<int> generated_tokens_per_step;
-    // To calculate the E2E time of serving
-    long long server_start_time = 0;
-  };
-
   ProfileInfo profiling;
   std::unordered_map<RequestGuid, RequestProfileInfo> profiling_requests;
+  std::vector<NewProfileInfo> new_profiling_info;
   double total_request_run_time;
   bool load_pending_request_to_batch();
   void request_update_attainment(int index, bool attained);
diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h
index 3c14284d6..5935c4859 100644
--- a/include/flexflow/utils/communication_buffer.h
+++ b/include/flexflow/utils/communication_buffer.h
@@ -24,6 +24,7 @@
 #include <rccl/rccl.h>
 #endif
 #endif
+#include "legion.h"
 
 // adapted from https://github.com/mlc-ai/relax
 
@@ -58,7 +59,9 @@ class CommunicationBuffer {
   int *barrier_flag;
 };
 
-CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
+                                                    Legion::Runtime *runtime,
+                                                    int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h
index a6771ee6a..4ccc6db48 100644
--- a/include/flexflow/utils/file_loader.h
+++ b/include/flexflow/utils/file_loader.h
@@ -21,6 +21,7 @@
 
 using namespace std;
 using namespace FlexFlow;
+using namespace Legion;
 
 class FileDataLoader {
 public:
@@ -36,29 +37,31 @@ class FileDataLoader {
   BatchConfig::TokenId *generate_requests(int num, int length);
 
   template <typename DT>
-  void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx);
+  void load_single_weight_tensor(FFModel *ff,
+                                 Layer *l,
+                                 int weight_idx,
+                                 size_t volume,
+                                 size_t num_replicas,
+                                 DT *weight,
+                                 Domain weight_domain);
 
-  void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx);
-#ifdef DEADCODE
-  void load_weights(FFModel *ff);
-#endif
+  void load_quantization_weight(FFModel *ff,
+                                Layer *l,
+                                int weight_idx,
+                                size_t volume,
+                                size_t num_replicas,
+                                char *weight,
+                                DataType data_type,
+                                Domain weight_domain);
 
   static void
-      load_float_weight_task(Legion::Task const *task,
-                             std::vector<Legion::PhysicalRegion> const &regions,
+      load_weight_task(Legion::Task const *task,
+                       std::vector<Legion::PhysicalRegion> const &regions,
+                       Legion::Context ctx,
+                       Legion::Runtime *runtime);
+  void load_weights_parallel(FFModel *ff,
                              Legion::Context ctx,
                              Legion::Runtime *runtime);
-  static void
-      load_half_weight_task(Legion::Task const *task,
-                            std::vector<Legion::PhysicalRegion> const &regions,
-                            Legion::Context ctx,
-                            Legion::Runtime *runtime);
-  static void
-      load_quant_weight_task(Legion::Task const *task,
-                             std::vector<Legion::PhysicalRegion> const &regions,
-                             Legion::Context ctx,
-                             Legion::Runtime *runtime);
-  void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime);
 
   void load_positions(FFModel *ff,
                       Tensor pt,
@@ -79,6 +82,15 @@ struct WeightLoadTaskArgs {
   FileDataLoader *loader;
   Layer *layer;
   int weight_idx;
-  WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx)
-      : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx) {}
+  size_t volume, num_replicas;
+  DataType data_type;
+  WeightLoadTaskArgs(FFModel *_ff,
+                     FileDataLoader *_loader,
+                     Layer *_l,
+                     int _idx,
+                     size_t _volume,
+                     size_t _num_replicas,
+                     DataType _data_type)
+      : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume),
+        num_replicas(_num_replicas), data_type(_data_type) {}
 };
diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h
index 8e50a4c3b..af3327b04 100644
--- a/include/flexflow/utils/memory_allocator.h
+++ b/include/flexflow/utils/memory_allocator.h
@@ -23,7 +23,9 @@ namespace FlexFlow {
 class MemoryAllocator {
 public:
   MemoryAllocator(Legion::Memory memory);
-  void create_legion_instance(Realm::RegionInstance &inst, size_t size);
+  void create_legion_instance(Realm::RegionInstance &inst,
+                              size_t size,
+                              char const *task_name = NULL);
   void register_reserved_work_space(void *base, size_t size);
   inline void *allocate_reserved_untyped(size_t datalen) {
     void *ptr = static_cast<char *>(reserved_ptr) + reserved_allocated_size;
@@ -60,6 +62,7 @@ class MemoryAllocator {
   void *instance_ptr;
   size_t reserved_total_size, reserved_allocated_size;
   size_t instance_total_size, instance_allocated_size;
+  bool log_instance_creation;
 };
 
 }; // namespace FlexFlow
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 2b2db8b95..959535e0d 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -276,7 +276,8 @@ void FlexFlow::top_level_task(Task const *task,
   ModelType model_type = ModelType::UNKNOWN;
   auto architectures = model_config["architectures"];
   for (auto const &str : architectures) {
-    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "MistralForCausalLM") {
       model_type = ModelType::LLAMA;
       break;
     } else if (str == "OPTForCausalLM") {
@@ -296,9 +297,21 @@ void FlexFlow::top_level_task(Task const *task,
   int bos_token_id = model_config.find("bos_token_id") == model_config.end()
                          ? -1
                          : (int)model_config.at("bos_token_id");
-  int eos_token_id = model_config.find("eos_token_id") == model_config.end()
-                         ? -1
-                         : (int)model_config.at("eos_token_id");
+  // int eos_token_id = model_config.find("eos_token_id") == model_config.end()
+  //                        ? -1
+  //                        : (int)model_config.at("eos_token_id");
+  std::vector<int> eos_token_ids;
+  if (model_config.find("eos_token_id") != model_config.end()) {
+    if (model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : model_config["eos_token_id"]) {
+        eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      eos_token_ids.push_back(model_config["eos_token_id"]);
+    }
+  } else {
+    eos_token_ids.push_back(-1);
+  }
 
   assert(model_type != ModelType::UNKNOWN &&
          "Invalid LLM model type passed (or no type was passed).");
@@ -322,7 +335,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_verbose(verbose);
   rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(
-      model_type, bos_token_id, eos_token_id, tokenizer_filepath);
+      model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
 
   FFModel model(ffconfig, ffconfig.cpu_offload);
diff --git a/inference/simplified_infer/CMakeLists.txt b/inference/simplified_infer/CMakeLists.txt
new file mode 100644
index 000000000..35ee40711
--- /dev/null
+++ b/inference/simplified_infer/CMakeLists.txt
@@ -0,0 +1,74 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(FlexFlow_SpecInfer)
+set(project_target1 specinfer)
+
+
+set(CPU_SRC1
+  ${FLEXFLOW_CPP_DRV_SRC}
+  specinfer.cc
+  ../models/llama.cc
+  ../models/opt.cc
+  ../models/falcon.cc
+  ../models/mpt.cc)
+
+if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+  cuda_add_executable(${project_target1} ${CPU_SRC1})
+  if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+    target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__)
+  endif()
+elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+  set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP)
+  hip_add_executable(${project_target1} ${CPU_SRC1})
+  if (FF_HIP_ARCH STREQUAL "")
+    message(FATAL_ERROR "FF_HIP_ARCH is empty!")
+  endif()
+  set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
+  target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__)
+else()
+  message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported")
+endif()
+
+target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
+target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
+target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
+
+set(BIN_DEST "bin")
+install(TARGETS ${project_target1} DESTINATION ${BIN_DEST})
+
+
+project(FlexFlow_IncrDecoding)
+set(project_target3 incr_dec)
+
+
+set(CPU_SRC3
+  ${FLEXFLOW_CPP_DRV_SRC}
+  incr_dec.cc
+  ../models/llama.cc
+  ../models/opt.cc
+  ../models/falcon.cc
+  ../models/mpt.cc)
+
+if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+  cuda_add_executable(${project_target3} ${CPU_SRC3})
+  if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+    target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__)
+  endif()
+elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+  set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP)
+  hip_add_executable(${project_target3} ${CPU_SRC3})
+  if (FF_HIP_ARCH STREQUAL "")
+    message(FATAL_ERROR "FF_HIP_ARCH is empty!")
+  endif()
+  set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}")
+  target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__)
+else()
+  message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported")
+endif()
+
+target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR})
+target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference)
+target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES})
+
+set(BIN_DEST "bin")
+install(TARGETS ${project_target3} DESTINATION ${BIN_DEST})
diff --git a/inference/simplified_infer/incr_dec.cc b/inference/simplified_infer/incr_dec.cc
new file mode 100644
index 000000000..ed6125d0f
--- /dev/null
+++ b/inference/simplified_infer/incr_dec.cc
@@ -0,0 +1,473 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
+#include "models/falcon.h"
+#include "models/llama.h"
+#include "models/mpt.h"
+#include "models/opt.h"
+#include <cassert>
+#include <wordexp.h>
+
+using namespace FlexFlow;
+using namespace Legion;
+using json = nlohmann::json;
+
+Legion::Logger log_app("llama");
+
+struct FilePaths {
+  std::string cache_folder_path;
+  std::string trace_file_path;
+  std::string trace_output_path;
+  std::string log_file_path;
+  std::string csv_file_path;
+};
+
+void parse_input_args(char **argv,
+                      int argc,
+                      FilePaths &paths,
+                      std::string &llm_model_name,
+                      bool &use_full_precision,
+                      bool &verbose,
+                      int &max_requests_per_batch,
+                      int &max_tokens_per_batch,
+                      int &max_sequence_length,
+                      int &max_output_length,
+                      bool &do_sample,
+                      int &request_per_second,
+                      bool &add_special_tokens,
+                      std::string &target_partition) {
+  for (int i = 1; i < argc; i++) {
+    // llm model type
+    if (!strcmp(argv[i], "-llm-model")) {
+      llm_model_name = std::string(argv[++i]);
+      for (char &c : llm_model_name) {
+        c = std::tolower(c);
+      }
+      continue;
+    }
+    // cache folder
+    if (!strcmp(argv[i], "-cache-folder")) {
+      paths.cache_folder_path = std::string(argv[++i]);
+      continue;
+    }
+    // traces
+    if (!strcmp(argv[i], "-trace")) {
+      paths.trace_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-trace-output-path")) {
+      paths.trace_output_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-target-partition")) {
+      target_partition = std::string(argv[++i]);
+      continue;
+    }
+    // output file
+    if (!strcmp(argv[i], "-log-output-path")) {
+      paths.log_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-csv-output-path")) {
+      paths.csv_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--use-full-precision")) {
+      use_full_precision = true;
+      continue;
+    }
+    // verbose logging to stdout
+    if (!strcmp(argv[i], "--verbose")) {
+      verbose = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--do-sample")) {
+      do_sample = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-requests-per-batch")) {
+      max_requests_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tokens-per-batch")) {
+      max_tokens_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-sequence-length")) {
+      max_sequence_length = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-output-length")) {
+      max_output_length = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--request-per-second")) {
+      request_per_second = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--add-special-tokens")) {
+      add_special_tokens = true;
+      continue;
+    }
+  }
+  if (paths.cache_folder_path.empty()) {
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
+  }
+  // Expand ~ to the home directory if needed
+  wordexp_t p;
+  wordexp(paths.cache_folder_path.c_str(), &p, 0);
+  paths.cache_folder_path = p.we_wordv[0];
+  wordfree(&p);
+}
+
+void FlexFlow::top_level_task(Task const *task,
+                              std::vector<PhysicalRegion> const &regions,
+                              Context ctx,
+                              Runtime *runtime) {
+  FFConfig ffconfig;
+  if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) {
+    assert(false && "Doesn't support quantization in non-offload mode");
+  }
+  FilePaths file_paths;
+  std::string llm_model_name;
+  bool use_full_precision = false;
+  bool verbose = false;
+  bool do_sample = false;
+  int max_requests_per_batch = 8;
+  int max_tokens_per_batch = 128;
+  int max_sequence_length = 512;
+  int max_output_length = 512;
+  int num_warmup_requests = 0;
+  double warmup_delay = 15.0;
+  RequestManager::DecodingMode decoding_mode =
+      RequestManager::INCREMENTAL_DECODING;
+  int sampling_seed = 0;
+  int request_per_second = -1;
+  bool add_special_tokens = false;
+  std::string target_partition = "FEATURE_EXTRACTION";
+
+  InputArgs const &command_args = HighLevelRuntime::get_input_args();
+  char **argv = command_args.argv;
+  int argc = command_args.argc;
+  parse_input_args(argv,
+                   argc,
+                   file_paths,
+                   llm_model_name,
+                   use_full_precision,
+                   verbose,
+                   max_requests_per_batch,
+                   max_tokens_per_batch,
+                   max_sequence_length,
+                   max_output_length,
+                   do_sample,
+                   request_per_second,
+                   add_special_tokens,
+                   target_partition);
+
+  assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
+             ffconfig.pipeline_parallelism_degree ==
+         ffconfig.numNodes * ffconfig.workersPerNode);
+
+  // Get dataset
+  std::ifstream input_file(file_paths.trace_file_path);
+  assert(input_file.good() && "Prompt file does not exist.");
+  nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file);
+  input_file.close();
+
+  // Find the partition with name "FEATURE_EXTRACTION"
+  auto &partitions = j["partitions"];
+  auto it =
+      std::find_if(partitions.begin(),
+                   partitions.end(),
+                   [target_partition](nlohmann::ordered_json const &partition) {
+                     return partition["partition_name"] == target_partition;
+                   });
+  nlohmann::ordered_json &partition = *it;
+  if (it == partitions.end()) {
+    std::cerr << "Partition " << target_partition
+              << " not found in the trace file." << std::endl;
+    assert(false);
+  }
+  // check that the max prompt + response length sum in the eval_entries in the
+  // partition does not exceed the max_sequence_length
+  int max_prompt_response_length = 0;
+  for (auto &eval_entry : partition["eval_entries"]) {
+    int prompt_length = eval_entry["prompt_length"];
+    int response_length = eval_entry["response_length"];
+    if (response_length >= max_output_length) {
+      std::cerr << "Error: A response length from the targt partition in the "
+                   "dataset (="
+                << response_length
+                << ") exceeds the max_output_length(=" << max_output_length
+                << ")." << std::endl;
+      assert(false);
+    }
+    max_prompt_response_length =
+        std::max(max_prompt_response_length, prompt_length + response_length);
+  }
+  if (max_prompt_response_length >= max_sequence_length) {
+    std::cerr << "Error: max prompt + response length sum (="
+              << max_prompt_response_length
+              << ") in the eval_entries in the partition exceeds the "
+                 "max_sequence_length(="
+              << max_sequence_length << ")." << std::endl;
+    assert(false);
+  }
+
+  // Get model configs
+  std::string config_filepath = join_path(
+      {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"});
+  std::string tokenizer_filepath =
+      join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name});
+  std::string weights_filepath =
+      join_path({file_paths.cache_folder_path,
+                 "weights",
+                 llm_model_name,
+                 use_full_precision ? "full-precision" : "half-precision"});
+  std::ifstream config_file_handle(config_filepath);
+  if (!config_file_handle.good()) {
+    std::cout << "Model config file " << config_filepath << " not found."
+              << std::endl;
+    assert(false);
+  }
+  json model_config = json::parse(config_file_handle,
+                                  /*parser_callback_t */ nullptr,
+                                  /*allow_exceptions */ true,
+                                  /*ignore_comments */ true);
+  ModelType model_type = ModelType::UNKNOWN;
+  auto architectures = model_config["architectures"];
+  for (auto const &str : architectures) {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "MistralForCausalLM") {
+      model_type = ModelType::LLAMA;
+      break;
+    } else if (str == "OPTForCausalLM") {
+      model_type = ModelType::OPT;
+      break;
+    } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") {
+      model_type = ModelType::FALCON;
+      break;
+    } else if (str == "MPTForCausalLM") {
+      model_type = ModelType::MPT;
+      break;
+    }
+  }
+  int bos_token_id = model_config.find("bos_token_id") == model_config.end()
+                         ? -1
+                         : (int)model_config.at("bos_token_id");
+  std::vector<int> eos_token_ids;
+  if (model_config.find("eos_token_id") != model_config.end()) {
+    if (model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : model_config["eos_token_id"]) {
+        eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      eos_token_ids.push_back(model_config["eos_token_id"]);
+    }
+  } else {
+    eos_token_ids.push_back(-1);
+  }
+
+  assert(model_type != ModelType::UNKNOWN &&
+         "Invalid LLM model type passed (or no type was passed).");
+
+  // set request manager properties
+  srand(sampling_seed);
+  GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16);
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->set_max_requests_per_batch(max_requests_per_batch);
+  rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch);
+  rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_output_length(max_output_length);
+  rm->set_decoding_mode(decoding_mode);
+  rm->set_slo_violation_early_termination(false);
+  rm->set_baseline_latency(50);
+  rm->set_ssm_spec_latency(20);
+  rm->set_llm_verify_latency(50);
+  rm->set_spec_infer_old_version(true);
+  rm->set_greedy_schedule(false);
+  rm->set_equal_schedule(false);
+  rm->set_max_tree_depth(8);
+  rm->set_max_tree_width(16);
+  rm->set_verbose(verbose);
+  rm->set_streaming_cache(false);
+  rm->register_tokenizer(
+      model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
+  rm->register_output_filepath(file_paths.log_file_path);
+
+  FFModel model(ffconfig, ffconfig.cpu_offload);
+  if (model_type == ModelType::LLAMA) {
+    LLAMA::create_llama_model(model,
+                              config_filepath,
+                              weights_filepath,
+                              INC_DECODING_MODE,
+                              generationConfig,
+                              false,
+                              use_full_precision);
+  } else if (model_type == ModelType::OPT) {
+    OPT::create_opt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          use_full_precision);
+  } else if (model_type == ModelType::FALCON) {
+    FALCON::create_falcon_model(model,
+                                config_filepath,
+                                weights_filepath,
+                                INC_DECODING_MODE,
+                                use_full_precision);
+  } else if (model_type == ModelType::MPT) {
+    MPT::create_mpt_model(model,
+                          config_filepath,
+                          weights_filepath,
+                          INC_DECODING_MODE,
+                          generationConfig,
+                          use_full_precision);
+  } else {
+    assert(false && "unknow model type");
+  }
+
+  rm->start_background_server(&model);
+
+  int total_num_requests = 0;
+  {
+    // Iterate through eval_entries
+    std::vector<GenerationRequest> requests;
+    std::vector<double> timestamps, ratios;
+    if (partition.contains("num_warmup_requests")) {
+      num_warmup_requests = partition["num_warmup_requests"];
+    }
+    for (auto &entry : partition["eval_entries"]) {
+      std::string text = entry["prompt"];
+      int max_new_tokens_ = entry["response_length"];
+
+      bool is_warmup_request = total_num_requests < num_warmup_requests;
+      double request_delay =
+          1000.0 *
+          (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0);
+      double emission_time_ms =
+          is_warmup_request
+              ? 0.0
+              : (warmup_delay +
+                 request_delay * (total_num_requests - num_warmup_requests));
+
+      GenerationRequest inference_req(text,             // prompt
+                                      -1.0,             // slo_ratio
+                                      emission_time_ms, // emission_time_ms
+                                      add_special_tokens);
+
+      requests.push_back(inference_req);
+      timestamps.push_back(emission_time_ms);
+      ratios.push_back(1.0);
+      total_num_requests++;
+
+      if (verbose) {
+        break;
+      }
+    }
+    TraceEmissionMachine emission_machine(timestamps, ratios);
+    std::vector<GenerationResult> result =
+        model.generate(requests, emission_machine);
+    assert(result.size() == requests.size());
+    assert(result.size() == total_num_requests);
+    assert(result.size() == partition["eval_entries"].size());
+    int i = 0;
+    for (auto &entry : partition["eval_entries"]) {
+      entry["original_response"] = entry["response"];
+      entry["original_response_length"] = entry["response_length"];
+      std::string ff_out = result[i].output_text;
+      int tot_length = result[i].output_text.length();
+      entry["response"] = ff_out;
+      entry["response_length"] = result[i].output_tokens.size();
+      entry["specinfer_decoding_steps"] = result[i].decoding_steps;
+      i++;
+    }
+
+    // Write the modified JSON to a file
+    std::ofstream output_file(file_paths.trace_output_path);
+    if (output_file.is_open()) {
+      output_file << j.dump(2);
+      output_file.close();
+      std::cout << "Modified JSON has been saved to "
+                << file_paths.trace_output_path << std::endl;
+    } else {
+      std::cerr << "Unable to open file for writing." << std::endl;
+    }
+  }
+
+  // terminate the request manager by stopping the background thread
+  rm->terminate_background_server();
+
+  std::string header =
+      "llm,partition,max_requests_per_batch,max_tokens_per_"
+      "batch,request_per_second,is_warmup_request,request_guid,"
+      "request_step_idx,timestamp,num_generated_tokens";
+  // csv filepath
+  // create csv filepath and add header if it doesn't exist
+
+  bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path);
+  if (!csv_file_exists) {
+    // Create new file and write header
+    std::ofstream file(file_paths.csv_file_path);
+    if (!file.is_open()) {
+      std::cerr << "Failed to open file: " << file_paths.csv_file_path
+                << std::endl;
+      assert(false);
+    }
+    file << header << "\n";
+    file.close();
+  }
+
+  // Append the new row
+  std::ofstream file(file_paths.csv_file_path, std::ios::app);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open file: " << file_paths.csv_file_path
+              << std::endl;
+  }
+
+  std::vector<NewProfileInfo> new_profiling_info = rm->get_new_profiling_info();
+  for (auto const &info : new_profiling_info) {
+    file << llm_model_name + ",";
+    file << target_partition + ",";
+    file << std::to_string(max_requests_per_batch) + ",";
+    file << std::to_string(max_tokens_per_batch) + ",";
+    file << std::to_string(request_per_second) + ",";
+    bool is_warmup_request =
+        (info.request_guid - 1000000) < num_warmup_requests;
+    file << std::to_string(is_warmup_request) + ",";
+    file << info.request_guid << "," << info.request_step_idx << ","
+         << info.timestamp << "," << info.num_generated_tokens << "\n";
+  }
+  file.close();
+
+  // Execution fence
+  {
+    Future future = runtime->issue_execution_fence(ctx);
+    future.get_void_result();
+  }
+
+  // float* data
+  std::cout << "----------inference finished--------------" << std::endl;
+
+  // free tokenizer space in memory
+}
+
+void FlexFlow::register_custom_tasks() {}
diff --git a/inference/simplified_infer/specinfer.cc b/inference/simplified_infer/specinfer.cc
new file mode 100644
index 000000000..58f302075
--- /dev/null
+++ b/inference/simplified_infer/specinfer.cc
@@ -0,0 +1,692 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/inference.h"
+#include "flexflow/request_manager.h"
+#include "models/falcon.h"
+#include "models/llama.h"
+#include "models/mpt.h"
+#include "models/opt.h"
+#include <cassert>
+#include <filesystem>
+#include <string>
+#include <wordexp.h>
+
+using namespace FlexFlow;
+using namespace Legion;
+using RequestGuid = BatchConfig::RequestGuid;
+
+Legion::Logger log_app("llama");
+
+struct FilePaths {
+  std::string cache_folder_path;
+  std::string trace_file_path;
+  std::string trace_output_path;
+  std::string log_file_path;
+  std::string csv_file_path;
+};
+
+struct ModelNames {
+  std::string llm_model_name;
+  std::vector<std::string> ssm_model_names;
+};
+
+struct ModelMeta {
+  ModelNames model_names;
+
+  ModelType llm_model_type;
+  std::string llm_tokenizer_path;
+  std::string llm_weights_path;
+  std::string llm_model_config_path;
+
+  int bos_token_id;
+  std::vector<int> eos_token_ids;
+
+  std::vector<ModelType> ssm_model_types;
+  std::vector<std::string> ssm_model_config_paths;
+  std::vector<std::string> ssm_model_weights_paths;
+};
+
+void parse_input_args(char **argv,
+                      int argc,
+                      FilePaths &paths,
+                      ModelNames &model_names,
+                      bool &use_full_precision,
+                      bool &verbose,
+                      int &ssm_tp_degree,
+                      int &max_requests_per_batch,
+                      int &max_tokens_per_batch,
+                      int &max_sequence_length,
+                      int &max_output_length,
+                      int &max_tree_width,
+                      int &max_tree_depth,
+                      int &expansion_degree,
+                      bool &do_sample,
+                      int &request_per_second,
+                      bool &add_special_tokens,
+                      std::string &target_partition) {
+  for (int i = 1; i < argc; i++) {
+    // llm model name
+    if (!strcmp(argv[i], "-llm-model")) {
+      model_names.llm_model_name = std::string(argv[++i]);
+      for (char &c : model_names.llm_model_name) {
+        c = std::tolower(c);
+      }
+      continue;
+    }
+    // ssm models names
+    if (!strcmp(argv[i], "-ssm-model")) {
+      std::string ssm_model_name = std::string(argv[++i]);
+      for (char &c : ssm_model_name) {
+        c = std::tolower(c);
+      }
+      model_names.ssm_model_names.push_back(ssm_model_name);
+      continue;
+    }
+    if (!strcmp(argv[i], "-ssm-tp-degree")) {
+      ssm_tp_degree = std::stoi(argv[++i]);
+      continue;
+    }
+    // cache folder
+    if (!strcmp(argv[i], "-cache-folder")) {
+      paths.cache_folder_path = std::string(argv[++i]);
+      continue;
+    }
+    // trace
+    if (!strcmp(argv[i], "-trace")) {
+      paths.trace_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-trace-output-path")) {
+      paths.trace_output_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-target-partition")) {
+      target_partition = std::string(argv[++i]);
+      continue;
+    }
+    // output file
+    if (!strcmp(argv[i], "-log-output-path")) {
+      paths.log_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "-csv-output-path")) {
+      paths.csv_file_path = std::string(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--use-full-precision")) {
+      use_full_precision = true;
+      continue;
+    }
+    // verbose logging to stdout
+    if (!strcmp(argv[i], "--verbose")) {
+      verbose = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-requests-per-batch")) {
+      max_requests_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tokens-per-batch")) {
+      max_tokens_per_batch = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-sequence-length")) {
+      max_sequence_length = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-output-length")) {
+      max_output_length = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tree-width")) {
+      max_tree_width = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--max-tree-depth")) {
+      max_tree_depth = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--expansion-degree")) {
+      expansion_degree = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--do-sample")) {
+      do_sample = true;
+      continue;
+    }
+    if (!strcmp(argv[i], "--request-per-second")) {
+      request_per_second = std::stoi(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--add-special-tokens")) {
+      add_special_tokens = true;
+      continue;
+    }
+  }
+  if (paths.cache_folder_path.empty()) {
+    char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
+    paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path)
+                                            : std::string("~/.cache/flexflow");
+  }
+  // Expand ~ to the home directory if needed
+  wordexp_t p;
+  wordexp(paths.cache_folder_path.c_str(), &p, 0);
+  paths.cache_folder_path = p.we_wordv[0];
+  wordfree(&p);
+}
+
+void get_model_meta(FilePaths &file_paths,
+                    ModelMeta &model_metadata,
+                    bool use_full_precision) {
+  if (model_metadata.model_names.llm_model_name.empty() ||
+      model_metadata.model_names.ssm_model_names.size() == 0) {
+    assert(false && "SpecInfer needs at least one LLM and one SSM for "
+                    "speculative inference");
+  }
+  model_metadata.llm_model_config_path =
+      join_path({file_paths.cache_folder_path,
+                 "configs",
+                 model_metadata.model_names.llm_model_name,
+                 "config.json"});
+  model_metadata.llm_tokenizer_path =
+      join_path({file_paths.cache_folder_path,
+                 "tokenizers",
+                 model_metadata.model_names.llm_model_name});
+  model_metadata.llm_weights_path =
+      join_path({file_paths.cache_folder_path,
+                 "weights",
+                 model_metadata.model_names.llm_model_name,
+                 use_full_precision ? "full-precision" : "half-precision"});
+
+  std::ifstream llm_config_file_handle(model_metadata.llm_model_config_path);
+  if (!llm_config_file_handle.good()) {
+    std::cout << "LLM Model config file "
+              << model_metadata.llm_model_config_path << " not found."
+              << std::endl;
+    assert(false);
+  }
+  nlohmann::ordered_json llm_model_config =
+      nlohmann::ordered_json::parse(llm_config_file_handle,
+                                    /*parser_callback_t */ nullptr,
+                                    /*allow_exceptions */ true,
+                                    /*ignore_comments */ true);
+
+  model_metadata.llm_model_type = ModelType::UNKNOWN;
+  auto architectures = llm_model_config["architectures"];
+  for (auto const &str : architectures) {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "MistralForCausalLM") {
+      model_metadata.llm_model_type = ModelType::LLAMA;
+      break;
+    } else if (str == "OPTForCausalLM") {
+      model_metadata.llm_model_type = ModelType::OPT;
+      break;
+    } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") {
+      model_metadata.llm_model_type = ModelType::FALCON;
+      break;
+    } else if (str == "MPTForCausalLM") {
+      model_metadata.llm_model_type = ModelType::MPT;
+      break;
+    }
+  }
+  model_metadata.bos_token_id =
+      llm_model_config.find("bos_token_id") == llm_model_config.end()
+          ? -1
+          : (int)llm_model_config.at("bos_token_id");
+  // model_metadata.eos_token_id =
+  //     llm_model_config.find("eos_token_id") == llm_model_config.end()
+  //         ? -1
+  //         : (int)llm_model_config.at("eos_token_id");
+  if (llm_model_config.find("eos_token_id") != llm_model_config.end()) {
+    if (llm_model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : llm_model_config["eos_token_id"]) {
+        model_metadata.eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]);
+    }
+  } else {
+    model_metadata.eos_token_ids.push_back(-1);
+  }
+
+  for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) {
+    std::string ssm_config_path = join_path({file_paths.cache_folder_path,
+                                             "configs",
+                                             ssm_model_name,
+                                             "config.json"});
+    std::string ssm_tokenizer_path =
+        join_path({file_paths.cache_folder_path, "tokenizers", ssm_model_name});
+    std::string ssm_weights_path =
+        join_path({file_paths.cache_folder_path,
+                   "weights",
+                   ssm_model_name,
+                   use_full_precision ? "full-precision" : "half-precision"});
+
+    std::ifstream ssm_config_file_handle(ssm_config_path);
+    if (!ssm_config_file_handle.good()) {
+      std::cout << "SSM Model config file " << ssm_config_path << " not found."
+                << std::endl;
+      assert(false);
+    }
+    nlohmann::ordered_json ssm_model_config =
+        nlohmann::ordered_json::parse(ssm_config_file_handle,
+                                      /*parser_callback_t */ nullptr,
+                                      /*allow_exceptions */ true,
+                                      /*ignore_comments */ true);
+
+    ModelType ssm_model_type = ModelType::UNKNOWN;
+    auto architectures = ssm_model_config["architectures"];
+    for (auto const &str : architectures) {
+      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+          str == "MistralForCausalLM") {
+        ssm_model_type = ModelType::LLAMA;
+        break;
+      } else if (str == "OPTForCausalLM") {
+        ssm_model_type = ModelType::OPT;
+        break;
+      } else if (str == "RWForCausalLM") {
+        ssm_model_type = ModelType::FALCON;
+        break;
+      } else if (str == "MPTForCausalLM") {
+        ssm_model_type = ModelType::MPT;
+        break;
+      }
+    }
+    int ssm_bos_id =
+        ssm_model_config.find("bos_token_id") == ssm_model_config.end()
+            ? -1
+            : (int)ssm_model_config.at("bos_token_id");
+    // int ssm_eos_id =
+    //     ssm_model_config.find("eos_token_id") == ssm_model_config.end()
+    //         ? -1
+    //         : (int)ssm_model_config.at("eos_token_id");
+    // if (ssm_bos_id != model_metadata.bos_token_id ||
+    //     ssm_eos_id != model_metadata.eos_token_id) {
+    //   printf("Warning: bos/eos token id mismatch between LLM and one of the "
+    //          "SSMs!\n");
+    // }
+    model_metadata.ssm_model_types.push_back(ssm_model_type);
+    model_metadata.ssm_model_config_paths.push_back(ssm_config_path);
+    model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path);
+  }
+
+  assert(model_metadata.llm_model_type != ModelType::UNKNOWN &&
+         "Invalid LLM model type passed (or no type was passed).");
+
+  for (auto mt : model_metadata.ssm_model_types) {
+    if (mt == ModelType::UNKNOWN) {
+      assert(false && "One of the SSM model types passed is invalid.");
+    }
+  }
+}
+
+void FlexFlow::top_level_task(Task const *task,
+                              std::vector<PhysicalRegion> const &regions,
+                              Context ctx,
+                              Runtime *runtime) {
+  FFConfig ffconfig;
+  FilePaths file_paths;
+  ModelMeta model_metadata;
+  bool use_full_precision = false;
+  bool verbose = false;
+  int ssm_tp_degree = 1;
+  int max_requests_per_batch = 8;
+  int max_tokens_per_batch = 128;
+  int max_sequence_length = 512;
+  int max_output_length = 512;
+  int expansion_degree = 3;
+  int max_tree_depth = 8;
+  int max_tree_width = 16;
+  RequestManager::DecodingMode decoding_mode =
+      RequestManager::SPECULATIVE_DECODING;
+  bool do_sample = false;
+  int sampling_seed = 0;
+  int request_per_second = -1;
+  int num_warmup_requests = 0;
+  double warmup_delay = 15.0;
+  bool add_special_tokens = false;
+  std::string target_partition = "FEATURE_EXTRACTION";
+
+  InputArgs const &command_args = HighLevelRuntime::get_input_args();
+  char **argv = command_args.argv;
+  int argc = command_args.argc;
+  parse_input_args(argv,
+                   argc,
+                   file_paths,
+                   model_metadata.model_names,
+                   use_full_precision,
+                   verbose,
+                   ssm_tp_degree,
+                   max_requests_per_batch,
+                   max_tokens_per_batch,
+                   max_sequence_length,
+                   max_output_length,
+                   max_tree_width,
+                   max_tree_depth,
+                   expansion_degree,
+                   do_sample,
+                   request_per_second,
+                   add_special_tokens,
+                   target_partition);
+
+  get_model_meta(file_paths, model_metadata, use_full_precision);
+
+  assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
+             ffconfig.pipeline_parallelism_degree ==
+         ffconfig.numNodes * ffconfig.workersPerNode);
+  assert(ssm_tp_degree >= 1 &&
+         ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode);
+
+  std::ifstream input_file(file_paths.trace_file_path);
+  assert(input_file.good() && "Prompt file does not exist.");
+  nlohmann::ordered_json j = nlohmann::ordered_json::parse(input_file);
+  input_file.close();
+
+  // Find the partition with name "FEATURE_EXTRACTION"
+  auto &partitions = j["partitions"];
+  auto it =
+      std::find_if(partitions.begin(),
+                   partitions.end(),
+                   [target_partition](nlohmann::ordered_json const &partition) {
+                     return partition["partition_name"] == target_partition;
+                   });
+  nlohmann::ordered_json &partition = *it;
+  if (it == partitions.end()) {
+    std::cerr << "Partition " << target_partition
+              << " not found in the trace file." << std::endl;
+    assert(false);
+  }
+  // check that the max prompt + response length sum in the eval_entries in the
+  // partition does not exceed the max_sequence_length
+  int max_prompt_response_length = 0;
+  for (auto &eval_entry : partition["eval_entries"]) {
+    int prompt_length = eval_entry["prompt_length"];
+    int response_length = eval_entry["response_length"];
+    if (response_length >= max_output_length) {
+      std::cerr << "Error: A response length from the targt partition in the "
+                   "dataset (="
+                << response_length
+                << ") exceeds the max_output_length(=" << max_output_length
+                << ")." << std::endl;
+      assert(false);
+    }
+    max_prompt_response_length =
+        std::max(max_prompt_response_length, prompt_length + response_length);
+  }
+  if (max_prompt_response_length >= max_sequence_length) {
+    std::cerr << "Error: max prompt + response length sum (="
+              << max_prompt_response_length
+              << ") in the eval_entries in the partition exceeds the "
+                 "max_sequence_length(="
+              << max_sequence_length << ")." << std::endl;
+    assert(false);
+  }
+
+  // Sanity check for SpecInfer old version
+  assert(max_tree_depth <= 8);
+  assert(max_tree_width >= 3);
+  // Total verified tokens
+  assert(max_tokens_per_batch >= max_requests_per_batch * 21);
+
+  // Create SentencePiece tokenizer or OPT tokenizer
+  srand(sampling_seed);
+  GenerationConfig generationConfig(do_sample, 0.8, 0.6, false, 16);
+  InferenceManager *im = InferenceManager::get_inference_manager();
+  RequestManager *rm = RequestManager::get_request_manager();
+  rm->set_max_requests_per_batch(max_requests_per_batch);
+  rm->set_max_tokens_per_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_ssm_batch(max_tokens_per_batch);
+  rm->set_max_tokens_per_prefilling_batch(max_tokens_per_batch);
+  rm->set_max_sequence_length(max_sequence_length);
+  rm->set_max_output_length(max_output_length);
+  rm->set_max_tree_depth(max_tree_depth);
+  rm->set_max_tree_width(max_tree_width);
+  rm->set_expansion_degree(expansion_degree);
+  rm->set_verbose(verbose);
+  rm->set_streaming_cache(false);
+  rm->register_tokenizer(model_metadata.llm_model_type,
+                         model_metadata.bos_token_id,
+                         model_metadata.eos_token_ids,
+                         model_metadata.llm_tokenizer_path);
+  rm->set_decoding_mode(decoding_mode);
+  rm->set_slo_violation_early_termination(false);
+  rm->set_baseline_latency(50);
+  rm->set_ssm_spec_latency(20);
+  rm->set_llm_verify_latency(50);
+  rm->set_spec_infer_old_version(true);
+  rm->set_greedy_schedule(false);
+  rm->set_equal_schedule(false);
+  rm->register_output_filepath(file_paths.log_file_path);
+
+  // Create LLM model
+  FFModel tree_model(ffconfig, ffconfig.cpu_offload);
+  if (model_metadata.llm_model_type == ModelType::LLAMA) {
+    LLAMA::create_llama_model(tree_model,
+                              model_metadata.llm_model_config_path,
+                              model_metadata.llm_weights_path,
+                              TREE_VERIFY_MODE,
+                              generationConfig,
+                              false,
+                              use_full_precision);
+  } else if (model_metadata.llm_model_type == ModelType::OPT) {
+    OPT::create_opt_model(tree_model,
+                          model_metadata.llm_model_config_path,
+                          model_metadata.llm_weights_path,
+                          TREE_VERIFY_MODE,
+                          use_full_precision);
+  } else if (model_metadata.llm_model_type == ModelType::FALCON) {
+    FALCON::create_falcon_model(tree_model,
+                                model_metadata.llm_model_config_path,
+                                model_metadata.llm_weights_path,
+                                TREE_VERIFY_MODE,
+                                use_full_precision);
+  } else if (model_metadata.llm_model_type == ModelType::MPT) {
+    MPT::create_mpt_model(tree_model,
+                          model_metadata.llm_model_config_path,
+                          model_metadata.llm_weights_path,
+                          TREE_VERIFY_MODE,
+                          generationConfig,
+                          use_full_precision);
+  } else {
+    assert(false && "Invalid LLM model type passed (or no type was passed).");
+  }
+
+  // Create SSM models
+  int num_ssms = model_metadata.ssm_model_types.size();
+  std::vector<int> ssm_model_ids;
+  std::vector<FFModel> ssm_models;
+  FFConfig bm_config = ffconfig;
+  std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl;
+  // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree =
+  //     bm_config.pipeline_parallelism_degree = 1;
+  bm_config.data_parallelism_degree = 1;
+  bm_config.tensor_parallelism_degree = ssm_tp_degree;
+  bm_config.pipeline_parallelism_degree = 1;
+  for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
+    FFModel beam_model(bm_config);
+    ssm_models.push_back(beam_model);
+  }
+
+  for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
+    FFModel &beam_model = ssm_models[ssm_id];
+    if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) {
+      LLAMA::create_llama_model(beam_model,
+                                model_metadata.ssm_model_config_paths[ssm_id],
+                                model_metadata.ssm_model_weights_paths[ssm_id],
+                                TREE_SEARCH_MODE,
+                                generationConfig,
+                                false,
+                                use_full_precision);
+    } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::OPT) {
+      OPT::create_opt_model(beam_model,
+                            model_metadata.ssm_model_config_paths[ssm_id],
+                            model_metadata.ssm_model_weights_paths[ssm_id],
+                            TREE_SEARCH_MODE,
+                            use_full_precision);
+    } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::FALCON) {
+      FALCON::create_falcon_model(
+          beam_model,
+          model_metadata.ssm_model_config_paths[ssm_id],
+          model_metadata.ssm_model_weights_paths[ssm_id],
+          TREE_SEARCH_MODE,
+          use_full_precision);
+    } else if (model_metadata.ssm_model_types[ssm_id] == ModelType::MPT) {
+      MPT::create_mpt_model(beam_model,
+                            model_metadata.ssm_model_config_paths[ssm_id],
+                            model_metadata.ssm_model_weights_paths[ssm_id],
+                            TREE_SEARCH_MODE,
+                            generationConfig,
+                            use_full_precision);
+    } else {
+      assert(false && "Invalid SSM model type passed.");
+    }
+
+    rm->register_ssm_model(&beam_model);
+  }
+
+  rm->start_background_server(&tree_model);
+
+  int total_num_requests = 0;
+  {
+    // Iterate through eval_entries
+    std::vector<GenerationRequest> requests;
+    std::vector<double> timestamps, ratios;
+    if (partition.contains("num_warmup_requests")) {
+      num_warmup_requests = partition["num_warmup_requests"];
+    }
+    for (auto &entry : partition["eval_entries"]) {
+      std::string text = entry["prompt"];
+      int max_new_tokens_ = entry["response_length"];
+
+      bool is_warmup_request = total_num_requests < num_warmup_requests;
+      double request_delay =
+          1000.0 *
+          (request_per_second > 0 ? (1.0 / (double)request_per_second) : 0);
+      double emission_time_ms =
+          is_warmup_request
+              ? 0.0
+              : (warmup_delay +
+                 request_delay * (total_num_requests - num_warmup_requests));
+
+      GenerationRequest inference_req(text,             // prompt
+                                      -1.0,             // slo_ratio
+                                      emission_time_ms, // emission_time_ms
+                                      add_special_tokens);
+      requests.push_back(inference_req);
+      timestamps.push_back(emission_time_ms);
+      ratios.push_back(1.0);
+      total_num_requests++;
+
+      if (verbose) {
+        break;
+      }
+    }
+    TraceEmissionMachine emission_machine(timestamps, ratios);
+    std::vector<GenerationResult> result =
+        tree_model.generate(requests, emission_machine);
+    assert(result.size() == requests.size());
+    assert(result.size() == total_num_requests);
+    assert(result.size() == partition["eval_entries"].size());
+    int i = 0;
+    for (auto &entry : partition["eval_entries"]) {
+      entry["original_response"] = entry["response"];
+      entry["original_response_length"] = entry["response_length"];
+      std::string ff_out = result[i].output_text;
+      int tot_length = result[i].output_text.length();
+      entry["response"] = ff_out;
+      entry["response_length"] = result[i].output_tokens.size();
+      entry["specinfer_decoding_steps"] = result[i].decoding_steps;
+      i++;
+    }
+
+    // Write the modified JSON to a file
+    std::ofstream output_file(file_paths.trace_output_path);
+    if (output_file.is_open()) {
+      output_file << j.dump(2);
+      output_file.close();
+      std::cout << "Modified JSON has been saved to "
+                << file_paths.trace_output_path << std::endl;
+    } else {
+      std::cerr << "Unable to open file for writing." << std::endl;
+    }
+  }
+
+  // terminate the request manager by stopping the background thread
+  rm->terminate_background_server();
+
+  std::string header =
+      "llm,ssm,partition,expansion_degree,max_tree_depth,max_tree_width,max_"
+      "requests_per_batch,max_tokens_per_batch,request_per_second,is_warmup_"
+      "request,request_guid,"
+      "request_step_idx,"
+      "timestamp,speculation_start_timestamp,speculation_end_timestamp,num_"
+      "speculated_tokens,num_accepted_tokens,num_generated_tokens";
+  // csv filepath
+  // create csv filepath and add header if it doesn't exist
+
+  bool csv_file_exists = std::filesystem::exists(file_paths.csv_file_path);
+  if (!csv_file_exists) {
+    // Create new file and write header
+    std::ofstream file(file_paths.csv_file_path);
+    if (!file.is_open()) {
+      std::cerr << "Failed to open file: " << file_paths.csv_file_path
+                << std::endl;
+      assert(false);
+    }
+    file << header << "\n";
+    file.close();
+  }
+
+  // Append the new row
+  std::ofstream file(file_paths.csv_file_path, std::ios::app);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open file: " << file_paths.csv_file_path
+              << std::endl;
+  }
+
+  std::vector<NewProfileInfo> new_profiling_info = rm->get_new_profiling_info();
+  for (auto const &info : new_profiling_info) {
+    file << model_metadata.model_names.llm_model_name + ",";
+    file << model_metadata.model_names.ssm_model_names[0] + ",";
+    file << target_partition + ",";
+    file << std::to_string(expansion_degree) + ",";
+    file << std::to_string(max_tree_depth) + ",";
+    file << std::to_string(max_tree_width) + ",";
+    file << std::to_string(max_requests_per_batch) + ",";
+    file << std::to_string(max_tokens_per_batch) + ",";
+    file << std::to_string(request_per_second) + ",";
+    bool is_warmup_request =
+        (info.request_guid - 1000000) < num_warmup_requests;
+    file << std::to_string(is_warmup_request) + ",";
+    file << info.request_guid << "," << info.request_step_idx << ","
+         << info.timestamp << "," << info.speculation_start_timestamp << ","
+         << info.speculation_end_timestamp << "," << info.num_speculated_tokens
+         << "," << info.num_accepted_tokens << "," << info.num_generated_tokens
+         << "\n";
+  }
+  file.close();
+
+  // Execution fence
+  {
+    Future future = runtime->issue_execution_fence(ctx);
+    future.get_void_result();
+  }
+
+  // float* data
+  std::cout << "----------inference finished--------------" << std::endl;
+}
+
+void FlexFlow::register_custom_tasks() {}
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index cde24f7b2..6be63c7fb 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -49,7 +49,8 @@ struct ModelMeta {
   std::string llm_weights_path;
   std::string llm_model_config_path;
 
-  int bos_token_id, eos_token_id;
+  int bos_token_id;
+  std::vector<int> eos_token_ids;
 
   std::vector<ModelType> ssm_model_types;
   std::vector<std::string> ssm_model_config_paths;
@@ -276,7 +277,8 @@ void get_model_meta(FilePaths &file_paths,
   model_metadata.llm_model_type = ModelType::UNKNOWN;
   auto architectures = llm_model_config["architectures"];
   for (auto const &str : architectures) {
-    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "MistralForCausalLM") {
       model_metadata.llm_model_type = ModelType::LLAMA;
       break;
     } else if (str == "OPTForCausalLM") {
@@ -294,10 +296,21 @@ void get_model_meta(FilePaths &file_paths,
       llm_model_config.find("bos_token_id") == llm_model_config.end()
           ? -1
           : (int)llm_model_config.at("bos_token_id");
-  model_metadata.eos_token_id =
-      llm_model_config.find("eos_token_id") == llm_model_config.end()
-          ? -1
-          : (int)llm_model_config.at("eos_token_id");
+  // model_metadata.eos_token_id =
+  //     llm_model_config.find("eos_token_id") == llm_model_config.end()
+  //         ? -1
+  //         : (int)llm_model_config.at("eos_token_id");
+  if (llm_model_config.find("eos_token_id") != llm_model_config.end()) {
+    if (llm_model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : llm_model_config["eos_token_id"]) {
+        model_metadata.eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]);
+    }
+  } else {
+    model_metadata.eos_token_ids.push_back(-1);
+  }
 
   for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) {
     std::string ssm_config_path = join_path({file_paths.cache_folder_path,
@@ -326,7 +339,8 @@ void get_model_meta(FilePaths &file_paths,
     ModelType ssm_model_type = ModelType::UNKNOWN;
     auto architectures = ssm_model_config["architectures"];
     for (auto const &str : architectures) {
-      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+          str == "MistralForCausalLM") {
         ssm_model_type = ModelType::LLAMA;
         break;
       } else if (str == "OPTForCausalLM") {
@@ -344,15 +358,15 @@ void get_model_meta(FilePaths &file_paths,
         ssm_model_config.find("bos_token_id") == ssm_model_config.end()
             ? -1
             : (int)ssm_model_config.at("bos_token_id");
-    int ssm_eos_id =
-        ssm_model_config.find("eos_token_id") == ssm_model_config.end()
-            ? -1
-            : (int)ssm_model_config.at("eos_token_id");
-    if (ssm_bos_id != model_metadata.bos_token_id ||
-        ssm_eos_id != model_metadata.eos_token_id) {
-      printf("Warning: bos/eos token id mismatch between LLM and one of the "
-             "SSMs!\n");
-    }
+    // int ssm_eos_id =
+    //     ssm_model_config.find("eos_token_id") == ssm_model_config.end()
+    //         ? -1
+    //         : (int)ssm_model_config.at("eos_token_id");
+    // if (ssm_bos_id != model_metadata.bos_token_id ||
+    //     ssm_eos_id != model_metadata.eos_token_id) {
+    //   printf("Warning: bos/eos token id mismatch between LLM and one of the "
+    //          "SSMs!\n");
+    // }
     model_metadata.ssm_model_types.push_back(ssm_model_type);
     model_metadata.ssm_model_config_paths.push_back(ssm_config_path);
     model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path);
@@ -473,7 +487,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
-                         model_metadata.eos_token_id,
+                         model_metadata.eos_token_ids,
                          model_metadata.llm_tokenizer_path);
   rm->set_decoding_mode(decoding_mode);
   rm->set_slo_violation_early_termination(slo_attainment_early_termination);
diff --git a/inference/trace_generator/trace_generator.cc b/inference/trace_generator/trace_generator.cc
index c45c0537f..14abf5976 100644
--- a/inference/trace_generator/trace_generator.cc
+++ b/inference/trace_generator/trace_generator.cc
@@ -58,7 +58,8 @@ struct ModelMeta {
   std::string llm_weights_path;
   std::string llm_model_config_path;
 
-  int bos_token_id, eos_token_id;
+  int bos_token_id;
+  std::vector<int> eos_token_ids;
 
   std::vector<ModelType> ssm_model_types;
   std::vector<std::string> ssm_model_config_paths;
@@ -211,7 +212,8 @@ void get_model_meta(FilePaths &file_paths,
   model_metadata.llm_model_type = ModelType::UNKNOWN;
   auto architectures = llm_model_config["architectures"];
   for (auto const &str : architectures) {
-    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+    if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+        str == "MistralForCausalLM") {
       model_metadata.llm_model_type = ModelType::LLAMA;
       break;
     } else if (str == "OPTForCausalLM") {
@@ -229,10 +231,21 @@ void get_model_meta(FilePaths &file_paths,
       llm_model_config.find("bos_token_id") == llm_model_config.end()
           ? -1
           : (int)llm_model_config.at("bos_token_id");
-  model_metadata.eos_token_id =
-      llm_model_config.find("eos_token_id") == llm_model_config.end()
-          ? -1
-          : (int)llm_model_config.at("eos_token_id");
+  // model_metadata.eos_token_id =
+  //     llm_model_config.find("eos_token_id") == llm_model_config.end()
+  //         ? -1
+  //         : (int)llm_model_config.at("eos_token_id");
+  if (llm_model_config.find("eos_token_id") != llm_model_config.end()) {
+    if (llm_model_config["eos_token_id"].is_array()) {
+      for (auto &eos_token_id : llm_model_config["eos_token_id"]) {
+        model_metadata.eos_token_ids.push_back(eos_token_id);
+      }
+    } else {
+      model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]);
+    }
+  } else {
+    model_metadata.eos_token_ids.push_back(-1);
+  }
 
   for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) {
     std::string ssm_config_path = join_path({file_paths.cache_folder_path,
@@ -261,7 +274,8 @@ void get_model_meta(FilePaths &file_paths,
     ModelType ssm_model_type = ModelType::UNKNOWN;
     auto architectures = ssm_model_config["architectures"];
     for (auto const &str : architectures) {
-      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") {
+      if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM" ||
+          str == "MistralForCausalLM") {
         ssm_model_type = ModelType::LLAMA;
         break;
       } else if (str == "OPTForCausalLM") {
@@ -279,15 +293,15 @@ void get_model_meta(FilePaths &file_paths,
         ssm_model_config.find("bos_token_id") == ssm_model_config.end()
             ? -1
             : (int)ssm_model_config.at("bos_token_id");
-    int ssm_eos_id =
-        ssm_model_config.find("eos_token_id") == ssm_model_config.end()
-            ? -1
-            : (int)ssm_model_config.at("eos_token_id");
-    if (ssm_bos_id != model_metadata.bos_token_id ||
-        ssm_eos_id != model_metadata.eos_token_id) {
-      printf("Warning: bos/eos token id mismatch between LLM and one of the "
-             "SSMs!\n");
-    }
+    // int ssm_eos_id =
+    //     ssm_model_config.find("eos_token_id") == ssm_model_config.end()
+    //         ? -1
+    //         : (int)ssm_model_config.at("eos_token_id");
+    // if (ssm_bos_id != model_metadata.bos_token_id ||
+    //     ssm_eos_id != model_metadata.eos_token_id) {
+    //   printf("Warning: bos/eos token id mismatch between LLM and one of the "
+    //          "SSMs!\n");
+    // }
     model_metadata.ssm_model_types.push_back(ssm_model_type);
     model_metadata.ssm_model_config_paths.push_back(ssm_config_path);
     model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path);
@@ -397,7 +411,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_streaming_cache(streaming_cache);
   rm->register_tokenizer(model_metadata.llm_model_type,
                          model_metadata.bos_token_id,
-                         model_metadata.eos_token_id,
+                         model_metadata.eos_token_ids,
                          model_metadata.llm_tokenizer_path);
   rm->set_decoding_mode(decoding_mode);
   rm->set_slo_violation_early_termination(slo_attainment_early_termination);
diff --git a/inference/utils/mem_analysis.py b/inference/utils/mem_analysis.py
new file mode 100644
index 000000000..5168e7003
--- /dev/null
+++ b/inference/utils/mem_analysis.py
@@ -0,0 +1,115 @@
+import pandas as pd
+import re, os, math, argparse
+
+# Usage:
+# Run FlexFlow code with --log-instance-creation flag and redirect the output to a file
+# python mem_analysis.py --file_path /path/to/log_file.txt
+
+def extract_data(file_path):
+    # Define regex patterns
+    memory_allocator_pattern = re.compile(r'MemoryAllocator.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)')
+    mapper_pattern = re.compile(r'Mapper.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task: (.+)')
+    parallel_tensor_pattern = re.compile(r'ParallelTensor.*memory_kind: (\w+).*memory_id: (\w+).*size: (\d+).*capacity (\d+).*task_name: (.+)')
+
+    # Initialize lists to store extracted data
+    memory_kinds = []
+    memory_ids = []
+    sizes = []
+    capacities = []
+    tasks = []
+
+    # Read the file
+    with open(file_path, 'r') as file:
+        for line in file:
+            if 'MemoryAllocator' in line:
+                match = memory_allocator_pattern.search(line)
+                if match:
+                    memory_kinds.append(match.group(1))
+                    memory_ids.append(match.group(2))
+                    sizes.append(int(match.group(3)))
+                    capacities.append(int(match.group(4)))
+                    tasks.append(match.group(5))
+            elif 'Mapper' in line:
+                match = mapper_pattern.search(line)
+                if match:
+                    memory_kinds.append(match.group(1))
+                    memory_ids.append(match.group(2))
+                    sizes.append(int(match.group(3)))
+                    capacities.append(int(match.group(4)))
+                    tasks.append(match.group(5))
+            elif 'ParallelTensor' in line:
+                match = parallel_tensor_pattern.search(line)
+                if match:
+                    memory_kinds.append(match.group(1))
+                    memory_ids.append(match.group(2))
+                    sizes.append(int(match.group(3)))
+                    capacities.append(int(match.group(4)))
+                    tasks.append(match.group(5))
+
+    # Create a DataFrame
+    df = pd.DataFrame({
+        'Memory Kind': memory_kinds,
+        'Device ID': memory_ids,
+        'Size': sizes,
+        'Capacity': capacities,
+        'Task': tasks
+    })
+
+    return df
+
+def human_readable_size(size_bytes):
+    if size_bytes == 0:
+        return "0B"
+    size_name = ("B", "KB", "MB", "GB", "TB")
+    i = int(math.floor(math.log(size_bytes, 1000)))
+    p = math.pow(1000, i)
+    s = round(size_bytes / p, 2)
+    return f"{s} {size_name[i]}"
+
+def print_grouped_by_device(df):
+    grouped_df = df.groupby(['Memory Kind', 'Device ID']).agg({'Size': 'sum', 'Capacity': 'first'})
+    # Check that all entries that share the same memory id have the same capacity
+    for (memory_kind, memory_id), group in df.groupby(['Memory Kind', 'Device ID']):
+        capacities = group['Capacity'].unique()
+        if len(capacities) > 1:
+            print(f"Warning: Device ID {memory_id} in Memory Kind {memory_kind} has multiple capacities: {capacities}")
+    # Convert sizes to human-readable format
+    grouped_df['Size'] = grouped_df['Size'].apply(human_readable_size)
+    grouped_df['Capacity'] = grouped_df['Capacity'].apply(human_readable_size)
+    print("############## Memory usage (by device) ##############")
+    print(grouped_df)
+
+def print_grouped_by_task(df):
+    # Group by 'Memory Kind', 'Device ID', and 'Task', and sum the 'Size' column
+    task_grouped_df = df.groupby(['Memory Kind', 'Device ID', 'Task']).agg({'Size': 'sum'}).reset_index()
+    # Sort the DataFrame by 'Memory Kind', 'Device ID', and 'Size' in descending order
+    task_grouped_df = task_grouped_df.sort_values(by=['Memory Kind', 'Device ID', 'Size'], ascending=[True, True, False])
+    print("\n\n############## Memory usage (by task) ##############")
+    for (memory_kind, memory_id), group in task_grouped_df.groupby(['Memory Kind', 'Device ID']):
+        print("\n-------------------------------------------------------------")
+        print(f"Memory Kind: {memory_kind}, Device ID: {memory_id}")
+        group['Size'] = group['Size'].apply(human_readable_size)
+        print(group[['Task', 'Size']].to_string(index=False))
+        print("-------------------------------------------------------------")
+
+def print_notes():
+    print("\n\n############## Notes ##############")
+    print("* Check that each GPU retains enough capacity in GPU_FB_MEM to hold the weights from Z_COPY_MEM (total size / tp_degree)")
+    print("* Check whether the memory usage is balanced across devices")
+    print("* `set_tensor` generally refers to the memory used to load the model weights")
+    print()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Analyze memory usage from a FlexFlow log file.')
+    parser.add_argument('--file_path', '-fp', type=str, help='Path to the input log file')
+    args = parser.parse_args()
+
+    # Change working directory to the directory holding the script
+    # script_dir = os.path.dirname(os.path.abspath(__file__))
+    # os.chdir(script_dir)
+    
+    df = extract_data(args.file_path)
+    print_grouped_by_device(df)
+    print_grouped_by_task(df)
+
+    print_notes()
\ No newline at end of file
diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py
index 2820cf485..24bb15889 100644
--- a/python/flexflow/core/__init__.py
+++ b/python/flexflow/core/__init__.py
@@ -40,6 +40,7 @@
     "zero_copy_memory_per_node": "-ll:zsize",
     "num_cpus": "-ll:cpu",
     "legion_utility_processors": "-ll:util",
+    "log_instance_creation": "--log-instance-creation",
     "profiling": "--profiling",
     "benchmarking": "--benchmarking",
     "inference_debugging": "--inference-debugging",
diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py
index 0e59bb547..e58ed57bc 100644
--- a/python/flexflow/serve/models/llama.py
+++ b/python/flexflow/serve/models/llama.py
@@ -280,3 +280,7 @@ def convert_hf_model(model, dst_folder):
                 .replace("model_", "")
             )
             params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}")
+        # LM head weight
+        model.lm_head.weight.detach().cpu().numpy().tofile(
+            os.path.join(dst_folder, "output_weight")
+        )
diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py
index 58c9dc9aa..37606e875 100644
--- a/python/flexflow/serve/serve.py
+++ b/python/flexflow/serve/serve.py
@@ -28,7 +28,6 @@
 )
 from flexflow.core import *
 from transformers import AutoConfig, AutoModelForCausalLM
-from peft import PeftModel, PeftConfig, LoraConfig
 from huggingface_hub import HfApi
 import sys, torch, shutil, hashlib
 from typing import Union, List
@@ -96,6 +95,7 @@ def __init__(
         self.supported_models = {
             "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig),
             "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig),
+            "MistralForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig),
             "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig),
             "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig),
             "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig),
@@ -272,7 +272,7 @@ def download_hf_tokenizer_if_needed(self):
                 f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..."
             )
             # Load/download the tokenizer files
-            target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt"]
+            target_tokenizer_files = ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.json", "merges.txt", "tokenizer.model"]
             if os.path.exists(self.model_name):
                 hf_tokenizer_path = self.model_name
             else:
diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc
index 39d7b1ec5..1ac9a511d 100644
--- a/src/c/flexflow_c.cc
+++ b/src/c/flexflow_c.cc
@@ -2711,7 +2711,7 @@ void flexflow_request_manager_register_tokenizer(
          "Cannot convert nullptr char * to std::string");
   std::string const tokenizer_filepath_str(tokenizer_filepath);
   handle->register_tokenizer(
-      model_type, bos_token_id, eos_token_id, tokenizer_filepath_str);
+      model_type, bos_token_id, {eos_token_id}, tokenizer_filepath_str);
   DEBUG_PRINT(
       "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath);
 }
@@ -2831,7 +2831,6 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
                                             flexflow_model_t model_handle_) {
   FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_);
   FFModel *model = FFCObjectWrapper::unwrap(model_handle_);
-  // handle->load_weights(model);
   Context ctx = model->config.lg_ctx;
   Runtime *runtime = model->config.lg_hlr;
   handle->load_weights_parallel(model, ctx, runtime);
diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc
index 5314e9dfe..38127a1cf 100644
--- a/src/mapper/mapper.cc
+++ b/src/mapper/mapper.cc
@@ -292,9 +292,7 @@ void FFMapper::select_task_options(MapperContext const ctx,
     output.initial_proc = all_cpus[0];
     return;
   }
-  if ((task.task_id == LOAD_FLOAT_WEIGHT_TASK_ID) ||
-      (task.task_id == LOAD_HALF_WEIGHT_TASK_ID) ||
-      (task.task_id == LOAD_QUANT_WEIGHT_TASK_ID)) {
+  if (task.task_id == LOAD_WEIGHT_TASK_ID) {
     output.initial_proc = all_cpus[0];
     return;
   }
@@ -655,17 +653,18 @@ void FFMapper::map_task(MapperContext const ctx,
                                task.regions[idx],
                                created,
                                &footprint)) {
-      if (log_instance_creation) {
-        for (size_t idx = 0; idx < created_instances.size(); idx++) {
-          log_ff_mapper.print("Instance[%zu]: memory:" IDFMT "	proc:" IDFMT
-                              "	size:%zu	task:%s",
-                              idx,
-                              created_instances[idx].memory.id,
-                              created_instances[idx].processor.id,
-                              created_instances[idx].size,
-                              created_instances[idx].task_name.c_str());
-        }
-      }
+      // if (log_instance_creation) {
+      //   for (size_t idx = 0; idx < created_instances.size(); idx++) {
+      //     log_ff_mapper.print("Instance[%zu]: memory: " IDFMT "	proc: "
+      //     IDFMT
+      //                         "	size: %zu	task: %s",
+      //                         idx,
+      //                         created_instances[idx].memory.id,
+      //                         created_instances[idx].processor.id,
+      //                         created_instances[idx].size,
+      //                         created_instances[idx].task_name.c_str());
+      //   }
+      // }
       // Report failed to creation
       log_ff_mapper.error(
           "Out of memory! FlexFlow failed to reserve block of size %s"
@@ -693,6 +692,16 @@ void FFMapper::map_task(MapperContext const ctx,
       clog.memory = target_mem;
       clog.processor = task.target_proc;
       created_instances.push_back(clog);
+      log_ff_mapper.print(
+          "Created Instance[%lu]: memory_kind: %s memory_id: %llx	"
+          "proc: " IDFMT "	size: %zu	(capacity %lu) task: %s",
+          created_instances.size() - 1,
+          Legion::Mapping::Utilities::to_string(clog.memory.kind()),
+          clog.memory.id,
+          clog.processor.id,
+          clog.size,
+          clog.memory.capacity(),
+          clog.task_name.c_str());
     }
   } // for idx
 }
diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp
index 1add43ecd..ae66d9b86 100644
--- a/src/ops/add_bias_residual_layer_norm.cpp
+++ b/src/ops/add_bias_residual_layer_norm.cpp
@@ -38,7 +38,8 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta(
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "AddBiasResidualLayerNormMeta");
   mean_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
   rstd_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index ceb1a6514..2ce5605b6 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -37,7 +37,8 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta(
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "AddBiasResidualLayerNormMeta");
   mean_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
   rstd_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index fbeb5497c..a88963aaa 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -228,8 +228,8 @@ ArgTopKMeta::ArgTopKMeta(FFHandler handler,
                          MemoryAllocator &gpu_mem_allocator)
     : OpMeta(handler, op) {
   max_input_size = BatchConfig::MAX_NUM_TOKENS * 32000; // TODO: use vocab_size
-  gpu_mem_allocator.create_legion_instance(reserveInst,
-                                           sizeof(half) * max_input_size);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, sizeof(half) * max_input_size, "ArgTopKMeta");
   half_precision_output = gpu_mem_allocator.allocate_instance_untyped(
       sizeof(half) * max_input_size);
 }
diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp
index 8a1cf0b3b..bd0b2bd19 100644
--- a/src/ops/argmax.cpp
+++ b/src/ops/argmax.cpp
@@ -493,7 +493,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler,
   size_t prob_size = batch_size;
   assert(data_type == DT_FLOAT || data_type == DT_HALF);
   size_t total_size = prob_size * sizeof(float);
-  gpu_mem_allocator.create_legion_instance(reserveInst, total_size);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, total_size, "ArgMaxMeta");
   probs = gpu_mem_allocator.allocate_instance<float>(prob_size);
 }
 ArgMaxMeta::~ArgMaxMeta(void) {
diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu
index e7baef6d1..42d1a96f3 100644
--- a/src/ops/argmax.cu
+++ b/src/ops/argmax.cu
@@ -161,7 +161,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler,
            ? sizeof(cub::KeyValuePair<int, float>) * batch_size
            : sizeof(cub::KeyValuePair<int, half>) * batch_size) +
       prob_size * sizeof(float);
-  gpu_mem_allocator.create_legion_instance(reserveInst, total_size);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, total_size, "ArgMaxMeta");
   d_offsets = gpu_mem_allocator.allocate_instance<int>(d_offsets_size);
   d_out = data_type == DT_FLOAT
               ? gpu_mem_allocator.allocate_instance_untyped(
@@ -200,7 +201,8 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler,
         stream));
   }
 
-  gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, temp_storage_bytes, "ArgMaxMeta");
   d_temp_storage =
       gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes);
 }
diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index 6307362ea..d95cf1469 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -355,7 +355,7 @@ void FusedOp::init(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
-  launcher.concurrent = true;
+  // launcher.concurrent = true;
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   switch (domain.get_dim()) {
@@ -446,7 +446,7 @@ void FusedOp::init_inference(FFModel const &ff,
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
-  launcher.concurrent = true;
+  // launcher.concurrent = true;
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   switch (domain.get_dim()) {
diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp
index 9ba9f7b8b..8b39b8b37 100644
--- a/src/ops/fused.cpp
+++ b/src/ops/fused.cpp
@@ -1048,7 +1048,7 @@ __host__ void
         assert(fused->op_num_outputs[op] == 1);
         AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op];
         Kernels::AllReduce::inference_kernel_wrapper(
-            m, bc, my_input_accessor[0], my_output_accessor[0]);
+            ctx, runtime, m, bc, my_input_accessor[0], my_output_accessor[0]);
         break;
       }
       default: {
diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 0878eb6fe..3635fda9c 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -607,7 +607,7 @@ GumbelTopKMeta::GumbelTopKMeta(FFHandler handler,
       BatchConfig::MAX_NUM_TOKENS *
       max(BatchConfig::MAX_SPECULATIVE_TREE_BRANCHES, CUDA_NUM_THREADS);
   gpu_mem_allocator.create_legion_instance(
-      reserveInst, sizeof(curandState) * state_max_length);
+      reserveInst, sizeof(curandState) * state_max_length, "GumbelTopKMeta");
   state = gpu_mem_allocator.allocate_instance<curandState>(state_max_length);
 }
 
diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp
index 5e07fa214..449940155 100644
--- a/src/ops/inc_multihead_self_attention.cpp
+++ b/src/ops/inc_multihead_self_attention.cpp
@@ -1010,9 +1010,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       assert(gpu_mem_allocator.reserved_total_size -
                  gpu_mem_allocator.reserved_allocated_size >=
              totalSharedSize);
-      gpu_mem_allocator.create_legion_instance(reserveInst, instance_size);
+      gpu_mem_allocator.create_legion_instance(
+          reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta");
     } else {
-      gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+      gpu_mem_allocator.create_legion_instance(
+          reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta");
     }
 
     // in tree_verify, enable devQKVProjArray;
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 7472b61f0..e220e8285 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -568,9 +568,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
       assert(gpu_mem_allocator.reserved_total_size -
                  gpu_mem_allocator.reserved_allocated_size >=
              totalSharedSize);
-      gpu_mem_allocator.create_legion_instance(reserveInst, instance_size);
+      gpu_mem_allocator.create_legion_instance(
+          reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta");
     } else {
-      gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+      gpu_mem_allocator.create_legion_instance(
+          reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta");
     }
 
     // in tree_verify, enable devQKVProjArray;
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index c30c9f71c..c495e42eb 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -40,7 +40,7 @@ LinearMeta::LinearMeta(FFHandler handler,
   }
   // Allocate an all-one's vector
   gpu_mem_allocator.create_legion_instance(
-      reserveInst, data_type_size(data_type) * batch_size);
+      reserveInst, data_type_size(data_type) * batch_size, "LinearMeta");
   one_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * batch_size);
   int parallelism = batch_size;
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp
index 690655645..ed0b0f9a5 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cpp
+++ b/src/ops/kernels/residual_rms_norm_kernels.cpp
@@ -42,7 +42,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler,
   size_t rms_ptr_size = batch_size;
   size_t norm_ptr_size = num_elements;
   size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type);
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "ResidualRMSNormMeta");
   rms_ptr = gpu_mem_allocator.allocate_instance_untyped(
       rms_ptr_size * data_type_size(data_type));
   norm_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 2c82308ab..65125bae1 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -43,7 +43,8 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler,
   size_t rms_ptr_size = batch_size;
   size_t norm_ptr_size = num_elements;
   size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type);
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "ResidualRMSNormMeta");
   rms_ptr = gpu_mem_allocator.allocate_instance_untyped(
       rms_ptr_size * data_type_size(data_type));
   norm_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp
index 24ab7051e..9636929d9 100644
--- a/src/ops/kernels/rms_norm_kernels.cpp
+++ b/src/ops/kernels/rms_norm_kernels.cpp
@@ -42,7 +42,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler,
   size_t rms_ptr_size = batch_size;
   size_t norm_ptr_size = num_elements;
   size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type);
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "RMSNormMeta");
   rms_ptr = gpu_mem_allocator.allocate_instance_untyped(
       rms_ptr_size * data_type_size(data_type));
   norm_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu
index 7c9f4a9f9..8555e58be 100644
--- a/src/ops/kernels/rms_norm_kernels.cu
+++ b/src/ops/kernels/rms_norm_kernels.cu
@@ -43,7 +43,8 @@ RMSNormMeta::RMSNormMeta(FFHandler handler,
   size_t rms_ptr_size = batch_size;
   size_t norm_ptr_size = num_elements;
   size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type);
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "RMSNormMeta");
   rms_ptr = gpu_mem_allocator.allocate_instance_untyped(
       rms_ptr_size * data_type_size(data_type));
   norm_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu
index 44979c48f..4289a9236 100644
--- a/src/ops/layer_norm.cu
+++ b/src/ops/layer_norm.cu
@@ -37,7 +37,8 @@ LayerNormMeta::LayerNormMeta(FFHandler handle,
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 6;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "LayerNormMeta");
   mean_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
   rstd_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp
index f1b7a537b..046a4bc25 100644
--- a/src/ops/residual_layer_norm.cpp
+++ b/src/ops/residual_layer_norm.cpp
@@ -38,7 +38,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle,
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "ResidualLayerNormMeta");
   mean_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
   rstd_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu
index e5ebdce6e..05e66db02 100644
--- a/src/ops/residual_layer_norm.cu
+++ b/src/ops/residual_layer_norm.cu
@@ -37,7 +37,8 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle,
   eps = ln->eps;
   DataType data_type = ln->data_type;
   size_t totalSize = effective_batch_size * data_type_size(data_type) * 3;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "ResidualLayerNormMeta");
   mean_ptr = gpu_mem_allocator.allocate_instance_untyped(
       data_type_size(data_type) * effective_batch_size);
   rstd_ptr = gpu_mem_allocator.allocate_instance_untyped(
diff --git a/src/ops/sampling.cpp b/src/ops/sampling.cpp
index 3d8f10352..03e37333e 100644
--- a/src/ops/sampling.cpp
+++ b/src/ops/sampling.cpp
@@ -204,7 +204,8 @@ SamplingMeta::SamplingMeta(FFHandler handler,
                                     idx_size + sorted_idx_size) +
                      data_type_size(data_type) * sorted_logits_size +
                      sizeof(hiprandState) * state_size;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "SamplingMeta");
   begin_offset = gpu_mem_allocator.allocate_instance<int>(begin_offset_size);
   end_offset = gpu_mem_allocator.allocate_instance<int>(end_offset_size);
   idx = gpu_mem_allocator.allocate_instance<int>(idx_size);
@@ -262,7 +263,8 @@ SamplingMeta::SamplingMeta(FFHandler handler,
   //   assert(false && "input type in float and half");
   // }
 
-  gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, temp_storage_bytes, "SamplingMeta");
   d_temp_storage =
       gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes);
 }
diff --git a/src/ops/sampling.cu b/src/ops/sampling.cu
index 494a5ab3f..686817096 100644
--- a/src/ops/sampling.cu
+++ b/src/ops/sampling.cu
@@ -228,7 +228,8 @@ SamplingMeta::SamplingMeta(FFHandler handler,
                                     idx_size + sorted_idx_size) +
                      data_type_size(data_type) * sorted_logits_size +
                      sizeof(curandState) * state_size;
-  gpu_mem_allocator.create_legion_instance(reserveInst, totalSize);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, totalSize, "SamplingMeta");
   begin_offset = gpu_mem_allocator.allocate_instance<int>(begin_offset_size);
   end_offset = gpu_mem_allocator.allocate_instance<int>(end_offset_size);
   idx = gpu_mem_allocator.allocate_instance<int>(idx_size);
@@ -286,7 +287,8 @@ SamplingMeta::SamplingMeta(FFHandler handler,
     assert(false && "input type in float and half");
   }
 
-  gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes);
+  gpu_mem_allocator.create_legion_instance(
+      reserveInst, temp_storage_bytes, "SamplingMeta");
   d_temp_storage =
       gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes);
 }
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 303fb9aa7..2e5cc9fa7 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -204,6 +204,8 @@ Tensor FFModel::spec_inc_multiquery_self_attention(
   li->add_int_property("qk_prod_scaling", qk_prod_scaling);
   li->add_int_property("position_bias", position_bias);
   li->add_int_property("streaming_cache", streaming_cache);
+  li->add_int_property("tensor_parallelism_degree",
+                       config.tensor_parallelism_degree);
   layers.push_back(li);
   return li->outputs[0];
 }
@@ -255,6 +257,8 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
   bool position_bias = (bool)value;
   layer->get_int_property("streaming_cache", value);
   bool streaming_cache = (bool)value;
+  layer->get_int_property("tensor_parallelism_degree", value);
+  int tensor_parallelism_degree = (int)value;
 
   return new SpecIncMultiHeadSelfAttention(model,
                                            layer->layer_guid,
@@ -275,6 +279,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer(
                                            position_bias,
                                            false /*allocate_weights*/,
                                            streaming_cache,
+                                           tensor_parallelism_degree,
                                            layer->name);
 }
 
@@ -298,6 +303,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _position_bias,
     bool allocate_weights,
     bool _streaming_cache,
+    int _tensor_parallelism_degree,
     char const *name)
     // Initializer* _bias_initializer)
     : Op(model,
@@ -316,7 +322,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias), streaming_cache(_streaming_cache) {
+      position_bias(_position_bias), streaming_cache(_streaming_cache),
+      tensor_parallelism_degree(_tensor_parallelism_degree) {
   // overwrite layer_guid
   layer_guid = _layer_guid;
 
@@ -399,6 +406,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
     bool _position_bias,
     bool allocate_weights,
     bool _streaming_cache,
+    int _tensor_parallelism_degree,
     char const *name)
     // Initializer* _bias_initializer)
     : Op(model,
@@ -418,7 +426,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
       o_dim(_embed_dim), qoSeqLength(_input->dims[1].size),
       kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query),
       scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling),
-      position_bias(_position_bias), streaming_cache(_streaming_cache)
+      position_bias(_position_bias), streaming_cache(_streaming_cache),
+      tensor_parallelism_degree(_tensor_parallelism_degree)
 // bias_initializer(_bias_initializer)
 {
   numOutputs = 1;
@@ -508,6 +517,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     other.position_bias,
                                     allocate_weights,
                                     other.streaming_cache,
+                                    other.tensor_parallelism_degree,
                                     other.name) {}
 
 SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
@@ -535,6 +545,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention(
                                     params.position_bias,
                                     allocate_weights,
                                     params.streaming_cache,
+                                    params.tensor_parallelism_degree,
                                     params.name) {}
 
 void SpecIncMultiHeadSelfAttention::init_inference(
@@ -660,8 +671,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task(
   int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1;
   assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1);
   assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1);
-  int num_q_heads = attn->num_q_heads;
-  int num_kv_heads = attn->num_kv_heads;
+  int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree;
+  int num_kv_heads =
+      attn->num_kv_heads / attn->tensor_parallelism_degree +
+      (attn->num_kv_heads % attn->tensor_parallelism_degree != 0);
   assert(attn->o_dim == output.domain.hi()[0] - output.domain.lo()[0] + 1);
 
   Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
@@ -891,6 +904,7 @@ SpecIncMultiHeadSelfAttentionParams
   params.qk_prod_scaling = this->qk_prod_scaling;
   params.position_bias = this->position_bias;
   params.streaming_cache = this->streaming_cache;
+  params.tensor_parallelism_degree = this->tensor_parallelism_degree;
   if (this->name != nullptr) {
     strcpy(params.name, this->name);
   }
@@ -927,6 +941,7 @@ size_t hash<FlexFlow::SpecIncMultiHeadSelfAttentionParams>::operator()(
   hash_combine(key, params.qk_prod_scaling);
   hash_combine(key, params.position_bias);
   hash_combine(key, params.streaming_cache);
+  hash_combine(key, params.tensor_parallelism_degree);
   return key;
 }
 }; // namespace std
diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp
index e797d40d3..92bcbc546 100644
--- a/src/ops/spec_inc_multihead_self_attention.cpp
+++ b/src/ops/spec_inc_multihead_self_attention.cpp
@@ -634,8 +634,10 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta(
                                                   // be added here later
 
     // We always directly allocate memory for small speculative models
-    gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst,
-                                             total_size);
+    gpu_mem_allocator.create_legion_instance(
+        beam_search_reserve_inst,
+        total_size,
+        "SpecIncMultiHeadSelfAttentionMeta");
     beam_token_infos =
         gpu_mem_allocator
             .allocate_instance<TreeSearchBatchConfig::BeamSearchPerTokenInfo>(
diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp
index f748dafd6..cf3426b3e 100644
--- a/src/ops/tree_inc_multihead_self_attention.cpp
+++ b/src/ops/tree_inc_multihead_self_attention.cpp
@@ -643,8 +643,10 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta(
           gpu_mem_allocator.allocate_reserved<BatchConfig::CommittedTokensInfo>(
               committed_tokeninfo_size);
     } else {
-      gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst,
-                                               total_size);
+      gpu_mem_allocator.create_legion_instance(
+          committed_token_reserve_inst,
+          total_size,
+          "TreeIncMultiHeadSelfAttentionMeta");
       committed_token_infos =
           gpu_mem_allocator.allocate_instance<BatchConfig::CommittedTokensInfo>(
               committed_tokeninfo_size);
diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc
index c7b7a7433..7f38e2714 100644
--- a/src/parallel_ops/allreduce.cc
+++ b/src/parallel_ops/allreduce.cc
@@ -134,7 +134,7 @@ void AllReduce::init(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          outputs[0]->machine_view.hash());
-  launcher.concurrent = true;
+  // launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -173,7 +173,7 @@ void AllReduce::init_inference(FFModel const &ff,
                          false /*must*/,
                          0 /*mapper_id*/,
                          machine_view_hash);
-  launcher.concurrent = true;
+  // launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -278,7 +278,7 @@ void AllReduce::backward(FFModel const &ff) {
                          false /*must*/,
                          0 /*mapper_id*/,
                          inputs[0]->machine_view.hash());
-  launcher.concurrent = true;
+  // launcher.concurrent = true;
   launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
                                                     0 /*projection id*/,
                                                     READ_WRITE,
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp
index 8d7e20e39..1e60728fa 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cpp
+++ b/src/parallel_ops/kernels/allreduce_kernels.cpp
@@ -25,7 +25,9 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct)
 namespace Kernels {
 namespace AllReduce {
 
-void inference_kernel_wrapper(AllReduceMeta const *m,
+void inference_kernel_wrapper(Legion::Context ctx,
+                              Legion::Runtime *runtime,
+                              AllReduceMeta const *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
                               GenericTensorAccessorW const &output) {
@@ -37,6 +39,7 @@ void inference_kernel_wrapper(AllReduceMeta const *m,
   size_t num_elements = bc->num_tokens * hidden_dim_size;
 #ifdef FF_USE_NCCL
   ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type);
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(input.ptr,
                           output.ptr,
                           num_elements,
@@ -44,12 +47,15 @@ void inference_kernel_wrapper(AllReduceMeta const *m,
                           ncclSum,
                           m->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
 #else
   assert(false && "Must enable FF_USE_NCCL to use AllReduce operators");
 #endif
 }
 
-void forward_kernel_wrapper(AllReduceMeta const *m,
+void forward_kernel_wrapper(Legion::Context ctx,
+                            Legion::Runtime *runtime,
+                            AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output) {
   hipStream_t stream;
@@ -59,6 +65,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m,
   size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1;
 #ifdef FF_USE_NCCL
   ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type);
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(input.ptr,
                           output.ptr,
                           input.domain.get_volume(),
@@ -66,6 +73,7 @@ void forward_kernel_wrapper(AllReduceMeta const *m,
                           ncclSum,
                           m->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
 #else
   assert(false && "Must enable FF_USE_NCCL to use AllReduce operators");
 #endif
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 52bd05b06..8644a5a3c 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -29,7 +29,8 @@ AllReduceMeta::AllReduceMeta(FFHandler handle,
                      tensorrt_llm::MAX_RANKS_PER_NODE;
   gpu_mem_allocator.create_legion_instance(
       reserveInst,
-      sizeof(void *) * (handle.num_devices + 1) + barrier_ptr_size * 2);
+      sizeof(void *) * (handle.num_devices + 1) + barrier_ptr_size * 2,
+      "AllReduceMeta");
   allgather_src = gpu_mem_allocator.allocate_instance_untyped(sizeof(void *));
   allgather_dst = gpu_mem_allocator.allocate_instance_untyped(
       sizeof(void *) * handle.num_devices);
@@ -57,7 +58,9 @@ AllReduceMeta::~AllReduceMeta() {
 namespace Kernels {
 namespace AllReduce {
 
-CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
+CommunicationBuffer *get_or_create_comm_buffer(Legion::Context ctx,
+                                               Legion::Runtime *runtime,
+                                               AllReduceMeta *m,
                                                int num_devices,
                                                int device_id,
                                                ncclComm_t ncclComm,
@@ -68,7 +71,9 @@ CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
     return iter->second;
   } else {
     CommunicationBuffer *comm_buffer =
-        create_comm_buf_with_local_ptr(num_devices,
+        create_comm_buf_with_local_ptr(ctx,
+                                       runtime,
+                                       num_devices,
                                        device_id,
                                        ncclComm,
                                        m->allgather_src,
@@ -118,8 +123,8 @@ inline bool CanApplyTwoShotAllReduce(int64_t num_elements,
 }
 
 // Customized all-reduce kernel backed by CUDA Peer memory.
-void inference_kernel_wrapper(Context ctx,
-                              Runtime *runtime,
+void inference_kernel_wrapper(Legion::Context ctx,
+                              Legion::Runtime *runtime,
                               AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
@@ -133,68 +138,72 @@ void inference_kernel_wrapper(Context ctx,
   assert(input.domain == output.domain);
   size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1;
   size_t num_elements = bc->num_tokens * hidden_dim_size;
-  int num_devices = m->handle.num_devices;
-  int device_id = m->handle.device_id;
+  // int num_devices = m->handle.num_devices;
+  // int device_id = m->handle.device_id;
   ncclComm_t ncclComm = m->handle.ncclComm;
   DataType dtype = input.data_type;
 
-  tensorrt_llm::AllReduceStrategyType strategy =
-      tensorrt_llm::SelectImplementation(
-          num_elements * ((get_bits(dtype) + 7) / 8), num_devices);
+  // tensorrt_llm::AllReduceStrategyType strategy =
+  //     tensorrt_llm::SelectImplementation(
+  //         num_elements * ((get_bits(dtype) + 7) / 8), num_devices);
 
-  if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
-      !CanApplyCustomAllReduce(num_elements, dtype)) {
+  // if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
+  //     !CanApplyCustomAllReduce(num_elements, dtype)) {
     // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
-    ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
-    runtime->concurrent_task_barrier(ctx);
-    checkNCCL(ncclAllReduce(input.ptr,
-                            output.ptr,
-                            num_elements,
-                            nccl_data_type,
-                            ncclSum,
-                            ncclComm,
-                            stream));
-    runtime->concurrent_task_barrier(ctx);
-    return;
-  }
-
-  // Initialize the all-reduce kernel arguments.
-  tensorrt_llm::AllReduceParams params;
-  params.ranks_per_node = num_devices;
-  params.rank = device_id;
-  params.local_rank = device_id;
-  CommunicationBuffer *comm_buffer =
-      get_or_create_comm_buffer(m,
-                                num_devices,
-                                device_id,
-                                ncclComm,
-                                const_cast<void *>(input.ptr),
-                                stream);
-  params.barrier_flag = ++(*comm_buffer->barrier_flag);
-  for (int i = 0; i < num_devices; ++i) {
-    params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
-  }
-  for (int i = 0; i < num_devices; ++i) {
-    params.peer_barrier_ptrs_in[i] =
-        reinterpret_cast<uint32_t *>(comm_buffer->barrier_in[i]);
-  }
-  for (int i = 0; i < num_devices; ++i) {
-    params.peer_barrier_ptrs_out[i] =
-        reinterpret_cast<uint32_t *>(comm_buffer->barrier_out[i]);
-  }
-
-  if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) {
-    // Two-shot all-reduce does not support this case.
-    // So we fallback to the one-shot strategy.
-    strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
-  }
-
-  tensorrt_llm::customAllReduce(
-      params, output.ptr, num_elements, dtype, strategy, stream);
+  ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
+  runtime->concurrent_task_barrier(ctx);
+  checkNCCL(ncclAllReduce(input.ptr,
+                          output.ptr,
+                          num_elements,
+                          nccl_data_type,
+                          ncclSum,
+                          ncclComm,
+                          stream));
+  runtime->concurrent_task_barrier(ctx);
+  //   return;
+  // }
+
+  // // Initialize the all-reduce kernel arguments.
+  // tensorrt_llm::AllReduceParams params;
+  // params.ranks_per_node = num_devices;
+  // params.rank = device_id;
+  // params.local_rank = device_id;
+  // CommunicationBuffer *comm_buffer =
+  //     get_or_create_comm_buffer(ctx,
+  //                               runtime,
+  //                               m,
+  //                               num_devices,
+  //                               device_id,
+  //                               ncclComm,
+  //                               const_cast<void *>(input.ptr),
+  //                               stream);
+  // params.barrier_flag = ++(*comm_buffer->barrier_flag);
+  // for (int i = 0; i < num_devices; ++i) {
+  //   params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
+  // }
+  // for (int i = 0; i < num_devices; ++i) {
+  //   params.peer_barrier_ptrs_in[i] =
+  //       reinterpret_cast<uint32_t *>(comm_buffer->barrier_in[i]);
+  // }
+  // for (int i = 0; i < num_devices; ++i) {
+  //   params.peer_barrier_ptrs_out[i] =
+  //       reinterpret_cast<uint32_t *>(comm_buffer->barrier_out[i]);
+  // }
+
+  // if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) {
+  //   // Two-shot all-reduce does not support this case.
+  //   // So we fallback to the one-shot strategy.
+  //   strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
+  // }
+
+  // runtime->concurrent_task_barrier(ctx);
+  // tensorrt_llm::customAllReduce(
+  //     params, output.ptr, num_elements, dtype, strategy, stream);
+  // runtime->concurrent_task_barrier(ctx);
 }
 
-void forward_kernel_wrapper(Context ctx,
-                            Runtime *runtime,
+void forward_kernel_wrapper(Legion::Context ctx,
+                            Legion::Runtime *runtime,
                             AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output) {
diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc
index b5d4c10ce..ba110f676 100644
--- a/src/runtime/batch_config.cc
+++ b/src/runtime/batch_config.cc
@@ -129,6 +129,16 @@ std::ostream &operator<<(std::ostream &os,
   return os;
 }
 
+std::ostream &operator<<(std::ostream &os, BatchConfig::BitMask const &bm) {
+  os << "BitMask {\n"
+     << "  non_tree_cache_size: " << bm.non_tree_cache_size << "\n"
+     << "  tree_or_prompt_size: " << bm.tree_or_prompt_size << "\n"
+     << "  current_layer_size: " << bm.current_layer_size << "\n"
+     << "  bit_mask: [" << bm.bit_mask << "]\n";
+  os << "}";
+  return os;
+}
+
 std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   os << "@@@@@@@@@@@@@@ Batch Config (mode " << bc.get_mode()
      << ") @@@@@@@@@@@@@@" << std::endl;
@@ -241,6 +251,38 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) {
   return os;
 }
 
+std::ostream &operator<<(std::ostream &os, InferenceResult const &ir) {
+  os << "InferenceResult {\n"
+     << "  num_token_ids: " << ir.num_token_ids << "\n"
+     << "  num_gumbel_logits: " << ir.num_gumbel_logits << "\n"
+     << "  token_ids: [";
+  for (int i = 0; i < ir.num_token_ids; i++) {
+    os << ir.token_ids[i];
+    if (i < ir.num_token_ids - 1) {
+      os << ", ";
+    }
+  }
+  os << "]\n"
+     << "  probs: [";
+  for (int i = 0; i < ir.num_token_ids; i++) {
+    os << ir.probs[i];
+    if (i < ir.num_token_ids - 1) {
+      os << ", ";
+    }
+  }
+  os << "]\n"
+     << "  gumbel_logits: [";
+  for (int i = 0; i < ir.num_gumbel_logits; i++) {
+    os << ir.gumbel_logits[i];
+    if (i < ir.num_gumbel_logits - 1) {
+      os << ", ";
+    }
+  }
+  os << "]\n"
+     << "}";
+  return os;
+}
+
 void BatchConfig::print() const {
   std::cout << *this << std::endl;
 }
diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc
index bd6a862d0..14e806d49 100644
--- a/src/runtime/file_loader.cc
+++ b/src/runtime/file_loader.cc
@@ -650,14 +650,21 @@ void load_from_quantized_file(char *ptr,
 
 void FileDataLoader::load_quantization_weight(FFModel *ff,
                                               Layer *l,
-                                              int weight_idx) {
-  Tensor weight = l->weights[weight_idx];
-  size_t volume = 1;
+                                              int weight_idx,
+                                              size_t volume,
+                                              size_t num_replicas,
+                                              char *weight,
+                                              DataType data_type,
+                                              Domain weight_domain) {
+  // Tensor weight = l->weights[weight_idx];
+  size_t volume_ = 1;
   std::vector<int> dims_vec;
-  for (int i = 0; i < weight->num_dims; i++) {
-    dims_vec.push_back(weight->dims[i]);
-    volume *= weight->dims[i];
+  for (int i = 0; i < weight_domain.get_dim(); i++) {
+    int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1;
+    dims_vec.push_back(dim_i);
+    volume_ *= dim_i;
   }
+  assert(volume_ == volume * num_replicas);
   char *data = (char *)malloc(sizeof(char) * volume);
 
   std::string weight_filename = removeGuidOperatorName(std::string(l->name));
@@ -672,7 +679,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
                                        head_dim,
                                        weight_filename,
                                        weights_folder,
-                                       weight->data_type,
+                                       data_type,
                                        use_full_precision);
     }
     // else {
@@ -694,13 +701,18 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
     load_from_quantized_file(data,
                              volume,
                              join_path({weights_folder, weight_filename}),
-                             weight->data_type,
+                             data_type,
                              use_full_precision);
   }
 
-  ParallelTensor weight_pt;
-  ff->get_parallel_tensor_from_tensor(weight, weight_pt);
-  weight_pt->set_tensor<char>(ff, dims_vec, data);
+  // ParallelTensor weight_pt;
+  // ff->get_parallel_tensor_from_tensor(weight, weight_pt);
+  // weight_pt->set_tensor<char>(ff, dims_vec, data);
+  char *ptr = weight;
+  for (size_t i = 0; i < num_replicas; i++) {
+    memcpy(ptr, data, volume * sizeof(char));
+    ptr += volume;
+  }
 
   free(data);
 }
@@ -708,17 +720,22 @@ void FileDataLoader::load_quantization_weight(FFModel *ff,
 template <typename DT>
 void FileDataLoader::load_single_weight_tensor(FFModel *ff,
                                                Layer *l,
-                                               int weight_idx) {
-  Tensor weight = l->weights[weight_idx];
+                                               int weight_idx,
+                                               size_t volume,
+                                               size_t num_replicas,
+                                               DT *weight,
+                                               Domain weight_domain) {
 
   // Create a buffer to store weight data from the file
-  size_t volume = 1;
+  size_t volume_ = 1;
   std::vector<int> dims_vec;
-  for (int i = 0; i < weight->num_dims; i++) {
-    dims_vec.push_back(weight->dims[i]);
-    volume *= weight->dims[i];
+  for (int i = 0; i < weight_domain.get_dim(); i++) {
+    int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1;
+    dims_vec.push_back(dim_i);
+    volume_ *= dim_i;
   }
-  assert(data_type_size(weight->data_type) == sizeof(DT));
+  assert(volume_ == volume * num_replicas);
+  // assert(data_type_size(weight->data_type) == sizeof(DT));
   DT *data = (DT *)malloc(sizeof(DT) * volume);
 
   std::string weight_filename = removeGuidOperatorName(std::string(l->name));
@@ -778,74 +795,67 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff,
     }
   }
 
-  // Copy the weight data from the buffer to the weight's ParallelTensor
-  ParallelTensor weight_pt;
-  ff->get_parallel_tensor_from_tensor(weight, weight_pt);
-  weight_pt->set_tensor<DT>(ff, dims_vec, data);
+  // Copy the weight data from the buffer to the weight
+  DT *ptr = weight;
+  for (size_t i = 0; i < num_replicas; i++) {
+    memcpy(ptr, data, volume * sizeof(DT));
+    ptr += volume;
+  }
 
   // Free buffer memory
   free(data);
 }
 
-#ifdef DEADCODE
-void FileDataLoader::load_weights(FFModel *ff) {
-  for (Layer *l : ff->layers) {
-    if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) {
-      continue;
-    }
-    for (int i = 0; i < l->numWeights; i++) {
-      Tensor weight = l->weights[i];
-      if (weight == NULL) {
-        continue;
-      }
-      switch (weight->data_type) {
-        case DT_HALF:
-          load_single_weight_tensor<half>(ff, l, i);
-          break;
-        case DT_FLOAT:
-          load_single_weight_tensor<float>(ff, l, i);
-          break;
-        case DT_INT4:
-        case DT_INT8:
-          // load weights in quantization
-          load_quantization_weight(ff, l, i);
-          break;
-        default:
-          assert(false && "Unsupported data type");
-      }
-    }
-  }
-}
-#endif
-
-void FileDataLoader::load_float_weight_task(
+void FileDataLoader::load_weight_task(
     Legion::Task const *task,
     std::vector<Legion::PhysicalRegion> const &regions,
     Legion::Context ctx,
     Legion::Runtime *runtime) {
   WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
-  args->loader->load_single_weight_tensor<float>(
-      args->ff, args->layer, args->weight_idx);
-}
 
-void FileDataLoader::load_half_weight_task(
-    Legion::Task const *task,
-    std::vector<Legion::PhysicalRegion> const &regions,
-    Legion::Context ctx,
-    Legion::Runtime *runtime) {
-  WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
-  args->loader->load_single_weight_tensor<half>(
-      args->ff, args->layer, args->weight_idx);
-}
-
-void FileDataLoader::load_quant_weight_task(
-    Legion::Task const *task,
-    std::vector<Legion::PhysicalRegion> const &regions,
-    Legion::Context ctx,
-    Legion::Runtime *runtime) {
-  WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args;
-  args->loader->load_quantization_weight(
-      args->ff, args->layer, args->weight_idx);
+  assert(task->regions.size() == regions.size());
+  assert(regions.size() == 1); // one weight only
+  GenericTensorAccessorW weight = helperGetGenericTensorAccessorWO(
+      args->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  Domain weight_domain = runtime->get_index_space_domain(
+      ctx, task->regions[0].region.get_index_space());
+
+  switch (args->data_type) {
+    case DT_HALF: {
+      args->loader->load_single_weight_tensor<half>(args->ff,
+                                                    args->layer,
+                                                    args->weight_idx,
+                                                    args->volume,
+                                                    args->num_replicas,
+                                                    weight.get_half_ptr(),
+                                                    weight_domain);
+      break;
+    }
+    case DT_FLOAT: {
+      args->loader->load_single_weight_tensor<float>(args->ff,
+                                                     args->layer,
+                                                     args->weight_idx,
+                                                     args->volume,
+                                                     args->num_replicas,
+                                                     weight.get_float_ptr(),
+                                                     weight_domain);
+      break;
+    }
+    case DT_INT4:
+    case DT_INT8: {
+      args->loader->load_quantization_weight(args->ff,
+                                             args->layer,
+                                             args->weight_idx,
+                                             args->volume,
+                                             args->num_replicas,
+                                             weight.get_byte_ptr(),
+                                             args->data_type,
+                                             weight_domain);
+      break;
+    }
+    default:
+      assert(false && "Unsupported data type");
+  }
 }
 
 void FileDataLoader::load_weights_parallel(FFModel *ff,
@@ -857,41 +867,46 @@ void FileDataLoader::load_weights_parallel(FFModel *ff,
     if (l->numWeights < 1 || l->name == NULL || strlen(l->name) < 1) {
       continue;
     }
+
     for (int i = 0; i < l->numWeights; i++) {
       Tensor weight = l->weights[i];
       if (weight == NULL) {
         continue;
       }
 
+      if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF &&
+          weight->data_type != DT_INT4 && weight->data_type != DT_INT8) {
+        assert(false && "Unsupported data type");
+      }
+
+      ParallelTensor weight_pt;
+      ff->get_parallel_tensor_from_tensor(weight, weight_pt);
+
       // Create task arguments
-      WeightLoadTaskArgs args(ff, this, l, i);
-
-      switch (weight->data_type) {
-        case DT_HALF: {
-          TaskLauncher launcher(
-              LOAD_HALF_WEIGHT_TASK_ID,
-              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
-          futures.push_back(runtime->execute_task(ctx, launcher));
-          break;
-        }
-        case DT_FLOAT: {
-          TaskLauncher launcher(
-              LOAD_FLOAT_WEIGHT_TASK_ID,
-              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
-          futures.push_back(runtime->execute_task(ctx, launcher));
-          break;
-        }
-        case DT_INT4:
-        case DT_INT8: {
-          TaskLauncher launcher(
-              LOAD_QUANT_WEIGHT_TASK_ID,
-              TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
-          futures.push_back(runtime->execute_task(ctx, launcher));
-          break;
+      size_t volume = 1, num_replicas = 1;
+      if (weight_pt->sync_type == ParameterSyncType::NCCL) {
+        for (int i = 0; i < weight_pt->num_dims; i++) {
+          if (weight_pt->dims[i].is_replica_dim) {
+            num_replicas *= weight_pt->dims[i].size;
+          }
         }
-        default:
-          assert(false && "Unsupported data type");
+      } else if (weight_pt->sync_type == ParameterSyncType::PS) {
+        num_replicas = 1;
+      } else {
+        num_replicas = 1;
+      }
+      for (int i = 0; i < weight->num_dims; i++) {
+        volume *= weight->dims[i];
       }
+      WeightLoadTaskArgs args(
+          ff, this, l, i, volume, num_replicas, weight->data_type);
+      // launch task asynchronously
+      TaskLauncher launcher(LOAD_WEIGHT_TASK_ID,
+                            TaskArgument(&args, sizeof(WeightLoadTaskArgs)));
+      launcher.add_region_requirement(RegionRequirement(
+          weight_pt->region, WRITE_ONLY, EXCLUSIVE, weight_pt->region));
+      launcher.add_field(0, FID_DATA);
+      futures.push_back(runtime->execute_task(ctx, launcher));
     }
   }
 
diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc
index 326f446aa..30f42327f 100644
--- a/src/runtime/graph.cc
+++ b/src/runtime/graph.cc
@@ -2388,6 +2388,7 @@ GraphOptimalViewSerialized
         sez.serialize(attn->position_bias);
         sez.serialize(attn->streaming_cache);
         sez.serialize(attn->num_kv_heads);
+        sez.serialize(attn->tensor_parallelism_degree);
         sez.serialize(strlen(attn->name));
         sez.serialize(attn->name, strlen(attn->name));
         break;
@@ -2903,7 +2904,8 @@ void FFModel::deserialize_graph_optimal_view(
       }
       case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: {
         assert(num_inputs == 1);
-        int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads;
+        int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads,
+            tensor_parallelism_degree;
         float dropout, scaling_factor;
         bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
             scaling_query, qk_prod_scaling, position_bias, streaming_cache;
@@ -2938,6 +2940,7 @@ void FFModel::deserialize_graph_optimal_view(
         dez.deserialize(position_bias);
         dez.deserialize(streaming_cache);
         dez.deserialize(num_kv_heads);
+        dez.deserialize(tensor_parallelism_degree);
         size_t name_len;
         char name[MAX_OPNAME] = {0};
         dez.deserialize(name_len);
@@ -2960,6 +2963,7 @@ void FFModel::deserialize_graph_optimal_view(
         params.position_bias = position_bias;
         params.streaming_cache = streaming_cache;
         params.num_kv_heads = num_kv_heads;
+        params.tensor_parallelism_degree = tensor_parallelism_degree;
         strcpy(params.name, name);
         node = get_or_create_node<SpecIncMultiHeadSelfAttention>(inputs[0],
                                                                  params);
@@ -3223,21 +3227,21 @@ void FFModel::deserialize_graph_optimal_view(
     optimal_views[guid_to_nodes[guid]] = view;
   }
   assert(dez.get_remaining_bytes() == 0);
-  printf("Deserialized Views...\n");
-  for (auto const &it : optimal_views) {
-    printf("node[%zu]: type(%s) view(%d %d %d) ",
-           it.first.guid,
-           it.first.to_string().c_str(),
-           it.second.ndims,
-           it.second.dim[0],
-           it.second.start_device_id);
-    auto const &list = graph->inEdges.at(it.first);
-    for (auto const &it2 : list) {
-      Edge e = it2;
-      printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx);
-    }
-    printf("\n");
-  }
+  // printf("Deserialized Views...\n");
+  // for (auto const &it : optimal_views) {
+  //   printf("node[%zu]: type(%s) view(%d %d %d) ",
+  //          it.first.guid,
+  //          it.first.to_string().c_str(),
+  //          it.second.ndims,
+  //          it.second.dim[0],
+  //          it.second.start_device_id);
+  //   auto const &list = graph->inEdges.at(it.first);
+  //   for (auto const &it2 : list) {
+  //     Edge e = it2;
+  //     printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx);
+  //   }
+  //   printf("\n");
+  // }
 }
 
 }; // namespace FlexFlow
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index dd13bb2e0..864af656c 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -238,41 +238,41 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model,
   }
 
   // print optimized graph
-  for (size_t i = 0; i < model->operators.size(); i++) {
-    Op *op = model->operators[i];
-    if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) {
-      continue;
-    }
-    printf("operator[%zu]: type(%s) guid(%lu)\n",
-           i,
-           get_operator_type_name(model->operators[i]->op_type).c_str(),
-           model->operators[i]->op_guid);
-    for (int j = 0; j < op->numInputs; j++) {
-      assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end());
-      LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region;
-      printf("\tinputs[%d] mapped_region(%d,%d,%d)\n",
-             j,
-             handle.get_index_space().get_id(),
-             handle.get_field_space().get_id(),
-             handle.get_tree_id());
-    }
-    for (int j = 0; j < op->numOutputs; j++) {
-      LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region;
-      printf("\toutputs[%d] mapped_region(%d,%d,%d)\n",
-             j,
-             handle.get_index_space().get_id(),
-             handle.get_field_space().get_id(),
-             handle.get_tree_id());
-    }
-    for (int j = 0; j < op->numWeights; j++) {
-      LogicalRegion handle = op->weights[j]->region;
-      printf("\tweights[%d] mapped_region(%d,%d,%d)\n",
-             j,
-             handle.get_index_space().get_id(),
-             handle.get_field_space().get_id(),
-             handle.get_tree_id());
-    }
-  }
+  // for (size_t i = 0; i < model->operators.size(); i++) {
+  //   Op *op = model->operators[i];
+  //   if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) {
+  //     continue;
+  //   }
+  //   printf("operator[%zu]: type(%s) guid(%lu)\n",
+  //          i,
+  //          get_operator_type_name(model->operators[i]->op_type).c_str(),
+  //          model->operators[i]->op_guid);
+  //   for (int j = 0; j < op->numInputs; j++) {
+  //     assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end());
+  //     LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region;
+  //     printf("\tinputs[%d] mapped_region(%d,%d,%d)\n",
+  //            j,
+  //            handle.get_index_space().get_id(),
+  //            handle.get_field_space().get_id(),
+  //            handle.get_tree_id());
+  //   }
+  //   for (int j = 0; j < op->numOutputs; j++) {
+  //     LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region;
+  //     printf("\toutputs[%d] mapped_region(%d,%d,%d)\n",
+  //            j,
+  //            handle.get_index_space().get_id(),
+  //            handle.get_field_space().get_id(),
+  //            handle.get_tree_id());
+  //   }
+  //   for (int j = 0; j < op->numWeights; j++) {
+  //     LogicalRegion handle = op->weights[j]->region;
+  //     printf("\tweights[%d] mapped_region(%d,%d,%d)\n",
+  //            j,
+  //            handle.get_index_space().get_id(),
+  //            handle.get_field_space().get_id(),
+  //            handle.get_tree_id());
+  //   }
+  // }
 }
 
 void InferenceManager::init_operators_inference(FFModel *model) {
@@ -525,7 +525,7 @@ void FFModel::compile_inference() {
     deserialize_graph_optimal_view(dez, best_graph, optimal_views);
     operators.clear();
     convert_graph_to_operators(best_graph, optimal_views);
-    best_graph->print_dot();
+    // best_graph->print_dot();
     delete best_graph;
     for (auto const &layer : layers) {
       // map inputs to parallel tensor
diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc
index 06a7c468a..46bef18c8 100644
--- a/src/runtime/memory_allocator.cc
+++ b/src/runtime/memory_allocator.cc
@@ -14,6 +14,7 @@
  */
 
 #include "flexflow/utils/memory_allocator.h"
+#include "flexflow/mapper.h"
 
 namespace FlexFlow {
 
@@ -21,14 +22,30 @@ namespace FlexFlow {
 using Legion::coord_t;
 using Legion::Memory;
 using Realm::RegionInstance;
+using namespace Legion;
+using namespace Mapping;
+
+Legion::Logger log_ff_mem_allocator("MemoryAllocator");
 
 MemoryAllocator::MemoryAllocator(Memory _memory)
     : memory(_memory), reserved_ptr(nullptr), instance_ptr(nullptr),
       reserved_total_size(0), reserved_allocated_size(0),
-      instance_total_size(0), instance_allocated_size(0) {}
+      instance_total_size(0), instance_allocated_size(0),
+      log_instance_creation(false) {
+  InputArgs const &command_args = HighLevelRuntime::get_input_args();
+  char **argv = command_args.argv;
+  int argc = command_args.argc;
+  for (int i = 1; i < argc; i++) {
+    if (!strcmp(argv[i], "--log-instance-creation")) {
+      log_instance_creation = true;
+      break;
+    }
+  }
+}
 
 void MemoryAllocator::create_legion_instance(RegionInstance &inst,
-                                             size_t size) {
+                                             size_t size,
+                                             char const *task_name) {
   // Assert that we have used up previously created region instance
   assert(instance_total_size == instance_allocated_size);
   Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0),
@@ -38,6 +55,16 @@ void MemoryAllocator::create_legion_instance(RegionInstance &inst,
   Realm::RegionInstance::create_instance(
       inst, memory, bounds, field_sizes, 0, Realm::ProfilingRequestSet())
       .wait();
+  if (log_instance_creation) {
+    log_ff_mem_allocator.print(
+        "Created instance in memory_kind: %s memory_id: %llx size: %zu "
+        "(capacity %lu) task_name: %s",
+        Legion::Mapping::Utilities::to_string(memory.kind()),
+        memory.id,
+        size,
+        memory.capacity(),
+        ((task_name != NULL) ? task_name : "unknown"));
+  }
   instance_ptr = inst.pointer_untyped(0, 0);
   instance_total_size = size;
   instance_allocated_size = 0;
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index f48fab25b..2a72029c5 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -3393,6 +3393,7 @@ void FFModel::create_operators_from_layers() {
                config.tensor_parallelism_degree > 1 &&
                (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION ||
                 l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION ||
+                l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION ||
                 // mlp layer
                 is_mlp_block(layer_idx) ||
                 // llama mlp layer
@@ -4106,6 +4107,7 @@ struct DefaultConfig {
   static int const epochs = 1;
   // const static int iterations = 1;
   static int const batchSize = 64;
+  static bool const log_instance_creation = false;
   static bool const profiling = false;
   static bool const benchmarking = false;
   static bool const inference_debugging = false;
@@ -4143,6 +4145,7 @@ FFConfig::FFConfig() {
   // iterations = DefaultConfig::iterations;
   batchSize = DefaultConfig::batchSize;
   profiling = DefaultConfig::profiling;
+  log_instance_creation = DefaultConfig::log_instance_creation;
   benchmarking = DefaultConfig::benchmarking;
   inference_debugging = DefaultConfig::inference_debugging;
   learningRate = DefaultConfig::learningRate;
@@ -4330,6 +4333,10 @@ void FFConfig::parse_args(char **argv, int argc) {
       cpusPerNode = atoi(argv[++i]);
       continue;
     }
+    if ((!strcmp(argv[i], "--log-instance-creation"))) {
+      log_instance_creation = true;
+      continue;
+    }
     if (!strcmp(argv[i], "--profiling")) {
       profiling = true;
       continue;
@@ -4534,47 +4541,16 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     }
   }
   {
-    TaskVariantRegistrar registrar(LOAD_FLOAT_WEIGHT_TASK_ID,
-                                   "load_float_weight_task");
+    TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task");
     registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     if (pre_register) {
-      Runtime::preregister_task_variant<FileDataLoader::load_float_weight_task>(
-          registrar, "load_float_weight_task");
+      Runtime::preregister_task_variant<FileDataLoader::load_weight_task>(
+          registrar, "load_weight_task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
       }
-      runtime->register_task_variant<FileDataLoader::load_float_weight_task>(
-          registrar);
-    }
-  }
-  {
-    TaskVariantRegistrar registrar(LOAD_HALF_WEIGHT_TASK_ID,
-                                   "load_half_weight_task");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    if (pre_register) {
-      Runtime::preregister_task_variant<FileDataLoader::load_half_weight_task>(
-          registrar, "load_half_weight_task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<FileDataLoader::load_half_weight_task>(
-          registrar);
-    }
-  }
-  {
-    TaskVariantRegistrar registrar(LOAD_QUANT_WEIGHT_TASK_ID,
-                                   "load_quant_weight_task");
-    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
-    if (pre_register) {
-      Runtime::preregister_task_variant<FileDataLoader::load_quant_weight_task>(
-          registrar, "load_quant_weight_task");
-    } else {
-      if (enable_control_replication) {
-        registrar.global_registration = false;
-      }
-      runtime->register_task_variant<FileDataLoader::load_quant_weight_task>(
+      runtime->register_task_variant<FileDataLoader::load_weight_task>(
           registrar);
     }
   }
@@ -6476,7 +6452,6 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
-    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<OpMeta *, FusedOp::init_task>(
           registrar, "FusedOp Init Task");
@@ -6715,7 +6690,6 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init");
     registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC));
     registrar.set_leaf();
-    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<OpMeta *, AllReduce::init_task>(
           registrar, "AllReduce init Task");
@@ -6767,7 +6741,6 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     // AllReduce forward and backward must run concurrentluy since they
     // use ncclAllReduce internally
-    registrar.set_concurrent();
     if (pre_register) {
       Runtime::preregister_task_variant<AllReduce::backward_task>(
           registrar, "AllReduce Backward Task");
diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp
index 59efaf525..a33ee35de 100644
--- a/src/runtime/optimizer_kernel.cpp
+++ b/src/runtime/optimizer_kernel.cpp
@@ -86,7 +86,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
+__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx,
+                                                 Legion::Runtime *runtime,
+                                                 SGDOptimizer const *op,
                                                  OpMeta const *meta,
                                                  float const *w_grad_ptr,
                                                  size_t size,
@@ -96,6 +98,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
   // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(w_grad_ptr,
                           (float *)w_grad_ptr,
                           size,
@@ -103,6 +106,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
                           ncclSum,
                           meta->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
   // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
 
   // Step 2: SGD update
@@ -208,7 +212,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
+__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx,
+                                                  Legion::Runtime *runtime,
+                                                  AdamOptimizer const *op,
                                                   OpMeta const *meta,
                                                   float const *w_grad_ptr,
                                                   size_t size,
@@ -218,6 +224,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
   // Use NCCL to sync gradients
   hipStream_t stream;
   checkCUDA(get_legion_stream(&stream));
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(w_grad_ptr,
                           (float *)w_grad_ptr,
                           size,
@@ -225,6 +232,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
                           ncclSum,
                           meta->handle.ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   //         op->alpha, op->alpha_t, op->weight_decay);
   //  Step 2: Adam update
diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu
index 72ee74940..6bc3d52b2 100644
--- a/src/runtime/optimizer_kernel.cu
+++ b/src/runtime/optimizer_kernel.cu
@@ -75,8 +75,8 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx,
-                                                 Runtime *runtime,
+__host__ void SGDOptimizer::nccl_update_task_gpu(Legion::Context ctx,
+                                                 Legion::Runtime *runtime,
                                                  SGDOptimizer const *op,
                                                  OpMeta const *meta,
                                                  float const *w_grad_ptr,
@@ -187,8 +187,8 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
 }
 
 #ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx,
-                                                  Runtime *runtime,
+__host__ void AdamOptimizer::nccl_update_task_gpu(Legion::Context ctx,
+                                                  Legion::Runtime *runtime,
                                                   AdamOptimizer const *op,
                                                   OpMeta const *meta,
                                                   float const *w_grad_ptr,
diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc
index 8f1be15fd..202983e8f 100644
--- a/src/runtime/parallel_tensor.cc
+++ b/src/runtime/parallel_tensor.cc
@@ -1,4 +1,5 @@
 #include "flexflow/ffconst_utils.h"
+#include "flexflow/mapper.h"
 #include "flexflow/model.h"
 #include "flexflow/ops/attention.h"
 #include "flexflow/ops/concat.h"
@@ -19,6 +20,9 @@
 namespace FlexFlow {
 
 using namespace Legion;
+using namespace Legion;
+using namespace Mapping;
+Legion::Logger pt_logger("ParallelTensor");
 
 TensorBase::TensorBase(TensorBase const &rhs) {
   tensor_guid = rhs.tensor_guid;
@@ -647,11 +651,41 @@ bool ParallelTensorBase::is_valid_machine_view(MachineView const &view) const {
   return true;
 }
 
+size_t get_physical_region_size(PhysicalRegion const &pr,
+                                Context ctx,
+                                Runtime *runtime) {
+  // Get the logical region
+  LogicalRegion lr = pr.get_logical_region();
+
+  // Get the index space domain
+  Domain domain = runtime->get_index_space_domain(ctx, lr.get_index_space());
+
+  // Get number of elements in the domain
+  size_t num_elements = domain.get_volume();
+
+  // Get the field space
+  FieldSpace fs = lr.get_field_space();
+
+  // Get all fields in the field space
+  std::vector<FieldID> fields;
+  runtime->get_field_space_fields(ctx, fs, fields);
+
+  // Sum up the size of all fields
+  size_t total_field_size = 0;
+  for (FieldID fid : fields) {
+    size_t field_size = runtime->get_field_size(ctx, fs, fid);
+    total_field_size += field_size;
+  }
+
+  // Total size is number of elements times size of each element
+  return num_elements * total_field_size;
+}
+
 template <typename T>
 bool ParallelTensorBase::set_tensor(FFModel const *ff,
                                     std::vector<int> const &dim_sizes,
                                     T const *data) {
-  Context ctx = ff->config.lg_ctx;
+  Context ctx = Legion::Runtime::get_context();
   Runtime *runtime = ff->config.lg_hlr;
   // TODO: check data type matches
   // TODO: Currently we use a task launch, change to index launch for NCCL
@@ -678,6 +712,28 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff,
   InlineLauncher launcher(req);
   PhysicalRegion pr = runtime->map_region(ctx, launcher);
   pr.wait_until_valid();
+
+  if (ff->config.log_instance_creation) {
+    size_t pr_size = get_physical_region_size(pr, ctx, runtime);
+    if (pr_size != volume * num_replicas * sizeof(T)) {
+      std::cout << "Physical region size: " << pr_size << std::endl;
+      std::cout << "Volume: " << volume << std::endl;
+      std::cout << "Num replicas: " << num_replicas << std::endl;
+      std::cout << "Size of T: " << sizeof(T) << std::endl;
+    }
+    assert(pr_size == volume * num_replicas * sizeof(T));
+    std::set<Memory> memories;
+    pr.get_memories(memories);
+    assert(memories.size() == 1);
+    Memory memory = *(memories.begin());
+    pt_logger.print("Created instance in memory_kind: %s memory_id: %llx size: "
+                    "%zu (capacity %lu) task_name: set_tensor",
+                    Legion::Mapping::Utilities::to_string(memory.kind()),
+                    memory.id,
+                    pr_size,
+                    memory.capacity());
+  }
+
   switch (num_dims) {
 #define DIMFUNC(DIM)                                                           \
   case DIM: {                                                                  \
@@ -704,7 +760,7 @@ template <typename T>
 bool ParallelTensorBase::get_tensor(FFModel const *ff,
                                     T *data,
                                     bool get_gradients) {
-  Context ctx = ff->config.lg_ctx;
+  Context ctx = Legion::Runtime::get_context();
   Runtime *runtime = ff->config.lg_hlr;
   LogicalRegion weight_lr = LogicalRegion::NO_REGION;
   if (sync_type == ParameterSyncType::PS) {
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 55ee6ea5e..734855fa5 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -35,6 +35,7 @@ namespace FlexFlow {
 
 using namespace Legion;
 using tokenizers::Tokenizer;
+using RequestGuid = BatchConfig::RequestGuid;
 
 Legion::Logger log_req_mgr("RequestManager");
 
@@ -263,6 +264,19 @@ void RequestManager::set_max_tree_width(int max_tree_width) {
   }
 }
 
+int RequestManager::get_expansion_degree() {
+  assert(expansion_degree > 0 and
+         expansion_degree <= BatchConfig::MAX_TREE_WIDTH and
+         "Invalid expansion_degree");
+  return expansion_degree;
+}
+void RequestManager::set_expansion_degree(int expansion_degree_) {
+  assert(expansion_degree > 0 and
+         expansion_degree <= BatchConfig::MAX_TREE_WIDTH and
+         "Invalid expansion_degree");
+  this->expansion_degree = expansion_degree_;
+}
+
 void RequestManager::set_speculative_sampling(bool speculative_sampling_) {
   speculative_sampling = speculative_sampling_;
 }
@@ -350,6 +364,8 @@ double RequestManager::get_request_expected_latency(Request &request) {
 }
 
 Request &RequestManager::get_request_with_guid(RequestGuid guid) {
+  assert(all_requests.find(guid) != all_requests.end() &&
+         "Request with the given GUID does not exist.");
   return all_requests[guid];
 }
 
@@ -384,11 +400,11 @@ bool RequestManager::SharedTokenTreeNodePtrDoubleRequestGuidLess ::operator()(
 
 void RequestManager::register_tokenizer(ModelType type,
                                         int bos_token_id,
-                                        int eos_token_id,
+                                        std::vector<int> eos_token_ids,
                                         std::string const &path) {
   this->model_type = type;
   this->bos_token_id = bos_token_id;
-  this->eos_token_id = eos_token_id;
+  this->eos_token_ids = eos_token_ids;
   std::filesystem::path tokenizer_folder(path);
 
   if (model_type == ModelType::LLAMA) {
@@ -472,6 +488,7 @@ size_t RequestManager::get_num_ssms() {
   return ssm_models.size();
 }
 
+
 RequestManager::RequestGuid
     RequestManager::register_new_request(GenerationRequest const &req) {
   // Add a new request
@@ -484,19 +501,19 @@ RequestManager::RequestGuid
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
-  for (int i = 0; i < tokens.size(); i++) {
-    std::cout << "[" << i << "]" << tokens.at(i) << "\n";
-  }
-  std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
+  // for (int i = 0; i < tokens.size(); i++) {
+  //   std::cout << "[" << i << "]" << tokens.at(i) << "\n";
+  // }
+  // std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
   request.set_slo_ratio(req.slo_ratio);
 
   if (get_num_ssms() == 0) {
-    std::cout << "No small speculative model registered, using incremental "
-                 "decoding."
-              << std::endl;
+    // std::cout << "No small speculative model registered, using incremental "
+    //  "decoding."
+    // << std::endl;
   } else {
-    std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
+    // std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
   }
@@ -515,6 +532,15 @@ RequestManager::RequestGuid
   gr.slo_ratio = req.slo_ratio;
   gr.emission_time_ms = req.emission_time_ms;
 
+  // Record time when request was enqueued
+  // Step idx -2: enqueueing; step idx -1: prefilling begins, step idx 0:
+  // prefilling finished
+  NewProfileInfo new_profile_info;
+  new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds();
+  new_profile_info.request_guid = request.guid;
+  new_profile_info.request_step_idx = -2;
+  new_profiling_info.push_back(new_profile_info);
+
   {
     std::lock_guard<std::mutex> const lock(request_queue_mutex);
     pending_request_queue.push(request);
@@ -531,13 +557,13 @@ RequestManager::RequestGuid
   }
 
   {
-    std::string output = "New request tokens:";
-    output = "[" + std::to_string(request.guid) + "] " + output;
-    for (int i = 0; i < request.tokens.size(); i++) {
-      output = output + " " + std::to_string(request.tokens[i]);
-    }
-    log_req_mgr.print("%s", output.c_str());
-    write_to_output_file("", output);
+    // std::string output = "New request tokens:";
+    // output = "[" + std::to_string(request.guid) + "] " + output;
+    // for (int i = 0; i < request.tokens.size(); i++) {
+    //   output = output + " " + std::to_string(request.tokens[i]);
+    // }
+    // log_req_mgr.print("%s", output.c_str());
+    // write_to_output_file("", output);
   }
 
   return request.guid;
@@ -580,6 +606,24 @@ int RequestManager::get_empty_request_index() {
   return -1;
 }
 
+std::unordered_map<RequestGuid, RequestProfileInfo>
+    RequestManager::get_requests_profiling() {
+  return profiling_requests;
+}
+
+std::unordered_map<RequestGuid, GenerationResult>
+    RequestManager::get_request_generation_results() {
+  return request_generation_results;
+}
+
+ProfileInfo RequestManager::get_profiling_info() {
+  return profiling;
+}
+
+std::vector<NewProfileInfo> RequestManager::get_new_profiling_info() {
+  return new_profiling_info;
+}
+
 BatchConfigFuture RequestManager::get_next_batch_config(
     InferenceResultFuture const &result, Context ctx, Runtime *runtime) {
   RequestManager *rm = this;
@@ -682,6 +726,20 @@ void RequestManager::request_update_attainment(int batch_index, bool attained) {
   request.attained &= attained;
 }
 
+bool isPrefixAndRemove(std::vector<int> const &prefix, std::vector<int> &vec) {
+  if (prefix.size() > vec.size()) {
+    return false;
+  }
+
+  if (std::equal(prefix.begin(), prefix.end(), vec.begin())) {
+    vec.erase(vec.begin(), vec.begin() + prefix.size());
+    return true;
+  }
+
+  return false;
+}
+
+
 void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
   profiling_requests[guid].finish_time =
@@ -694,35 +752,48 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   request.status = Request::COMPLETED;
 
   // Find the sos and eos in the sequence
-  auto bos_it = std::find(
-      request.tokens.begin(), request.tokens.end(), this->bos_token_id);
-  auto eos_rit = std::find(
-      request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id);
-  std::vector<int>::iterator eos_it;
-  if (eos_rit != request.tokens.rend()) {
-    eos_it = eos_rit.base();
-  } else {
-    eos_it = request.tokens.end();
-  }
+  // auto bos_it = std::find(
+  //     request.tokens.begin(), request.tokens.end(), this->bos_token_id);
+  // auto eos_rit = std::find(
+  //     request.tokens.rbegin(), request.tokens.rend(), this->eos_token_id);
+  // std::vector<int>::iterator eos_it;
+  // if (eos_rit != request.tokens.rend()) {
+  //   eos_it = eos_rit.base();
+  // } else {
+  //   eos_it = request.tokens.end();
+  // }
   // std::string output =
   //     this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
   std::string output = this->tokenizer_->Decode(request.tokens);
 
   {
     std::lock_guard<std::mutex> const lock(request_result_mutex);
-    request_generation_results[guid].output_text = output;
-    request_generation_results[guid].output_tokens =
-        std::vector<int>(bos_it, eos_it);
+    request_generation_results[guid].output_tokens = request.tokens;
+    assert(isPrefixAndRemove(request_generation_results[guid].input_tokens,
+                             request_generation_results[guid].output_tokens));
+    if (request_generation_results[guid].output_tokens.size() > 0 &&
+        is_eos_token(
+            request_generation_results[guid].output_tokens
+                [request_generation_results[guid].output_tokens.size() - 1]) &&
+        !request.add_special_tokens) {
+      request_generation_results[guid].output_tokens.pop_back();
+    }
+    request_generation_results[guid].output_text = this->tokenizer_->Decode(
+        request_generation_results[guid].output_tokens);
+    request_generation_results[guid].decoding_steps =
+        profiling_requests[guid].llm_decoding_steps;
+    // request_generation_results[guid].output_tokens =
+    //     std::vector<int>(bos_it, eos_it);
   }
 
   trigger_request_completion_future(guid);
 
-  std::cout << "Request " << guid << " completed: " << std::endl << std::endl;
-  std::cout << "<bos>" << output;
-  if (eos_rit != request.tokens.rend()) {
-    std::cout << "<eos>";
-  }
-  std::cout << std::endl << std::endl;
+  std::cout << "Request " << guid << " completed" << std::endl;
+  // std::cout << "<bos>" << output;
+  // if (eos_rit != request.tokens.rend()) {
+  //   std::cout << "<eos>";
+  // }
+  // std::cout << std::endl << std::endl;
   {
     RequestProfileInfo profile_info = profiling_requests[guid];
 
@@ -756,7 +827,8 @@ void RequestManager::request_complete_clean_up(int batch_index) {
       *os << "SSM decoding steps: " << profile_info.ssm_decoding_steps
           << std::endl;
     }
-    *os << output << std::endl << std::endl;
+    *os << std::endl;
+    // *os << output << std::endl << std::endl;
 
     if (!output_filepath.empty()) {
       output_file.close();
@@ -934,23 +1006,23 @@ bool RequestManager::update_llm_prefill_results(InferenceResult const &result) {
         request->tokens.push_back(
             result.token_ids[num_tokens + request->num_tokens_in_batch - 1]);
 
-        if (request->tokens.back() == eos_token_id) {
+        if (is_eos_token(request->tokens.back())) {
           request_complete_clean_up(request->batch_index);
         } else {
           // Temporarily offload request from the batch
           request_offload_from_batch(request->batch_index);
           prefilled_requests.push(request);
-        }
 
-        if (decoding_mode == SPECULATIVE_DECODING) {
-          // Add the last token to the token tree
-          assert(request->committed_tokens.empty() &&
-                 "The committed tokens should be empty.");
-          request->committed_tokens.push_back(Request::CommittedToken{
-              -1, (int)request->tokens.size() - 1, request->tokens.back()});
-          init_token_tree(request->guid);
-          add_root_to_spec_token_tree(request->guid, request->tokens.back());
-          update_bitmask_prompt(request->guid, 1);
+          if (decoding_mode == SPECULATIVE_DECODING) {
+            // Add the last token to the token tree
+            assert(request->committed_tokens.empty() &&
+                   "The committed tokens should be empty.");
+            request->committed_tokens.push_back(Request::CommittedToken{
+                -1, (int)request->tokens.size() - 1, request->tokens.back()});
+            init_token_tree(request->guid);
+            add_root_to_spec_token_tree(request->guid, request->tokens.back());
+            update_bitmask_prompt(request->guid, 1);
+          }
         }
       } else {
         // Next phase will still be prefilling
@@ -999,7 +1071,16 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
         request.decode_latency_ms <= get_request_expected_latency(request);
     profiling_requests[guid].llm_decoding_steps++;
     nb_requests_decoded++;
-    if (request.tokens.back() == eos_token_id or
+
+    NewProfileInfo new_profile_info;
+    new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds();
+    new_profile_info.request_guid = guid;
+    new_profile_info.request_step_idx =
+        profiling_requests[guid].llm_decoding_steps - 1;
+    new_profile_info.num_generated_tokens = 1;
+    new_profiling_info.push_back(new_profile_info);
+
+    if (is_eos_token(request.tokens.back()) or
         request.decode_length() >= get_max_output_length() or
         request.tokens.size() >= get_max_sequence_length()) {
       request_update_attainment(request_index, attained);
@@ -1148,6 +1229,18 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
+
+    // Record prefilling start time. We don't do this for speculative decoding,
+    // because in that case we start the timer in the ssm prefilling Step idx
+    // -2: enqueueing; step idx -1: prefilling begins, step idx 0: prefilling
+    // finished
+    if (decoding_mode == INCREMENTAL_DECODING) {
+      NewProfileInfo new_profile_info;
+      new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds();
+      new_profile_info.request_guid = request->guid;
+      new_profile_info.request_step_idx = -1;
+      new_profiling_info.push_back(new_profile_info);
+    }
   }
   bc.num_tokens = num_tokens;
 
@@ -1212,6 +1305,15 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
+
+    // Record prefilling start time
+    // Step idx -2: enqueueing; step idx -1: prefilling begins, step idx 0:
+    // prefilling finished
+    NewProfileInfo new_profile_info;
+    new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds();
+    new_profile_info.request_guid = request->guid;
+    new_profile_info.request_step_idx = -1;
+    new_profiling_info.push_back(new_profile_info);
   }
   bc.num_tokens = num_tokens;
 
@@ -1580,6 +1682,12 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       }
       layer_index++;
     }
+    if (verbose) {
+      // print token tree
+      std::cout << "Token tree for request " << request_index << ": "
+                << std::endl;
+      std::cout << token_tree << std::endl;
+    }
     new_bc.requestsInfo[request_index].num_tokens_in_batch = token_tree_index;
 
     request.first_token_offset_in_batch = new_bc.num_tokens - token_tree_index;
@@ -1601,6 +1709,23 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
   return new_bc;
 }
 
+int get_tree_size(Request const &request) {
+  int size = 0;
+  for (auto &layer : request.speculative_token_trees[0].tree_layers) {
+    size += (int)layer.size();
+  }
+  return size;
+}
+
+bool RequestManager::is_eos_token(TokenId token_id) {
+  for (int eos_token : eos_token_ids) {
+    if (token_id == eos_token) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool RequestManager::update_llm_verify_results(
     InferenceResult const &llm_verify_result) {
   // We may have two types of InferenceResults, one is the results from
@@ -1684,7 +1809,7 @@ bool RequestManager::update_llm_verify_results(
     // metainfo stored in the RequestManager. Otherwise, update its bitmask.
     bool eos_token_found = false;
     for (auto const &committed_token : request.committed_tokens) {
-      if (committed_token.token_id == eos_token_id) {
+      if (is_eos_token(committed_token.token_id)) {
         eos_token_found = true;
         break;
       }
@@ -1751,6 +1876,14 @@ bool RequestManager::update_ssm_inference_results(
     append_bitmask(guid);
 
     profiling_requests[guid].ssm_decoding_steps++;
+
+    if (current_ssm_step == ssm_tree_depth) {
+      assert(profiling_requests[guid].ssm_decoding_steps % ssm_tree_depth == 0);
+      profiling_requests[guid].speculation_start_timestamp =
+          profiling.ssm_step_start;
+      profiling_requests[guid].speculation_end_timestamp =
+          Realm::Clock::current_time_in_microseconds();
+    }
   }
 
   // Stop conditions
@@ -2121,7 +2254,7 @@ void RequestManager::get_verify_results_sample(
       }
       std::cout << std::endl;
       std::string output = this->tokenizer_->Decode(request.tokens);
-      std::cout << "Output sequence: " << output << std::endl;
+      // std::cout << "Output sequence: " << output << std::endl;
     }
   }
 }
@@ -2160,6 +2293,7 @@ void RequestManager::get_verify_results_greedy(
 
     int current_token_index = 1; // Because we skip the root
                                  // We skip the first layer
+    bool found_eos = false;
     for (auto layer_it = token_tree.tree_layers.begin() + 1;
          layer_it != token_tree.tree_layers.end();
          ++layer_it) {
@@ -2202,31 +2336,68 @@ void RequestManager::get_verify_results_greedy(
             last_accepted_token_index = current_token_index;
             last_accepted_token_index_in_layer = current_token_index_in_layer;
             committed_token_index++;
+            if (is_eos_token(node_ptr->id)) {
+              found_eos = true;
+            }
           }
           current_token_index++;
           current_token_index_in_layer++;
         }
+        if (found_eos) {
+          break;
+        }
       }
       if (!token_accepted_this_layer) {
         // No token is accepted in this layer, we should stop the traversal
         break;
       }
+      if (found_eos) {
+        break;
+      }
     }
 
     // Add the last token (that is not verified by the LLM)
     // from_index: since this token is not in the token tree, the llm
     // doesn't have its KV cache, so the from_index should be a place
     // holder, which is -1
-    request.committed_tokens.push_back(Request::CommittedToken(
-        -1,
-        committed_token_index,
-        llm_verify_result
-            .token_ids[llm_result_offset + last_accepted_token_index]));
-    request.tokens.push_back(
-        llm_verify_result
-            .token_ids[llm_result_offset + last_accepted_token_index]);
+    if (!found_eos) {
+      request.committed_tokens.push_back(Request::CommittedToken(
+          -1,
+          committed_token_index,
+          llm_verify_result
+              .token_ids[llm_result_offset + last_accepted_token_index]));
+      request.tokens.push_back(
+          llm_verify_result
+              .token_ids[llm_result_offset + last_accepted_token_index]);
+    }
+
+    assert(request.committed_tokens.size() >= 2);
+    int nb_generated_tokens = (int)request.committed_tokens.size() -
+                              1; // exclude previous bonus token
+    int accepted_tokens = (int)request.committed_tokens.size() -
+                          1; // exclude previous bonus token
+    if (!found_eos) {
+      accepted_tokens--; // exclude the last bonus token (if we found eos, we
+                         // don't add it)
+    }
+    total_nb_generated_tokens += nb_generated_tokens;
+
+    NewProfileInfo new_profile_info;
+    new_profile_info.timestamp = Realm::Clock::current_time_in_microseconds();
+    new_profile_info.request_guid = guid;
+    new_profile_info.request_step_idx =
+        profiling_requests[guid].llm_decoding_steps -
+        1; // check if this has already been incremented
+    new_profile_info.num_speculated_tokens = get_tree_size(request);
+    new_profile_info.num_accepted_tokens = accepted_tokens;
+    new_profile_info.speculation_score = -1.0;
+    new_profile_info.num_generated_tokens = nb_generated_tokens;
+    new_profile_info.speculation_start_timestamp =
+        profiling_requests[guid].speculation_start_timestamp;
+    new_profile_info.speculation_end_timestamp =
+        profiling_requests[guid].speculation_end_timestamp;
+    new_profiling_info.push_back(new_profile_info);
 
-    total_nb_generated_tokens += request.committed_tokens.size() - 1;
     if (verbose) {
       std::cout << "Request " << request.guid << " committed tokens: ";
       for (auto const &committed_token : request.committed_tokens) {
@@ -2255,10 +2426,10 @@ std::vector<GenerationResult>
   for (size_t i = 0; i < requests.size(); i++) {
     requests[i].slo_ratio = emission_machine.sample_slo_ratio();
     requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms();
-    printf("Prompt[%ld] with slo %.3f: %s\n",
-           i,
-           requests[i].slo_ratio,
-           requests[i].prompt.c_str());
+    // printf("Prompt[%ld] with slo %.3f: %s\n",
+    //        i,
+    //        requests[i].slo_ratio,
+    //        requests[i].prompt.c_str());
     RequestManager::RequestGuid guid = rm->register_new_request(requests[i]);
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
@@ -2545,8 +2716,9 @@ void RequestManager::terminate_background_server() {
     // Write the last profiling statistics to output file
     std::string str = "[Profiling Statistics]";
 
-    long long total_time = Realm::Clock::current_time_in_microseconds() -
-                           profiling.server_start_time;
+    profiling.server_end_time = Realm::Clock::current_time_in_microseconds();
+    long long total_time =
+        profiling.server_end_time - profiling.server_start_time;
     int total_requests = 0;
     for (auto const &profiling_info : profiling_requests) {
       int request_id = profiling_info.first;
@@ -2709,6 +2881,25 @@ void RequestManager::terminate_background_server() {
     goodput_str += ")";
     str += goodput_str;
 
+    if (profiling_requests.size() != all_requests.size()) {
+      std::cerr << "profiling_requests.size()=" << profiling_requests.size()
+                << " != all_requests.size()=" << all_requests.size()
+                << std::endl;
+    }
+    assert(profiling_requests.size() == all_requests.size());
+    str += "\nDecoding Steps: ";
+    for (auto const &profiling_info : profiling_requests) {
+      int request_id = profiling_info.first;
+      Request &request = all_requests[request_id];
+      str += "Request " + std::to_string(request_id) + ": ";
+      str += std::to_string(profiling_info.second.llm_decoding_steps);
+      str += "/";
+      str += std::to_string(request.decode_length());
+      float speedup = (float)request.decode_length() /
+                      profiling_info.second.llm_decoding_steps;
+      str += " " + std::to_string(speedup) + "\n";
+    }
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     request_queue_cv.notify_all();
@@ -2854,7 +3045,8 @@ void RequestManager::add_tokens_to_spec_token_tree(
 void RequestManager::add_tokens_to_spec_token_tree_old_version(
     InferenceResult const &ssm_inference_result) {
 
-  std::vector<int> tree_width_vector = {1, 1, 3, 1, 1, 1, 1, 1};
+  std::vector<int> tree_width_vector = {
+      1, 1, this->expansion_degree, 1, 1, 1, 1, 1};
 
   int expand_width = tree_width_vector[current_ssm_step - 1];
 
diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu
index cd6cc0db4..83b0385a3 100644
--- a/src/utils/communication_buffer.cu
+++ b/src/utils/communication_buffer.cu
@@ -23,7 +23,9 @@
 // For the i-th pointer, if i is the worker id of the given device,
 // then the returned i-th ptr_group is the local pointer,
 // or otherwise it is an peer memory pointer from the remote device.
-std::vector<void *> create_peer_ptr_group(int num_devices,
+std::vector<void *> create_peer_ptr_group(Legion::Context ctx,
+                                          Legion::Runtime *runtime,
+                                          int num_devices,
                                           int device_id,
                                           ncclComm_t ncclComm,
                                           void *allgather_src,
@@ -46,12 +48,14 @@ std::vector<void *> create_peer_ptr_group(int num_devices,
                             cudaMemcpyHostToDevice,
                             stream));
 
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllGather(allgather_src,
                           allgather_dst,
                           sizeof(void *),
                           ncclChar,
                           ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
 
   std::vector<void *> peer_pointers(num_devices);
   checkCUDA(cudaMemcpyAsync(peer_pointers.data(),
@@ -85,7 +89,9 @@ void free_peer_ptr_group(std::vector<void *> ptr_group,
 // all-gathering peer pointers across devices. The size of allgather_src should
 // be sizeof(void*), and the size of allgather_dst should be sizeof(void*) *
 // num_devices.
-CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
+                                                    Legion::Runtime *runtime,
+                                                    int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
@@ -100,21 +106,27 @@ CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
   comm_buf->num_devices = num_devices;
   comm_buf->device_id = device_id;
   comm_buf->local_ptr = local_ptr;
-  comm_buf->comm_ptrs = create_peer_ptr_group(num_devices,
+  comm_buf->comm_ptrs = create_peer_ptr_group(ctx,
+                                              runtime,
+                                              num_devices,
                                               device_id,
                                               ncclComm,
                                               allgather_src,
                                               allgather_dst,
                                               local_ptr,
                                               stream);
-  comm_buf->barrier_in = create_peer_ptr_group(num_devices,
+  comm_buf->barrier_in = create_peer_ptr_group(ctx,
+                                               runtime,
+                                               num_devices,
                                                device_id,
                                                ncclComm,
                                                allgather_src,
                                                allgather_dst,
                                                barrier_in_ptr,
                                                stream);
-  comm_buf->barrier_out = create_peer_ptr_group(num_devices,
+  comm_buf->barrier_out = create_peer_ptr_group(ctx,
+                                                runtime,
+                                                num_devices,
                                                 device_id,
                                                 ncclComm,
                                                 allgather_src,
diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py
index 6857b5cbc..8fa17f153 100644
--- a/tests/inference/huggingface_inference.py
+++ b/tests/inference/huggingface_inference.py
@@ -87,7 +87,7 @@ def main():
     # Get Tokenizer
     hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True)
     hf_arch = getattr(hf_config, "architectures")[0]
-    if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM":
+    if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM" or hf_arch == "MistralForCausalLM":
         tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True)
     else:
         tokenizer = AutoTokenizer.from_pretrained(args.model_name)

From 2990c88332d94872a93ebe7509ffbde13f3ee36b Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Fri, 15 Nov 2024 17:28:38 +0000
Subject: [PATCH 629/667] cleanup

---
 benchmarking/average_accepted_tokens.pdf      | Bin 15738 -> 15767 bytes
 benchmarking/benchmark_incr_dec.sh            |   4 +-
 benchmarking/benchmark_specinfer.sh           |   2 +-
 benchmarking/plot_results.ipynb               | 176 +++++++++---------
 .../queueing_time_vs_arrival_rate.pdf         | Bin 18042 -> 18567 bytes
 benchmarking/throughput_vs_tpot.pdf           | Bin 28243 -> 28221 bytes
 benchmarking/ttft_vs_arrival_rate.pdf         | Bin 16898 -> 17427 bytes
 .../inc_multihead_self_attention_kernels.cu   |   8 +-
 src/parallel_ops/kernels/allreduce_kernels.cu |   2 +-
 src/runtime/request_manager.cc                |   2 -
 10 files changed, 96 insertions(+), 98 deletions(-)

diff --git a/benchmarking/average_accepted_tokens.pdf b/benchmarking/average_accepted_tokens.pdf
index 896519d5d47abe1cf317cf20a46c0599432e6e0f..717e6e68a796d784a518772660ac3d1326e9d1bc 100644
GIT binary patch
delta 2378
zcmZuxd0dQn9Bv$8XdP)os);Ttz4tqB5u2T2x>FOiR(A@uQjSqx71k!SW~Meo5kfI(
zbWGdj)}>35RFdhWYf)`!r(JB#>-YZiexA?sd4A9L`97cD5I1(eP@bkkH?;Ed^mQ{0
z*voKX0MZW!!4&`uLMQ`c5-SKClm2aH@HA+iWa-8i{;#n)p{-6m6*RxocZ;LzGqYk7
zW?m`4<mo|av#(xYJ}<IvW~wA?X-rI6=c%s;Mv76_0;9`iC1+pFGUTZj1!S@YJD>AO
zvQYcwhTca9$xGeg7M}!pPZkd>3{|$QtU5)=X`s3mHjaIVOpZ21A8<sVP_bLrX)EjB
z$mZW02oFn@#Ew0TEQxY2+hABv#I{B3nK>+%@cAjFD|^%e?=hlPE*^Fa+1%GS5NSc9
z>vJ7WESW2R=A_3x-PDh47o`Di8mUR5r_vZIU}XV+N|<q2aI&f4N~7o#O)(m4JIhUa
zM=_>?SZ9Y>+&NKjvZL<&igil;HTIrl&*+ks*X{m@?<pUM=W}L6&N+>6W~_}=TU6iT
zaNpE!c&U=AQbNygT=N}<MnX`7Et+@D$zzYb(6U^#u9o%I!sADKL8h7e4zm_3lj+V`
z+D6zVPH%}?a!c6Bu!d=s4~7_N8T#SaX^~;fb|W9&`GXNH{hi|2lAE<YCPO0<vfY1Q
z&hiK89fA9Z8jsXBgEL0&w5mTpx$8rF<5O$JV*kgx+Y;+?@8c=)yBR-Sj9=nD?SO5L
zS9bPdr`!I=xu(b6j6UsqpS5YV(wU2ow?=fdE2wglqXK#rinmTK%^|gyMup}_&DM8b
zVD3`4NaJcww1C0yuZSNpE9<^d)Umgp92&9c%DEh@9v@@%9h4EfT?7R$Jh}-Ect3ex
z$Z`I!Sqo^AC0adU%e@YzS`v*1t&0e^NxFeAr=kf;i?`?<=WO-3^dzaHe6oPu?wkl^
z(BhGB;Sp>u$7MBgrMfp5(w@Hd?j&zhsRUap;UJaZ9El0wSBJ0HZ<QBSk`JF1`E|?`
z`N%yIv708}E}HI=8kk2*-D;sbM^If;80{b#u-9O^6><eO*Qc1+UW*b6mba=z(|yft
z@+T>6D@f#c^|O%YE9lD2kqH9Do&N7qUq1@u-bqz_&o&<7+%Qzu8hztCXr3+E^X%*=
z&CG3?W-PmgfFeB;ohWPjxup2-eogJ8`a-*Gt|nP~Y&EEO>uH;+o^LmeAJpyQ7Q6qu
zG-O!lD1O)`mzm;GCsFGtF}yY3&pB#-O^$Mqb3*9tP`g!?Rb_c)j`<&S!viZj2iC&{
za<4xn51Qj~Lie2D$n2(%_X5+Fbyp=Nm2tI#b|tM@*4FiBqf=loW=xJ2_@pbEaM*G?
zs$BO=RN~L^&hPB&h@Gwe^vo)q#)h{Qan_4cbaD&L+DA+*4e|R5`B@{^Lxu}V_takz
z@iwShr004j`ex?3yUEvyla@*{ptdd+Pt9?|F9tu)ea8}VD%f#Nu4|3z7V^!!bk~<h
z8ElN)5D^mh!@}){|EeZP)J9>RSoJLTSvYGRdnU(n+N+3})f{8K(BIWVw=k%<Hf*z;
zWBBeuZEvH^Ln4Fb!H~KJ>)emxggoWz`+fW5Rq|W`@%x7I`kRW!j5i8g9WE$%bA=LT
zgKd4$q5;o$MtgU7lNaB0UPzy!=X9*pDx)VYpEm2o8XDcp-*Z<GFH>G&79@S50^I%R
zAP5ubQ{-)-sW^nd0B;l1oJ27OFm@RJ4En!6AOe5{6+=M^1`;p@gFu9e!Ke%-y$Jgc
z<Eax0-y>3kW9O>}07P+|XG&;)!ybfSxC|YHAWZrUPmlmX9F~;@VHA?(Nv|Sg^#Bn6
ztVfRX1OZec1W^Sc7?sThfKZT<0fb?)SphJBGsh1?j?Dx^pzM+W7{;g=j!-ZJz$j)A
z7@Kg(*c}K8Qh69kQb7Wx#Dzk*Y-0cvN2ol4(l!8tCQdwN21Ag~$&mk70%M{S*Em2>
zV#OgywyFRQf3DlOb>iRX0*=8G2N}yBCvZw>m=HjTnu$>R!NgG65&}$ux_1I#Qqm$|
z2q4FoVJsQ~hp0p*HqrGMk3eNH5CADDfYLq5rV0WC@Wu4{udf0KA{0RoB4ksL(!T8O
Y>*_!R1~K09Rpv7=oUWzCvN5Co12G;ijQ{`u

delta 2358
zcmZuxdpJ~i7-p4+Y28vpr5QCI+Hh`jE^MWYNVz1p&_!-dxszpvPPIghZaXwROL9qA
z($GBy8=BHZNF|lFT#7VBB3kWe+ctAL=bzv6KJWRy_j|wh{Dizwu1HyNHr?DIz@Ov0
zJS>*(O$THzjv{mxhSJy51g47qWc|_h(C4V;9_fb}mC-x*b-%eO=JQl8+tGskvaR?V
zP8@jVzpdvfUSV~-YN?4$?)mfgmSx=CsR0`$*B*rDxU@)noiP4rk3jF!`%DKO=k2u=
zvW|D1`6*>M$noP8DN}Ih-Xi>0soVJZCw~cpt554yY;N&G64-q06;OCV5*hQNORQU(
z?taQYUz=gITFsK-9rrXLsK-!$c3zTO{Mx~dBc{ULGuG*54(RNT=)Yn8rg=~HS+~Ps
zPv8A%s#-gwo_7g2R=c15FlAPVXbEvFL&LRgNaJI5L{`WV^P?x#tD+v)Ig3IU%uNS6
z4J$pnt}4}kdU0{SiR(P0eV19D<hkEe-8)CN{vPvem!s1`H=132$NUtnJE9T2<7W2z
z97i{47S)b;%&^?i=iYloaI6RPE}qVD@wCacI@4}7gAsQt@Y<=ySBmSbD-F+j*sF#l
z&;#PScBed?>g-+87JU2UQq68jj|VZ<EqsM7`7kv4GBhEoqR`Ppw~ue{;dO&NGk@Ij
zcy&-mV_UMvuFa_n%kvIzm=aL1OYEFlc4dh*!^N(=(6*?guixfe1KZ1Nxf@cvqcY)9
zXqHa<edWX#uLFfUs#R_;wUADIuE`kh*qyWRmGWRs^DH<0TgK)<qkn~AK;nLj{g=qh
zGZi<uLl4u2@26P<Lh;JYjtSDBJLKb!d0Myal1==;lCg%)F7MTusf|3eE%bd-Z}=1?
zOSR;jV_LHk<8WsF;m$mL#`3UbY|iwnec6_Dad)^ue!8-xra?t5X+ea6+ug01iYvAh
zUdYZ-%{Y1F$-A~Sh3Y+^s6|2Vp&lc5Z{mI2*REJRkCEGvXvco7ES)pN=8~<`Qqp3)
z43#YX*oNwhw9LGm{H$(zD}ahG`OkXCG)-KSmGmBPcK+C(ssI|a-ikF3NC^#|R#w}5
zx3xx7#3&FO=cXn7Jr-FW5uez;_gYuvK+zMQMcc+(T1F3YYo(?5_yP^>!U#cqZdJ!U
zVZ8eK(z5u-;T|o?2Lt9IhhDPXHnuiPyz)VSZ6LW{-zP-jLdm*5bLp}X@@&=2t1(0A
zZ3Th##;5O>`zV+Wx-oU3;`#e{*B$zw>bNuwkJS)b>B40N9(y&gJmbq`G{1?~u<eM|
z0MC=?xHrsfYF^uS$3i0{U0YNpSu+yH<-R{~Jj>uMO|qp~$7WFnY(z?e!d-k5-VO4d
z2AU){PG==FcNrt&UUSN>Y;XzR6jQd|XK*m1x>xecqxf2DP87Fft-<yS9QLXxVIbSY
zFVz{67?r9T6Bejyb6>>JUHjN1hXrZe{OxTPv4TI|TJ>eMn|T&V&ZjG#yt8iQ@--`5
zQ;(d@F*#sUQD<n`LAyw*Xb)8FdeWP15@J*~7v0{PA6D$%<d7qBUp+j{iy!?VV|$~%
zx;VJIKrc@BC$8vi@(&Nn9qPpJ#Y+1VtKwQ4s)QWl3yo~$3X_>Uj#jjPurK#h>BVg=
z7N*fd{gJ%Ih0IrBdmdjuO$!M1kBW{A@C}(bg>BwS10mUMr1JR$LeLq8vNJRl1wa-i
zh{0`12%`fN4?UDl`+NXlf{@)`lQ9H<WL4xaP<F#nFq}ofAOMxaWM85G@&rE<YC5od
zKbblN7i=Vqz9SDpU|g=8Y%Ca*$3P5#<uMo`<ax4IsC+#D#J<!c|MvxujX@zoAhjR_
zL-M`=5Q0-YfKXW8D*y%%;#<+=gugI|%P$FlVVH_xAO%AJm?8#&;Ym#<dLRf+<)IKs
z1qq1a0t;eMHps$2DvzL~4M3sE6Hkai5%7yM<kwir%!eqdF#x65ih+cDQ~?Zzsl4xG
z0YkybgG~635eOwTIEYcK#t~{ha1@dcA%GLqy%PY7;uZlBAo*<=CafV)f=a}Z$*L!K
z1m&j!0g&PX2(T#jg8+{GCwk@ATLA<i`6e<5m3ILIzwYkq>Ocg-F@f!3TU8jRF_|__
HD`@`!RTwZ>

diff --git a/benchmarking/benchmark_incr_dec.sh b/benchmarking/benchmark_incr_dec.sh
index 3ddcb2271..3a75fa61d 100755
--- a/benchmarking/benchmark_incr_dec.sh
+++ b/benchmarking/benchmark_incr_dec.sh
@@ -42,7 +42,7 @@ request_per_second_values=(
 )
 
 dataset_name="sharegpt"
-dataset_fp="../wildchat/${dataset_name}.json"
+dataset_fp="../benchmarking/${dataset_name}.json"
 partition_name="all"
 
 export LEGION_BACKTRACE=1
@@ -68,7 +68,7 @@ for j in "${!batch_sizes[@]}"; do
     metrics_fp="/usr/FlexFlow/inference/output/incr_dec_llm_${model_name_}_bz_${batch_size}_rate_${rate}_dataset_${dataset_name}.csv"
     rm $metrics_fp $output_fp $log_fp || true
 
-    time ./inference/suffix_decoding/incr_dec \
+    time ./inference/simplified_infer/incr_dec \
         -ll:gpu $NGPUS -ll:cpu $NCPUS -ll:util $NCPUS \
         -tensor-parallelism-degree $NGPUS \
         -ll:fsize $FSIZE -ll:zsize $ZSIZE -ll:csize $CSIZE \
diff --git a/benchmarking/benchmark_specinfer.sh b/benchmarking/benchmark_specinfer.sh
index 5fe881f08..e0c8e39d7 100755
--- a/benchmarking/benchmark_specinfer.sh
+++ b/benchmarking/benchmark_specinfer.sh
@@ -52,7 +52,7 @@ request_per_second_values=(
 )
 
 dataset_name="sharegpt"
-dataset_fp="../wildchat/${dataset_name}.json"
+dataset_fp="../benchmarking/${dataset_name}.json"
 partition_name="all"
 
 export LEGION_BACKTRACE=1
diff --git a/benchmarking/plot_results.ipynb b/benchmarking/plot_results.ipynb
index 39047b86c..c7dcff18c 100644
--- a/benchmarking/plot_results.ipynb
+++ b/benchmarking/plot_results.ipynb
@@ -164,7 +164,7 @@
     "plt.grid(True)  # Turn the grid on\n",
     "\n",
     "# Save the plot as a PDF\n",
-    "plt.savefig('/usr/FlexFlow/wildchat/average_accepted_tokens.pdf')\n",
+    "plt.savefig('/usr/FlexFlow/benchmarking/average_accepted_tokens.pdf', bbox_inches='tight')\n",
     "\n",
     "plt.show()\n"
    ]
@@ -244,179 +244,179 @@
     "plt.tight_layout(rect=[0, 0, 1, 0.96])\n",
     "\n",
     "# Save the plot as a PDF\n",
-    "plt.savefig('/usr/FlexFlow/wildchat/throughput_vs_tpot.pdf')\n",
+    "plt.savefig('/usr/FlexFlow/benchmarking/throughput_vs_tpot.pdf')\n",
     "\n",
     "plt.show()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n",
-      "/tmp/ipykernel_3339078/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+      "/tmp/ipykernel_3415116/2453520981.py:48: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "  ttft = group.apply(lambda x: x[x[\"request_step_idx\"] == 0][\"timestamp\"].values[0] - x[x[\"request_step_idx\"] == -1][\"timestamp\"].values[0])\n",
-      "/tmp/ipykernel_3339078/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "/tmp/ipykernel_3415116/2453520981.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return ttft.mean()[1] / 1000\n"
      ]
     },
@@ -434,7 +434,7 @@
     },
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAABVwAAALvCAYAAACZeQ7oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADqJ0lEQVR4nOzdeVyN6f8/8NeptC9atJFKoiSh7EP2KCR7tsq+CxmMqGQdu5mxLxlK9u0jS4OIjGXsNJgohpQ1khZ1//7o1/11nJZTIqbX8/s4j++c676W93Wfuz68Xee6JIIgCCAiIiIiIiIiIiKiz6ZQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUsKEKxEREREREREREVEpYcKViIiIiIiIiIiIqJQw4UpERCJvb29YWFh80TEkEgkCAwO/6BhlYeHChahWrRoUFRVRt27dsg5HRsuWLdGyZUupsqSkJPTo0QP6+vqQSCRYtmwZAODevXto3749dHR0IJFIsG/fvq8e739NfvdfXhYWFvD29i7VeL60z5lvWbOwsECnTp2++DiBgYGQSCRffBwiIiIi+vqYcCWib45EIinVV1RUFOLj4wu83rhxY/EvvkW9vpUEwo8//giJRILevXuXdShfxKefl4KCAvT09NCxY0ecO3euxP2uXLkSISEhpRfo/3fs2DH8+OOPaNasGTZt2oS5c+eW+hgf8/b2lro/mpqaqFatGnr06IHdu3cjJydHrn4mTJiAo0ePYtq0adiyZQs6dOgAAPDy8sKNGzcwZ84cbNmyBU5OTl9yOp9l7ty5cieEP36uZs+enW+dfv36ifeUvjwLCwupZ1lVVRXW1taYPHkyXr58WaI+Y2JiEBgYiNevX5dusHJKTU1FQEAAateuDQ0NDejr66Nu3boYP348njx5UiYxEREREdHXpVTWARARfWrLli1S73///XdERkbKlGdnZ0NRUbHIera2tnj//j0AwNPTE66urlLXK1WqBBMTE1SvXl0sS01NxciRI+Hh4YFu3bqJ5UZGRp83uVIgCAK2bdsGCwsLHDx4EG/fvoWWllap9L1u3Tq5k3VfQ97nlZ2djbt372LlypVo1aoVLl68CHt7+2L3t3LlShgYGJT6asETJ05AQUEBGzZsgLKycqn2XRAVFRWsX78eAPD+/XskJCTg4MGD6NGjB1q2bIn9+/dDW1tbrH/s2LF843Z3d4efn59Y9v79e5w7dw7Tp0/HmDFjvvxEPtPcuXPRo0cPdO3aVe42qqqq2LZtG/z9/aXK3717h/3790NVVbWUo6TC1K1bF5MmTQIApKen46+//sKyZctw6tQpXLhwodj9xcTEICgoCN7e3qhYsWIpR1u4rKwstGjRAn///Te8vLwwduxYpKam4tatWwgLC4OHhwdMTU0BAP7+/pg6depXjY+IiIiIvg4mXInom9O/f3+p93/++SciIyNlyj9VWL34+HgAQP369Qvsp06dOuJ/P3/+HCNHjkSdOnWKHPdri4qKwr///osTJ07AxcUFe/bsgZeXV5Ht0tPToaysDAUF2S83vHv3DhoaGqhQocKXCLnEPv28mjdvjo4dO2LVqlVYuXJlGUYmLTk5GWpqaqWWbBUEAenp6VBTUyuwjpKSksyzOXv2bMyfPx/Tpk3D0KFDsX37dvFafrElJyfLJKSePXsGAKWaqCrs2SsLrq6u2LNnD65duwYHBwexfP/+/cjMzESHDh1w4sSJMoywfKlcubLUszxkyBBoampi0aJFuHfvHqytrcswuuLZt28frly5gtDQUPTt21fqWnp6OjIzM8X3SkpKUFLiH8WJiIiI/ou+jb/5EBF953bt2gWJRIJTp07JXFuzZg0kEglu3rwJAHj69Cl8fHxQpUoVqKiowMTEBO7u7mJSuCihoaGoVasWWrVqhbZt2yI0NFSmTlRUFCQSCcLDw+Hv74/KlStDXV0db968gbe3NzQ1NREXFwdXV1doaWmhX79+AKT3cM3KyoKenh58fHxk+n/z5g1UVVXFlZGZmZmYOXMmHB0doaOjAw0NDTRv3hwnT56Ua07yat68OQAgLi5OqnzTpk1o3bo1DA0NoaKiglq1amHVqlVSdSwsLHDr1i2cOnUq3y0iXr9+DV9fX5iZmUFFRQXVq1fHggULilzxK5FIsGnTJrx7907sN2/bgg8fPiA4OBhWVlZQUVGBhYUFfvrpJ2RkZMjE1qlTJxw9ehROTk5QU1PDmjVrSnSPpk6divbt22Pnzp24e/euWP7xnpohISGQSCQQBAG//fabGHdgYCDMzc0BAJMnT4ZEIpHa0/fx48cYNGgQjIyMoKKiAjs7O2zcuFFq/MKePQA4f/48OnToAB0dHairq8PZ2Rlnz56V6iNvi49//vlHXKWoo6MDHx8fpKWlSd37d+/eYfPmzeIc5Fm93KRJE1haWiIsLEyqPDQ0FB06dICenl6+7VauXAk7OzuoqKjA1NQUo0ePzvdr62vXroWVlRXU1NTQsGFDREdH59tfRkYGAgICUL16daioqMDMzAw//vijzPPxqaysLAQFBcHa2hqqqqrQ19fHDz/8gMjIyELbvXz5En5+frC3t4empia0tbXRsWNHXLt2Tape3me4Y8cOzJkzB1WqVIGqqiratGmDf/75p8TzLQ5jY2MAkEpIXr9+Hd7e3qhWrRpUVVVhbGyMQYMG4cWLF2KdwMBATJ48GQBgaWkpPhcf/37dunUrGjZsCHV1dejq6qJFixb5rgA/c+YMGjZsCFVVVVSrVg2///57kXHn/W5q1qyZzDVVVVWpVeef7uH66VYhH78+3vda3ucmMjISP/zwAypWrAhNTU3UrFkTP/30U5FzICIiIqLPx39WJ6JyJS0tDc+fP5cq09HR+eyVnW5ubtDU1MSOHTvg7OwsdW379u2ws7ND7dq1AQDdu3fHrVu3MHbsWFhYWCA5ORmRkZF4+PBhkQdWZWRkYPfu3eLXbz09PeHj44OnT5+KCYqPBQcHQ1lZGX5+fsjIyBBXOX748AEuLi744YcfsGjRIqirq8u0rVChAjw8PLBnzx6sWbNGaoXkvn37kJGRgT59+gDITcCuX78enp6eGDp0KN6+fYsNGzbAxcUFFy5cKLVDpPKSJrq6ulLlq1atgp2dHbp06QIlJSUcPHgQo0aNQk5ODkaPHg0AWLZsGcaOHQtNTU1Mnz4dwP9tEZGWlgZnZ2c8fvwYw4cPR9WqVRETE4Np06YhMTFRPEwqP1u2bMHatWtx4cIF8Sv+TZs2BZC7Um/z5s3o0aMHJk2ahPPnz2PevHmIjY3F3r17pfq5c+cOPD09MXz4cAwdOhQ1a9Ys8X0aMGAAjh07hsjISNSoUUPmeosWLbBlyxYMGDAA7dq1w8CBAwHkrvKuWLEiJkyYIG7nkLeXaVJSEho3bgyJRIIxY8agUqVKOHz4MAYPHow3b97A19dXaoz8nr0TJ06gY8eOcHR0REBAABQUFMRkeXR0NBo2bCjVR69evWBpaYl58+bh8uXLWL9+PQwNDbFgwQLx3g8ZMgQNGzbEsGHDAABWVlZy3SNPT09s3boV8+fPh0QiwfPnz3Hs2DFs2bIFR44ckakfGBiIoKAgtG3bFiNHjsSdO3ewatUqXLx4EWfPnhV/h2zYsAHDhw9H06ZN4evri/v376NLly7Q09ODmZmZ2F9OTg66dOmCM2fOYNiwYbC1tcWNGzewdOlS3L17t9B9aQMDAzFv3jxx7m/evMGlS5dw+fJltGvXrsB29+/fx759+9CzZ09YWloiKSkJa9asgbOzM27fvi1+1T3P/PnzoaCgAD8/P6SkpODnn39Gv379cP78ebGOvPMtTFZWlvh7OT09HVeuXMGSJUvQokULWFpaivUiIyNx//59+Pj4wNjYGLdu3cLatWtx69Yt/Pnnn5BIJOjWrRvu3r2Lbdu2YenSpTAwMACQu3UMAAQFBSEwMBBNmzbFrFmzoKysjPPnz+PEiRNo3769ONY///yDHj16YPDgwfDy8sLGjRvh7e0NR0dH2NnZFTiXvH+w+P333+Hv71+sQ7GGDx+Otm3bSpUdOXIEoaGhMDQ0BCD/c3Pr1i106tQJderUwaxZs6CiooJ//vlH5h83iIiIiOgLEYiIvnGjR48W5Pl1VVi9Bw8eCADyfZ08eVKm/rNnzwQAQkBAgNxxenp6CoaGhsKHDx/EssTEREFBQUGYNWuWIAiC8OrVKwGAsHDhQrn7/diuXbsEAMK9e/cEQRCEN2/eCKqqqsLSpUul6p08eVIAIFSrVk1IS0uTuubl5SUAEKZOnSrTv5eXl2Bubi6+P3r0qABAOHjwoFQ9V1dXoVq1auL7Dx8+CBkZGVJ1Xr16JRgZGQmDBg2SKpfnvuZ9XkFBQcKzZ8+Ep0+fCtHR0UKDBg0EAMLOnTul6n86R0EQBBcXF6kYBUEQ7OzsBGdnZ5m6wcHBgoaGhnD37l2p8qlTpwqKiorCw4cPC43Xy8tL0NDQkCq7evWqAEAYMmSIVLmfn58AQDhx4oRYZm5uLgAQjhw5Uug4hY33sStXrggAhAkTJohlzs7OMnMHIIwePVqqLO/ef/qMDh48WDAxMRGeP38uVd6nTx9BR0dH/AwKevZycnIEa2trwcXFRcjJyRHL09LSBEtLS6Fdu3ZiWUBAgABA5tnx8PAQ9PX1pco0NDQELy+vAu9FQXO7efOmAECIjo4WBEEQfvvtN0FTU1N49+6dzP1NTk4WlJWVhfbt2wvZ2dli+a+//ioAEDZu3CgIgiBkZmYKhoaGQt26daV+HtauXSsAkLr/W7ZsERQUFMTx86xevVoAIJw9e1YsMzc3l5qjg4OD4ObmJtecP5aeni4Vf949UVFREX9HCcL/fYa2trZS81i+fLkAQLhx40ax51uQvGf/01ezZs1knrX8fs63bdsmABBOnz4tli1cuFAAIDx48ECq7r179wQFBQXBw8ND5j58/EzmxfRxn8nJyYKKioowadKkQueTlpYm1KxZUwAgmJubC97e3sKGDRuEpKQkmbp5z3lB7t27J+jo6Ajt2rUT/3dF3udm6dKlAgDh2bNnhcZLRERERF8GtxQgonJl2LBhiIyMlHp9vIfj5+jduzeSk5MRFRUllu3atQs5OTno3bs3AIj7fEZFReHVq1fFHiM0NBROTk7iAV9aWlpwc3PLd1sBIPe0+YL2AR05cmSR47Vu3RoGBgZSe4G+evUKkZGR4pwAQFFRUVwBm5OTg5cvX+LDhw9wcnLC5cuX5Z7fpwICAlCpUiUYGxujefPmiI2NxeLFi9GjRw+peh/PMSUlBc+fP4ezszPu37+PlJSUIsfZuXMnmjdvDl1dXTx//lx8tW3bFtnZ2Th9+nSxY4+IiAAATJw4Uao8b3XyoUOHpMotLS3h4uJS7HHyk7cq9e3bt6XSnyAI2L17Nzp37gxBEKTukYuLC1JSUmQ+50+fvatXr+LevXvo27cvXrx4IbZ/9+4d2rRpg9OnT8ts3zBixAip982bN8eLFy/E7Qk+h52dHerUqYNt27YBAMLCwuDu7p7vau8//vgDmZmZ8PX1ldqHdujQodDW1hY/y0uXLiE5ORkjRoyQWhHu7e0NHR0dqT537twJW1tb2NjYSN3P1q1bA0Ch23FUrFgRt27dwr1794o1ZxUVFTH+7OxsvHjxQvyqeX4/pz4+PlLzyNvS4/79+8Web2EaNWok/j7+3//+hzlz5uDWrVvo0qWLeOAhIP1znp6ejufPn6Nx48YAINfvmX379iEnJwczZ86U2U/405WotWrVEucL5K6QrVmzpjj3gqipqeH8+fPitgYhISEYPHgwTExMMHbs2CK3i8jz7t07eHh4QFdXF9u2bRMPiJT3ucnbg3n//v3f1EGIREREROUFtxQgonLF2tpa5iubpSVvX8rt27ejTZs2AHK3E6hbt674tW4VFRUsWLAAkyZNgpGRERo3boxOnTph4MCB+W4J8LHXr18jIiICY8aMkdpHsVmzZti9ezfu3r0r8/Xxj7+O+zElJSVUqVKlyDkpKSmhe/fuCAsLQ0ZGBlRUVLBnzx5kZWVJJVwBYPPmzVi8eDH+/vtvZGVlFRmDPIYNG4aePXsiPT0dJ06cwIoVK5CdnS1T7+zZswgICMC5c+ek9vgEchOwRSV/7t27h+vXr4tfO/5UcnJysWNPSEiAgoKCmBzPY2xsjIoVKyIhIUGq/HPu06dSU1MB5CbkS8OzZ8/w+vVrrF27FmvXrs23zqf36NP55CUHCzvgLSUlRWq7iKpVq0pdz7v26tUrqb0wS6pv375YvHgxJkyYgJiYmAL3t8z7rD7d5kFZWRnVqlUTr+f9/08PeapQoQKqVasmVXbv3j3ExsaW6JmbNWsW3N3dUaNGDdSuXRsdOnTAgAEDpA7+y09OTg6WL1+OlStX4sGDB1I/S/r6+jL1C7v/QPHmWxgDAwOp38tubm6oWbMmevTogfXr12Ps2LEAcvegDQoKQnh4uMz9kecfVuLi4qCgoIBatWoVWffTuQO585fnH8p0dHTw888/4+eff0ZCQgKOHz+ORYsW4ddff4WOjg5mz55dZB9Dhw5FXFwcYmJipD4beZ+b3r17Y/369RgyZAimTp2KNm3aoFu3bujRo8c3c3gdERER0X8ZE65ERKVERUUFXbt2xd69e7Fy5UokJSXh7NmzmDt3rlQ9X19fdO7cGfv27cPRo0cxY8YMzJs3DydOnEC9evUK7H/nzp3IyMjA4sWLsXjxYpnroaGhCAoKkioraHXrxyvditKnTx+sWbMGhw8fRteuXbFjxw7Y2NhIrQzeunUrvL290bVrV0yePBmGhoZQVFTEvHnzZA64Ko6PE+SdOnWCoqIipk6dilatWsHJyQlAbhKlTZs2sLGxwZIlS2BmZgZlZWVERERg6dKlcq3uysnJQbt27fDjjz/mez2/fVDlJe8ejgV9ViWRd0Dbp8neksq7h/379y8wYfppsu/T+eT1sXDhwgL39M1bmZsnb1XfpwRBKDJmeXh6emLatGkYOnQo9PX1pfbw/NJycnJgb2+PJUuW5Hu9sP1PW7Rogbi4OOzfvx/Hjh3D+vXrsXTpUqxevRpDhgwpsN3cuXMxY8YMDBo0CMHBwdDT04OCggJ8fX3z/Tn50ve/MHn/aHX69Gkx4dqrVy/ExMRg8uTJqFu3LjQ1NZGTk4MOHTqU+irO0pq7ubk5Bg0aBA8PD1SrVg2hoaFFJlyXL1+Obdu2YevWrTI/K/I+N2pqajh9+jROnjyJQ4cO4ciRI9i+fTtat26NY8eOFTg/IiIiIiodTLgSEZWi3r17Y/PmzTh+/DhiY2MhCILMSlAg92CfSZMmYdKkSbh37x7q1q2LxYsXY+vWrQX2HRoaitq1ayMgIEDm2po1axAWFiaTcC0NLVq0gImJCbZv344ffvgBJ06cEA+dyrNr1y5Uq1YNe/bskUow5hfr55g+fTrWrVsHf39/8WCjgwcPIiMjAwcOHJBalZbfV7ILSn5aWVkhNTW1VFc/m5ubIycnB/fu3YOtra1YnpSUhNevX4uH63wJW7ZsgUQiKfQApeKoVKkStLS0kJ2dXeJ7lHeYlba2dqne5+IcSvSpqlWrolmzZoiKisLIkSOhpJT/H4vyPqs7d+5IrdzMzMzEgwcPxPnk1bt37574FW8g91CoBw8eSP0jhZWVFa5du4Y2bdqUaA56enrw8fGBj48PUlNT0aJFCwQGBhaacN21axdatWqFDRs2SJW/fv1aPFyqOIoz3+L68OEDgP9brf3q1SscP34cQUFBmDlzplgvv20VCvs5z8nJwe3bt0vtID956erqwsrKSvzHkIJER0fDz88Pvr6+6Nevn8z14jw3CgoKaNOmDdq0aYMlS5Zg7ty5mD59Ok6ePPnFvulBRERERLn4nSIiolLUtm1b6OnpYfv27di+fTsaNmwo9dXqtLQ0pKenS7WxsrKClpZWoXv7PXr0CKdPn0avXr3Qo0cPmZePjw/++ecfqdPDS4uCggJ69OiBgwcPYsuWLfjw4YNMEjlvtdTHq7/Onz+Pc+fOlWosFStWxPDhw3H06FFcvXq1wLFTUlKwadMmmfYaGhp4/fq1THmvXr1w7tw5HD16VOba69evxeRPcbi6ugIAli1bJlWetzLNzc2t2H3KY/78+Th27Bh69+4t81XvklJUVET37t2xe/fufBNGz549K7IPR0dHWFlZYdGiRWISrbh95Kegz1Res2fPRkBAgLiKMj9t27aFsrIyVqxYIfWcbdiwASkpKeJn6eTkhEqVKmH16tXIzMwU64WEhMjE2KtXLzx+/Bjr1q2TGe/9+/d49+5dgfG8ePFC6r2mpiaqV69e5P6gioqKMis0d+7cicePHxfariDFmW9xHTx4EADEpG1+P+eA7M8XkPtMAJCJoWvXrlBQUMCsWbNkVsSW1qrda9eu4fnz5zLlCQkJuH37tsy2FB9LTExEr1698MMPP2DhwoX51pH3uXn58qXM9bwks7z7yBIRERFRyXGFKxFRKapQoQK6deuG8PBwvHv3DosWLZK6fvfuXbRp0wa9evVCrVq1oKSkhL179yIpKQl9+vQpsN+wsDAIgoAuXbrke93V1RVKSkoIDQ1Fo0aNSnVOQO7K3V9++QUBAQGwt7eXWrEJ5H7df8+ePfDw8ICbmxsePHiA1atXo1atWvkm1z7H+PHjsWzZMsyfPx/h4eFo3749lJWV0blzZwwfPhypqalYt24dDA0NkZiYKNXW0dERq1atwuzZs1G9enUYGhqidevWmDx5Mg4cOIBOnTrB29sbjo6OePfuHW7cuIFdu3YhPj6+2CsAHRwc4OXlhbVr1+L169dwdnbGhQsXsHnzZnTt2hWtWrX6rPvw4cMHcUV0eno6EhIScODAAVy/fh2tWrUqcK/Vkpo/fz5OnjyJRo0aYejQoahVqxZevnyJy5cv448//sg3wfMxBQUFrF+/Hh07doSdnR18fHxQuXJlPH78GCdPnoS2traYZCsOR0dH/PHHH1iyZAlMTU1haWlZrJ8BZ2dnODs7F1qnUqVKmDZtGoKCgtChQwd06dIFd+7cwcqVK9GgQQP0798fQO7P/+zZszF8+HC0bt0avXv3xoMHD7Bp0yaZPU0HDBiAHTt2YMSIETh58iSaNWuG7Oxs/P3339ixYweOHj0qbpvxqVq1aqFly5ZwdHSEnp4eLl26hF27dmHMmDGFzqNTp06YNWsWfHx80LRpU9y4cQOhoaHF2m/1Y8WZb2EeP34sPsuZmZm4du0a1qxZAwMDAzERrq2tjRYtWuDnn39GVlYWKleujGPHjuHBgwcy/Tk6OgLIXRHfp08fVKhQAZ07d0b16tUxffp0BAcHo3nz5ujWrRtUVFRw8eJFmJqaYt68eSW6Dx+LjIxEQEAAunTpgsaNG0NTUxP379/Hxo0bkZGRgcDAwALbjhs3Ds+ePcOPP/6I8PBwqWt16tRBnTp15H5uZs2ahdOnT8PNzQ3m5uZITk7GypUrUaVKFfzwww+fPU8iIiIiKoJARPSNGz16tCDPr6vC6j148EAAICxcuFCuMZ89eyYAEAICAooTqiAIghAZGSkAECQSifDo0SOpa8+fPxdGjx4t2NjYCBoaGoKOjo7QqFEjYceOHYX2aW9vL1StWrXQOi1bthQMDQ2FrKws4eTJkwIAYefOnTL1vLy8BA0NjXz78PLyEszNzWXKc3JyBDMzMwGAMHv27Hyvz507VzA3NxdUVFSEevXqCf/73//y7U+e+1rU5+Xt7S0oKioK//zzjyAIgnDgwAGhTp06gqqqqmBhYSEsWLBA2LhxowBAePDggdju6dOngpubm6ClpSUAEJydncVrb9++FaZNmyZUr15dUFZWFgwMDISmTZsKixYtEjIzMwuNt6B7mpWVJQQFBQmWlpZChQoVBDMzM2HatGlCenq6VD1zc3PBzc2t0DE+HQ+A+FJXVxcsLCyE7t27C7t27RKys7Nl2jg7O0vNVxByP4vRo0dLlRV275OSkoTRo0cLZmZmQoUKFQRjY2OhTZs2wtq1a8U6hT17giAIV65cEbp16ybo6+sLKioqgrm5udCrVy/h+PHjYp2AgAABgPDs2TOptps2bZL5TP/++2+hRYsWgpqamgBA8PLyKui2yf17oKDP89dffxVsbGyEChUqCEZGRsLIkSOFV69eydRbuXKlYGlpKaioqAhOTk7C6dOn873/mZmZwoIFCwQ7OztBRUVF0NXVFRwdHYWgoCAhJSVFrGdubi41r9mzZwsNGzYUKlasKKipqQk2NjbCnDlzinxO09PThUmTJgkmJiaCmpqa0KxZM+HcuXMysRX0Gebdv02bNpVovvkxNzeXepYVFBQEQ0NDwdPTU/z5zvPvv/8KHh4eQsWKFQUdHR2hZ8+ewpMnT/L9nRIcHCxUrlxZUFBQkHlmNm7cKNSrV0+8587OzkJkZKRUTPn9PMozp/v37wszZ84UGjduLBgaGgpKSkpCpUqVBDc3N+HEiRNSdfOe84/7//hefPz6eH7yPDfHjx8X3N3dBVNTU0FZWVkwNTUVPD09hbt37xYaPxERERGVDokgfIWTD4iIiIiIiIiIiIjKAe7hSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiL6KkJAQSCQSXLp0qcA68fHxkEgkWLRoUaF9WVhYQCKRoG3btvleX7duHSQSSZHjFSYwMBASiQTPnz8vsE5UVBQkEgl27dold7+9evWCRCLBlClTCu1TIpFg69at+dZp1qwZJBIJateune/17OxsmJqaQiKR4PDhw3LHBgATJkxA/fr1oaenB3V1ddja2iIwMBCpqalytV+1ahV69uyJqlWrQiKRwNvbu1jj5322+b2sra1l6m/YsAG2trZQVVWFtbU1fvnlF5k63t7eUv0oKSnBzMwMffr0we3bt+WKS57n93Pcvn0bgYGBiI+P/yL9fy8xEBERERH9FyiVdQBEREQloaqqipMnT+Lp06cwNjaWuhYaGgpVVVWkp6eXUXT5e/PmDQ4ePAgLCwts27YN8+fPh0QiybeuqqoqwsLC0L9/f6ny+Ph4xMTEQFVVtcBxTpw4gcTERFhYWCA0NBQdO3aUO8aLFy+iefPm8PHxgaqqKq5cuYL58+fjjz/+wOnTp6GgUPi/1S5YsABv375Fw4YNkZiYKPe4eZYtWyaT3E1ISIC/vz/at28vVb5mzRqMGDEC3bt3x8SJExEdHY1x48YhLS1NJqGtoqKC9evXAwA+fPiAuLg4rF69GkeOHMHt27dhampa7FhL0+3btxEUFISWLVvCwsKi3MZARERERPRfwIQrERF9l5o1a4aLFy9i+/btGD9+vFj+77//Ijo6Gh4eHti9e3cZRihr9+7dyM7OxsaNG9G6dWucPn0azs7O+dZ1dXXFgQMH8Pz5cxgYGIjlYWFhMDIygrW1NV69epVv261bt6J+/frw8vLCTz/9hHfv3kFDQ0OuGM+cOSNTZmVlBT8/P1y4cAGNGzcutP2pU6fE1a2amppyjfmxrl27ypTNnj0bANCvXz+x7P3795g+fTrc3NzEFcZDhw5FTk4OgoODMWzYMOjq6or1lZSUZJLXjRs3RqdOnXDo0CEMHTq02LGWFUEQkJ6eDjU1tbIOhYiIiIiI8sEtBYiI6LukqqqKbt26ISwsTKp827Zt0NXVhYuLi0ybrKws/P333yVaeVkaQkND0a5dO7Rq1Qq2trYIDQ0tsK67uztUVFSwc+dOqfKwsDD06tULioqK+bZ7//499u7diz59+qBXr154//499u/f/1lx5612fP36dZF1zc3NC1y1W1JhYWGwtLRE06ZNxbKTJ0/ixYsXGDVqlFTd0aNH4927dzh06FCR/eatjFZSKtm/P3t7e0NTUxOPHz9G165doampiUqVKsHPzw/Z2dlSdcPDw+Ho6AgtLS1oa2vD3t4ey5cvB5C7XUHPnj0BAK1atRK3PoiKigKQe/87deqEo0ePwsnJCWpqalizZo24BUdISIhMbBKJBIGBgVJljx8/xuDBg2FqagoVFRVYWlpi5MiRyMzMLDIGIiIiIiKSHxOuRET03erbty8uXLiAuLg4sSwsLAw9evRAhQoVZOo/fvwYtra2mDZt2tcMEwDw5MkTnDx5Ep6engAAT09P7Nq1C5mZmfnWV1dXh7u7O7Zt2yaWXbt2Dbdu3ULfvn0LHOfAgQNITU1Fnz59YGxsjJYtWxaa2M3Phw8f8Pz5czx58gTHjh2Dv78/tLS00LBhw2L1UxquXLmC2NhYmTlfuXIFAODk5CRV7ujoCAUFBfH6x54/f47nz58jKSkJ586dw4QJE6Cvr49OnTqVOL7s7Gy4uLhAX18fixYtgrOzMxYvXoy1a9eKdSIjI+Hp6QldXV0sWLAA8+fPR8uWLXH27FkAQIsWLTBu3DgAwE8//YQtW7Zgy5YtsLW1Ffu4c+cOPD090a5dOyxfvhx169YtVpxPnjxBw4YNER4ejt69e2PFihUYMGAATp06hbS0NLliICIiIiIi+XBLASIi+m61bt0axsbG2LZtG/z9/REbG4urV69i+fLluH//flmHJ2Xbtm1QUVGBu7s7AKBPnz6YOXMmIiIi8v0aPZCbUO7cuTMePXoEMzMzhIaGolq1aoV+rX/r1q1o2rQpzMzMxHFGjRqFZ8+eoVKlSnLFeunSJTRp0kR8X7NmTRw4cAB6enpyzrb05CWLP95OAAASExOhqKgIQ0NDqXJlZWXo6+vjyZMnUuXv3r2TmX/lypVx7Ngxue9LftLT09G7d2/MmDEDADBixAjUr18fGzZswMiRIwEAhw4dgra2No4ePZrvyuRq1aqhefPmWLFiBdq1a4eWLVvK1Pnnn39w5MgRqZXbxTncatq0aXj69CnOnz8vlaSeNWsWBEFAxYoVi4yBiIiIiIjkwxWuRET03VJUVESvXr3EVaChoaEwMzND8+bN861vYWEBQRDy/Qr2lxYaGgo3NzdoaWkBAKytreHo6Fjo6tP27dtDT08P4eHhEAQB4eHh4grZ/Lx48QJHjx6VqtO9e3dIJBLs2LFD7lhr1aqFyMhI7Nu3Dz/++CM0NDRkDrL6GnJychAeHo569erJrLR8//49lJWV822nqqqK9+/fy5RFRkYiMjISR48exZo1a6CpqQlXV1fcvXv3s+IcMWKE1PvmzZtLJfwrVqyId+/eITIyssRjWFpa5rtNhjxycnKwb98+dO7cWWZFMIBS3wKCiIiIiKi84wpXIiL6rvXt2xcrVqzAtWvXEBYWhj59+nxzCaTY2FhcuXIFAwcOxD///COWt2zZEr/99hvevHkDbW1tmXYVKlRAz549ERYWhoYNG+LRo0eFbiewfft2ZGVloV69elLjNGrUCKGhoRg9ejQA4OXLl1JbGaipqUFHR0d8r62tjbZt2wLI3Us2LCwM7u7uuHz5MhwcHEp+I/6/9+/fIyUlRaosbz/Vj506dQqPHz/GhAkTZK6pqakVuB1DfgdKKSoqinPK4+rqCmtra0ybNk080OzZs2dSdfT09ApM7AK5idxPV8jq6upKHWg2atQo7NixAx07dkTlypXRvn179OrVCx06dCiw309ZWlrKXfdTz549w5s3b1C7du0S90FERERERPLjClciIvquNWrUCFZWVvD19cWDBw8KTUiWla1btwIAJkyYAGtra/G1ePFipKenY/fu3QW27du3L65evYrAwEA4ODigVq1aBdbNWy3brFkzqXHOnDmDc+fOiasuu3XrBhMTE/E1fvz4QuPv1q0bgNyDn0rD9u3bpcY3MTEpcD4KCgr5ruo1MTFBdnY2kpOTpcozMzPx4sULmJqaFhlHlSpVULNmTZw+fRoA8OjRI5m4YmJiCu2joMPLPmZoaIirV6/iwIED6NKlC06ePImOHTvCy8uryLZ5Pk0gAwWvTP30wC4iIiIiIvq6uMKViIi+e56enpg9ezZsbW2LfZjQlyYIAsLCwtCqVSuMGjVK5npwcDBCQ0Ph4+OTb/sffvgBVatWRVRUFBYsWFDgOA8ePEBMTAzGjBkDZ2dnqWs5OTkYMGAAwsLC4O/vj8WLF0utwCwqOZmRkYGcnByZVakl5eLiUuTX6zMyMrB79260bNky3/jyPudLly7B1dVVLL906RJycnLkfg4+fPggbpdgbGwsE1dprOgFcveW7dy5Mzp37oycnByMGjUKa9aswYwZM1C9evUSrcrW1dUFALx+/VqqPCEhQep9pUqVoK2tjZs3bxba37e2MpyIiIiI6HvFhCsREX33hgwZAkVFRTRq1KjQellZWYiLi4OOjk6BqypL29mzZxEfH49Zs2ahR48eMtfv3r2LGTNm4MmTJ/kmFiUSCVasWIErV65gwIABBY6Tt7r1xx9/FA/M+tj69esRGhoKf39/ODo65tvH69evoaGhgQoVKsi0BSC1/2daWhoePnwIAwMDGBgYFBhXfgpb1ZonIiICr1+/ljksK0/r1q2hp6eHVatWSSVcV61aBXV1dbi5uRUZx927d3Hnzh3xfqiqqspsO1AaXrx4AX19ffG9goIC6tSpAyA3sQwAGhoaAGSTp4XR1taGgYEBTp8+DV9fX7F85cqVUvUUFBTQtWtXbN26FZcuXZLZx1UQBEgkkhLFQEREREREsphwJSKir2rjxo04cuSITPnHX2s/fvw40tPTZep07do1330ozc3NERgYWOTYjx8/hq2tLby8vOQ+OGvJkiVQV1eXKlNQUMBPP/0kvt+9ezf+/vtvmbZeXl4IDQ2FoqJigQnALl26YPr06QgPD8fEiRPzrePu7g53d/dC4wwNDUXdunXzTbbmjTN27FhcvnwZ9evXz7dOVFQUxo0bhx49esDa2hqZmZmIjo7Gnj174OTkhP79+4t1L1y4gFatWiEgIEDq3h88eBDXrl0DkJvgvn79OmbPni3GkJdoLEpoaChUVFTQvXv3fK+rqakhODgYo0ePRs+ePeHi4oLo6Ghs3boVc+bMgZ6enlT9Dx8+iFs75OTkID4+HqtXr0ZOTg4CAgLkiqmkhgwZgpcvX6J169aoUqUKEhIS8Msvv6Bu3briYWB169aFoqIiFixYgJSUFKioqKB169YwNDQssu/58+djyJAhcHJywunTp/M9BGzu3Lk4duwYnJ2dMWzYMNja2iIxMRE7d+7EmTNnULFixRLHQERERERE0phwJSKir2rVqlX5lnt7e4v/feTIkXyTshYWFl/94J958+bJlCkqKkolXAva29TZ2Rk7d+5E06ZNZRKAeWrXrg1LS0ts3bq1wIRrUS5fvoy///4bM2bMKLBO586dMXbsWGzdurXAhKu9vT1atWqF/fv3IzExEYIgwMrKCjNnzsTkyZMLPTwqz+7du7F582bx/ZUrV3DlyhUAuXumypNwffPmDQ4dOgQ3Nzepw7w+NWrUKFSoUAGLFy/GgQMHYGZmhqVLl+a7J21GRobUCmFtbW00aNAAW7ZsQZs2bYqM6XP0798fa9euxcqVK/H69WsYGxujd+/eCAwMhIJC7nb6xsbGWL16NebNm4fBgwcjOzsbJ0+eLDLZOXPmTDx79gy7du0SD+Y6fPiwTLvKlSvj/PnzmDFjBkJDQ/HmzRtUrlwZHTt2FP9BoaQxEBERERGRNIkgCEJZB0FERERERERERET0X6BQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKiVJZB/AtyMnJwZMnT6ClpQWJRFLW4RAREREREdF/nCAIePv2LUxNTaGgwLVQRET/JUy4Anjy5AnMzMzKOgwiIiIiIiIqZx49eoQqVaqUdRhERFSKmHAFoKWlBSD3f+i0tbXLOJryISsrC8eOHUP79u1RoUKFsg6H6Ivgc07lAZ9zKg/4nFN5wOf863vz5g3MzMzEv48SEdF/BxOugLiNgLa2NhOuX0lWVhbU1dWhra3NP9DRfxafcyoP+JxTecDnnMoDPudlh9vaERH993CjGCIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolHAPVyIiIiIiIiIqVdnZ2cjKyirrMIiISo2ysjIUFORbu8qEKxERERERERGVCkEQ8PTpU7x+/bqsQyEiKlUKCgqwtLSEsrJykXWZcCUiIiIiIiKiUpGXbDU0NIS6ujokEklZh0RE9NlycnLw5MkTJCYmomrVqkX+bmPClYiIiIiIiIg+W3Z2tphs1dfXL+twiIhKVaVKlfDkyRN8+PABFSpUKLQuD80iIiIiIiIios+Wt2erurp6GUdCRFT68rYSyM7OLrIuE65EREREREREVGq4jQAR/RcV53cbE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrEREREREREVE5FBUVBYlEgtevX8vdxsLCAsuWLftiMRWkJLF+Td7e3ujatWup99uyZUv4+vqWer/0ZTHhSkRERERERET0jfH29oZEIsGIESNkro0ePRoSiQTe3t5fP7BvmIWFBSQSCSQSCRQVFWFqaorBgwfj1atXxernayU5s7OzMX/+fNjY2EBNTQ16enpo1KgR1q9fL9bZs2cPgoODv3gsVLqYcCUiIiIiIiIi+gaZmZkhPDwc79+/F8vS09MRFhaGqlWrlmFk365Zs2YhMTERDx8+RGhoKE6fPo1x48aVdVj5CgoKwtKlSxEcHIzbt2/j5MmTGDZsmNQqXj09PWhpaZVdkFQiTLgSEREREREREX2D6tevDzMzM+zZs0cs27NnD6pWrYp69epJ1c3IyMC4ceNgaGgIVVVV/PDDD7h48aJUnYiICNSoUQNqampo1aoV4uPjZcY8c+YMmjdvDjU1NZiZmWHcuHF49+6d3DFfvHgR7dq1g4GBAXR0dODs7IzLly9L1ZFIJFi/fj08PDygrq4Oa2trHDhwoNix5kdLSwvGxsaoXLkyWrVqBS8vL6nxX7x4AU9PT1SuXBnq6uqwt7fHtm3bxOve3t44deoUli9fLq6WzRv71q1b6NSpE7S1taGlpYXmzZsjLi5OavxFixbBxMQE+vr6GD16NLKysgqM9cCBAxg1ahR69uwJS0tLODg4YPDgwfDz8xPrfLzaNm9bhU9fH6903r9/P+rXrw9VVVVUq1YNQUFB+PDhg1z3jkoPE65ERERERERERN+oQYMGYdOmTeL7jRs3wsfHR6bejz/+iN27d2Pz5s24fPkyqlevDhcXF7x8+RIA8OjRI3Tr1g2dO3fG1atXMWTIEEydOlWqj7i4OHTo0AHdu3fH9evXsX37dpw5cwZjxoyRO963b9/Cy8sLZ86cwZ9//glra2u4urri7du3UvWCgoLQq1cvXL9+Ha6urujXr1+xYpXH48ePcfDgQTRq1EgsS09Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuAAAWL58OZo0aYKhQ4ciMTERiYmJMDMzw+PHj9GiRQuoqKjgxIkT+OuvvzBo0CCpZObJkycRFxeHkydPYvPmzQgJCUFISEiB8RkbG+PEiRN49uyZXPNp2rSpGFNiYiJOnDgBVVVVtGjRAgAQHR2NgQMHYvz48bh9+zbWrFmDkJAQzJkzp9j3jj6TQEJKSooAQEhJSSnrUMqNzMxMYd++fUJmZmZZh0L0xfA5p/KAzzmVB3zOqTzgc/71/Rf/Hvr+/Xvh9u3bwvv378s6lP8ELy8vwd3dXUhOThZUVFSE+Ph4IT4+XlBVVRWePXsmuLu7C15eXoIgCEJqaqpQoUIFITQ0VGyfmZkpmJqaCj///LMgCIIwbdo0oVatWlJjTJkyRQAgvHr1ShAEQRg8eLAwbNgwqTrR0dGCgoKC+Lmam5sLS5culXse2dnZgpaWlnDw4EGxDIDg7+8vvk9NTRUACIcPH5Y71vyYm5sLysrKgoaGhqCqqioAEBo1alRoG0EQBDc3N2HSpEnie2dnZ2H8+PFSdaZNmyZYWloW+HvSy8tLMDc3Fz58+CCW9ezZU+jdu3eB4966dUuwtbUVFBQUBHt7e2H48OFCRESEVJ38YhEEQXj+/LlQrVo1YdSoUWJZmzZthLlz50rV27Jli2BiYlJgDCS/4vyO4wpXIiIiIiIiIqJvVKVKleDm5oaQkBBs2rQJbm5uMDAwkKoTFxeHrKwsNGvWTCyrUKECGjZsiNjYWABAbGys1EpPAGjSpInU+2vXriEkJASampriy8XFBTk5OXjw4IFc8SYlJWHo0KGwtraGjo4OtLW1kZqaiocPH0rVq1OnjvjfGhoa0NbWRnJystyxFmTy5Mm4evUqrl+/juPHjwMA3NzckJ2dDSD3oKrg4GDY29tDT08PmpqaOHr0qEx8n7p69SqaN2+OChUqFFjHzs4OioqK4nsTExNxTvmpVasWbt68iT///BODBg1CcnIyOnfujCFDhhQaS1ZWFrp37w5zc3MsX75cLL927RpmzZol9fnlrdRNS0srtE8qXUplHQARERERERERERVs0KBB4tf6f/vtty82TmpqKoYPH57vIVPyHtLl5eWFFy9eYPny5TA3N4eKigqaNGmCzMxMqXqfJi4lEglycnJKHvz/Z2BggOrVqwMArK2tsWzZMjRp0gQnT55E27ZtsXDhQixfvhzLli2Dvb09NDQ04OvrKxPfp9TU1IocuyRzUlBQQIMGDdCgQQP4+vpi69atGDBgAKZPnw5LS8t824wcORKPHj3ChQsXoKT0f6m91NRUBAUFoVu3bjJtVFVVi4yfSg8TrkRERERERERE37AOHTogMzMTEokELi4uMtetrKygrKyMs2fPwtzcHEDuKsiLFy+KBy7Z2trKHEz1559/Sr2vX78+bt++LSYsS+Ls2bNYuXIlXF1dAeTux/r8+fNi9SFPrPLKW3H6/v17MT53d3f0798fAJCTk4O7d++iVq1aYhtlZWVxRWyeOnXqYPPmzcjKyip0levnyoujoIPKlixZgh07diAmJgb6+vpS1+rXr487d+581udHpYNbChARERERERERfcMUFRURGxuL27dvS31lPY+GhgZGjhyJyZMn48iRI7h9+zaGDh2KtLQ0DB48GAAwYsQI3Lt3D5MnT8adO3cQFhYmc6DTlClTEBMTgzFjxuDq1au4d+8e9u/fX6xDs6ytrbFlyxbExsbi/Pnz6Nevn1yrQz8mT6wFefv2LZ4+fYrExERcuHABkydPRqVKldC0aVMxvsjISMTExCA2NhbDhw9HUlKSVB8WFhY4f/484uPj8fz5c+Tk5GDMmDF48+YN+vTpg0uXLuHevXvYsmUL7ty5U6y5faxHjx5YunQpzp8/j4SEBERFRWH06NGoUaMGbGxsZOr/8ccf+PHHH7Fw4UIYGBjg6dOnePr0KVJSUgAAM2fOxO+//46goCDcunULsbGxCA8Ph7+/f4ljpJJhwpWIiIiIiIiI6Bunra0NbW3tAq/Pnz8f3bt3x4ABA1C/fn38888/OHr0KHR1dQHkbgmwe/du7Nu3Dw4ODli9ejXmzp0r1UedOnVw6tQp3L17F82bN0e9evUwc+ZMmJqayh3nhg0b8OrVK9SvXx8DBgzAuHHjYGhoWKy5yhNrQWbOnAkTExOYmpqiU6dO0NDQwLFjx8TVoP7+/qhfvz5cXFzQsmVLGBsbo2vXrlJ9+Pn5QVFREbVq1UKlSpXw8OFD6Ovr48SJE0hNTYWzszMcHR2xbt26z1rt6uLigoMHD6Jz586oUaMGvLy8YGNjg2PHjkltFZDnzJkzyM7OxogRI2BiYiK+xo8fL/b3v//9D8eOHUODBg3QuHFjLF26VFz1TF+PRBAEoayDKGtv3ryBjo4OUlJSCv3lRaUnKysLERERcHV1/aJL8YnKEp9zKg/4nFN5wOecygM+51/ff/Hvoenp6Xjw4AEsLS25XyQR/ecU53ccV7gSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUiJ75BnRt+jeseLVt27/ZeIgIiIiIiIiIiIqBFe4EhEREREREREREZUSJlyJiIiIiIiIiIiISgkTrkRERERERERERESlhAlXIiIiIiIiIiIiolLCQ7OIiIiIiIiI6Ita/mr5VxtrvO74ErV78eIFbG1tceHCBVhYWJRuUN+hFi1aYMSIEejbty8AQCKRYO/evejatWuZxWRhYQFfX1/4+vp+sZgaN26MyZMno3v37qXWJwBkZmaiRo0a2LVrF5ycnEq1b/r2cIUrEREREREREZV7c+bMgbu7u5hsjY+Ph0QiwdWrV796LN7e3pBIJJBIJKhQoQKMjIzQrl07bNy4ETk5OV98/AMHDiApKQl9+vT54mN9jsTERHTs2LFU+/T398fUqVM/6z7Pnz8fEolETAwDgLKyMvz8/DBlypRSiJK+dUy4EhEREREREVG5lpaWhg0bNmDw4MFfddzMzMwCr3Xo0AGJiYmIj4/H4cOH0apVK4wfPx6dOnXChw8fvmhcK1asgI+PDxQUvu20kbGxMVRUVEq1z44dO+Lt27c4fPhwidpfvHgRa9asQZ06dWSu9evXD2fOnMGtW7c+N0z6xn3bPzlERERERERERF9YREQEVFRU0Lhx4wLrREVFQSKR4Pjx43BycoK6ujqaNm2KO3fuSNU7ePAgGjRoAFVVVRgYGMDDw0O8ZmFhgeDgYAwcOBDa2toYNmxYgeOpqKjA2NgYlStXRv369fHTTz9h//79OHz4MEJCQsR6r1+/xpAhQ1CpUiVoa2ujdevWuHbtmtwxferZs2c4ceIEOnfuLHMtb0WpmpoaqlWrhl27dkldnzJlCmrUqAF1dXVUq1YNM2bMQFZWlnj92rVraNWqFbS0tKCtrQ1HR0dcunRJvH7mzBk0b94campqMDMzw7hx4/Du3bsCY5VIJNi3bx+A/1uRvGfPHrRq1Qrq6upwcHDAuXPnpNoUNYaioiJcXV0RHh5e4LgFSU1NRb9+/bBu3Tro6urKXNfV1UWzZs1K1Dd9X5hwJSIiIiIiIqJyLTo6Go6OjnLVnT59OhYvXoxLly5BSUkJgwYNEq8dOnQIHh4ecHV1xZUrV3D8+HE0bNhQqv2iRYvg4OCAK1euYMaMGcWKs3Xr1nBwcMCePXvEsp49eyI5ORmHDx/GX3/9hfr166NNmzZ4+fKl3DF97MyZM1BXV4etra3MtRkzZqB79+64du0a+vXrhz59+iA2Nla8rqWlhZCQENy+fRvLly/HunXrsHTpUvF6v379UKVKFVy8eBF//fUXpk6digoVKgAA4uLi0KFDB3Tv3h3Xr1/H9u3bcebMGYwZM6ZY92j69Onw8/PD1atXUaNGDXh6eoorguUdo2HDhoiOji7WuAAwevRouLm5oW3btgXWKWnf9H3hoVlEREREREREVK4lJCTA1NRUrrpz5syBs7MzAGDq1Klwc3NDeno6VFVVMWfOHPTp0wdBQUFifQcHB6n2rVu3xqRJk0ocq42NDa5fvw4gNzl64cIFJCcni1+tX7RoEfbt24ddu3Zh2LBhcsX0sYSEBBgZGeW7nUDPnj0xZMgQAEBwcDAiIyPxyy+/YOXKlQBy9z/NY2FhAT8/P4SHh+PHH38EADx8+BCTJ0+GjY0NAMDa2lqsP2/ePPTr10/c99Ta2horVqyAs7MzVq1aBVVVVbnuj5+fH9zc3AAAQUFBsLOzwz///AMbGxu5xzA1NcWjR4+Qk5Mj97YK4eHhuHz5Mi5evFhoPVNTUyQkJMjVJ32/uMKViIiIiIiIiMq19+/fy53Q+3hvThMTEwBAcnIyAODq1ato06ZNoe0/94R6QRAgkUgA5H5FPzU1Ffr6+tDU1BRfDx48QFxcnNwxfaywe9GkSROZ9x+vcN2+fTuaNWsGY2NjaGpqwt/fHw8fPhSvT5w4EUOGDEHbtm0xf/58Mca8uYSEhEjNw8XFBTk5OXjw4IHc8Rf2+cg7hpqaGnJycpCRkSHXmI8ePcL48eMRGhpa5HOkpqaGtLQ0uedD3yeucCUiIiIiIiKics3AwACvXr2Sq27eV+ABiInPvBPt1dTUimyvoaFRggj/T2xsLCwtLQHk7hlqYmKCqKgomXoVK1aUO6aPFedefOzcuXPo168fgoKC4OLiAh0dHYSHh2Px4sVincDAQPTt2xeHDh3C4cOHERAQgPDwcHh4eCA1NRXDhw/HuHHjZPquWrWq3HEU9vnIO8bLly+hoaEh973766+/kJycjPr164tl2dnZOH36NH799VdkZGRAUVFR7LtSpUpyz4e+T0y4EhEREREREVG5Vq9ePWzduvWz+6lTpw6OHz8OHx+fUohK1okTJ3Djxg1MmDABAFC/fn08ffoUSkpKsLCwKJWY6tWrh6dPn+LVq1cyBz/9+eefGDhwoNT7evXqAQBiYmJgbm6O6dOni9fz++p8jRo1UKNGDUyYMAGenp7YtGkTPDw8UL9+fdy+fRvVq1eXK86SkHeMmzdvivOSR5s2bXDjxg2pMh8fH9jY2GDKlClisrUkfdP3iVsKEBEREREREVG55uLiglu3bpVoZefHAgICsG3bNgQEBCA2NhY3btzAggULStRXRkYGnj59isePH+Py5cuYO3cu3N3d0alTJzHp2bZtWzRp0gRdu3bFsWPHEB8fj5iYGEyfPh2XLl0qUUz16tWDgYEBzp49K3Nt586d2LhxI+7evYuAgABcuHBBPHDK2toaDx8+RHh4OOLi4rBixQrs3btXbPv+/XuMGTMGUVFRSEhIwNmzZ3Hx4kXxcK4pU6YgJiYGY8aMwdWrV3Hv3j3s37+/2IdmFUbeMaKjo9G+fXu5+9XS0kLt2rWlXhoaGtDX10ft2rU/q2/6PnGFKxERERERERF9UeN1x5d1CIWyt7dH/fr1sWPHDgwfPrzE/bRs2RI7d+5EcHAw5s+fD21tbbRo0aJEfR05cgQmJiZQUlKCrq4uHBwcsGLFCnh5eYkHOUkkEkRERGD69Onw8fHBs2fPYGxsjBYtWsDIyKhEMSkqKsLHxwehoaHo1KmT1LWgoCCEh4dj1KhRMDExwbZt21CrVi0AQJcuXTBhwgSMGTMGGRkZcHNzw4wZMxAYGCj2++LFCwwcOBBJSUkwMDBAt27dxMO86tSpg1OnTmH69Olo3rw5BEGAlZUVevfuXaL7lx95xnj8+DFiYmKkVjzHx8fD0tISJ0+eRMuWLUs8/rlz55CSkoIePXp8zjToOyARBEEo6yDK2ps3b6Cjo4OUlBRoa2uXdTjlQlZWFiIiIuDq6iq1v0qB7h0r3gDW/NciKnvFfs6JvkN8zqk84HNO5QGf86/vv/j30PT0dDx48ACWlpZyH0D1LTl06BAmT56Mmzdvyn0y/X/V06dPYWdnh8uXL8Pc3Lysw/mqpkyZglevXmHt2rVi2cmTJ9GtWzfcv39fZpuF4ujduzccHBzw008/lUao9JUV53ccV7gSERERERERUbnn5uaGe/fu4fHjxzAzMyvrcMqUsbExNmzYgIcPH5a7hKuhoSEmTpwoVRYREYGffvrps5KtmZmZsLe3F/ffpf82JlyJiIiIiIiIiAD4+vqWdQjfjK5du5Z1CGVi0qRJMmULFy787H6VlZXh7+//2f3Q96F8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolCiVdQBERERERERE9N82/8rzrzbW1HoGJWr34sUL2Nra4sKFC7CwsCjdoL5DLVq0wIgRI9C3b18AgEQiwd69e9G1a9cyi8nCwgK+vr7w9fX9YjE1btwYkydPRvfu3Uutz6/RN31buMKViIiIiIiIiMq9OXPmwN3dXUy2xsfHQyKR4OrVq189Fm9vb0gkEkgkElSoUAFGRkZo164dNm7ciJycnC8+/oEDB5CUlIQ+ffp88bE+R2JiIjp27Fiqffr7+2Pq1KnFvs/Z2dmYMWMGLC0toaamBisrKwQHB0MQhM/um74/TLgSERERERERUbmWlpaGDRs2YPDgwV913MzMzAKvdejQAYmJiYiPj8fhw4fRqlUrjB8/Hp06dcKHDx++aFwrVqyAj48PFBS+7bSRsbExVFRUSrXPjh074u3btzh8+HCx2i1YsACrVq3Cr7/+itjYWCxYsAA///wzfvnll8/um74/3/ZPDhERERERERHRFxYREQEVFRU0bty4wDpRUVGQSCQ4fvw4nJycoK6ujqZNm+LOnTtS9Q4ePIgGDRpAVVUVBgYG8PDwEK9ZWFggODgYAwcOhLa2NoYNG1bgeCoqKjA2NkblypVRv359/PTTT9i/fz8OHz6MkJAQsd7r168xZMgQVKpUCdra2mjdujWuXbsmd0yfevbsGU6cOIHOnTvLXMtbUaqmpoZq1aph165dUtenTJmCGjVqQF1dHdWqVcOMGTOQlZUlXr927RpatWoFLS0taGtrw9HREZcuXRKvnzlzBs2bN4eamhrMzMwwbtw4vHv3rsBYJRIJ9u3bB+D/ViTv2bMHrVq1grq6OhwcHHDu3DmpNkWNoaioCFdXV4SHhxc4bn5iYmLg7u4ONzc3WFhYoEePHmjfvj0uXLjw2X3T94cJVyIiIvp67h0r3ouIiIjoK4iOjoajo6NcdadPn47Fixfj0qVLUFJSwqBBg8Rrhw4dgoeHB1xdXXHlyhUcP34cDRs2lGq/aNEiODg44MqVK5gxY0ax4mzdujUcHBywZ88esaxnz55ITk7G4cOH8ddff6F+/fpo06YNXr58KXdMHztz5gzU1dVha2src23GjBno3r07rl27hn79+qFPnz6IjY0Vr2tpaSEkJAS3b9/G8uXLsW7dOixdulS83q9fP1SpUgUXL17EX3/9halTp6JChQoAgLi4OHTo0AHdu3fH9evXsX37dpw5cwZjxowp1j2aPn06/Pz8cPXqVdSoUQOenp7iimB5x2jYsCGio6OLNW7Tpk1x/Phx3L17F0BucvnMmTMyWx6UpG/6/vDQLCIiIiIiIiIq1xISEmBqaipX3Tlz5sDZ2RkAMHXqVLi5uSE9PR2qqqqYM2cO+vTpg6CgILG+g4ODVPvWrVtj0qRJJY7VxsYG169fB5CbHL1w4QKSk5PFr9YvWrQI+/btw65duzBs2DC5YvpYQkICjIyM8t1OoGfPnhgyZAgAIDg4GJGRkfjll1+wcuVKALl7lOaxsLCAn58fwsPD8eOPPwIAHj58iMmTJ8PGxgYAYG1tLdafN28e+vXrJx6IZW1tjRUrVsDZ2RmrVq2CqqqqXPfHz88Pbm5uAICgoCDY2dnhn3/+gY2NjdxjmJqa4tGjR8jJyZF7W4WpU6fizZs3sLGxgaKiIrKzszFnzhz069dPql5J+qbvDz9ZIiIiIiIiIirX3r9/L3dCr06dOuJ/m5iYAACSk5MBAFevXkWbNm0Kbe/k5FTCKHMJggCJRAIgdxVlamoq9PX1oampKb4ePHiAuLg4uWP6WGH3okmTJjLvP17hun37djRr1gzGxsbQ1NSEv78/Hj58KF6fOHEihgwZgrZt22L+/PlijHlzCQkJkZqHi4sLcnJy8ODBA7njL+zzkXcMNTU15OTkICMjQ+5xd+zYgdDQUISFheHy5cvYvHkzFi1ahM2bN0vVK0nf9P3hClciIiIiIiIiKtcMDAzw6tUruermfQUegJj4zDt1Xk1Nrcj2GhoaJYjw/8TGxsLS0hIAkJqaChMTE0RFRcnUq1ixotwxfaw49+Jj586dQ79+/RAUFAQXFxfo6OggPDwcixcvFusEBgaib9++OHToEA4fPoyAgACEh4fDw8MDqampGD58OMaNGyfTd9WqVeWOo7DPR94xXr58CQ0NjWLdu8mTJ2Pq1Kno06cPAMDe3h4JCQmYN28evLy8Pqtv+v4w4UpEREREVJqKu/+wdfsvEwcREcmtXr162Lp162f3U6dOHRw/fhw+Pj6lEJWsEydO4MaNG5gwYQIAoH79+nj69CmUlJRgYWFRKjHVq1cPT58+xatXr6Crqyt17c8//8TAgQOl3terVw9A7qFR5ubmmD59ung9ISFBpv8aNWqgRo0amDBhAjw9PbFp0yZ4eHigfv36uH37NqpXry5XnCUh7xg3b94U5yWvtLQ0mS0CFBUVxWTv5/RN3x9uKUBERERERERE5ZqLiwtu3bpVopWdHwsICMC2bdsQEBCA2NhY3LhxAwsWLChRXxkZGXj69CkeP36My5cvY+7cuXB3d0enTp3EpGfbtm3RpEkTdO3aFceOHUN8fDxiYmIwffp0XLp0qUQx1atXDwYGBjh79qzMtZ07d2Ljxo24e/cuAgICcOHCBfHAKWtrazx8+BDh4eGIi4vDihUrsHfvXrHt+/fvMWbMGERFRSEhIQFnz57FxYsXxcO5pkyZgpiYGIwZMwZXr17FvXv3sH///mIfmlUYeceIjo5G+/bF+wfRzp07Y86cOTh06BDi4+Oxd+9eLFmyBB4eHp/dN31/uMKViIiIiIiIiL6oqfUMyjqEQtnb26N+/frYsWMHhg8fXuJ+WrZsiZ07dyI4OBjz58+HtrY2WrRoUaK+jhw5AhMTEygpKUFXVxcODg5YsWIFvLy8xJWUEokEERERmD59Onx8fPDs2TMYGxujRYsWMDIyKlFMioqK8PHxQWhoKDp16iR1LSgoCOHh4Rg1ahRMTEywbds21KpVCwDQpUsXTJgwAWPGjEFGRgbc3NwwY8YMBAYGiv2+ePECAwcORFJSEgwMDNCtWzfxMK86derg1KlTmD59Opo3bw5BEGBlZYXevXuX6P7lR54xHj9+jJiYGKkVz/Hx8bC0tMTJkyfRsmXLfPv+5ZdfMGPGDIwaNQrJyckwNTXF8OHDMXPmzEL7pv8miSAIQlkHUdbevHkDHR0dpKSkQFtbu6zDKReysrIQEREBV1dXqf1VCsSv5tF3qNjPOdF3iL/PqTzgc07lAf/c8vX9F/8emp6ejgcPHsDS0lLuA6i+JYcOHcLkyZNx8+bNcn96/NOnT2FnZ4fLly/D3Ny8rMP5qqZMmYJXr15h7dq1YtnJkyfRrVs33L9/X2abhc/tm74fxfkdxxWuRERERERERFTuubm54d69e3j8+DHMzMzKOpwyZWxsjA0bNuDhw4flLuFqaGiIiRMnSpVFRETgp59++qxka0F9038TE65ERERERERERAB8fX3LOoRvRteuXcs6hDIxadIkmbKFCxd+sb7pv6l8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSolTWARAR0f9371jx6lu3/zJxEBEREREREVGJcYUrEREREREREZV7L168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJQcWKFctsfCq+1atXo3PnzmUdxneNK1yJiIiIiIiI6ItKCQr6amPpBASUqN2cOXPg7u4OCwsLALmJ1FOnThVYPyoqCs7OziUa62tp2rQpEhMToaOj80XHkUgk2Lt3L7p27fpFx/kSAgMDER4ejkePHkFZWRmOjo6YM2cOGjVqVGCbFy9eoF+/frh+/bqYqHd3d8fcuXOhra0NADhz5gymTJmCv//+G2lpaTA3N8fw4cMxYcIEqb5+++03LFy4EE+fPoWDgwN++eUXNGzYULxuYWGBhIQEbNu2DX369JFqa2dnh9u3b2PTpk3w9vYu9txfvHgBBwcHPH78GK9evRIT44MGDUJwcDCio6PRvHnzYvdLXOFKREREREREROVcWloaNmzYgMGDB4tle/bsQWJiotQrISEBtWvXhpOTU6EJuW+FsrIyjI2NIZFIyjqUb1aNGjXw66+/4saNGzhz5gwsLCzQvn17PHv2rMA2CgoKcHd3x4EDB3D37l2EhITgjz/+wIgRI8Q6GhoaGDNmDE6fPo3Y2Fj4+/vD398fa9euFets374dEydOREBAAC5fvgwHBwe4uLggOTlZajwzMzNs2rRJquzPP//E06dPoaGhUeK5Dx48GHXq1JEpV1ZWRt++fbFixYoS913eMeFKREREREREROVaREQEVFRU0LhxY7FMT08PxsbGUq/g4GA8f/4ce/fuhaqqqlg3JycHP/74o9gmMDBQvBYfHw+JRIKrV6+KZa9fv4ZEIkFUVJRYdurUKTRs2BAqKiowMTHB1KlT8eHDB/F6y5YtMXbsWPj6+kJXVxdGRkZYt24d3r17Bx8fH2hpaaF69eo4fPiw2ObTLQXyvt5/9OhR2NraQlNTEx06dEBiYqLU/di4cSPs7OzEWMaMGfOZd/j/xMXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23bthUZT9++fdG2bVtUq1YNdnZ2WLJkCd68eYPr168X2EZXVxcjR46Ek5MTzM3N0aZNG4waNQrR0dFinXr16sHT0xN2dnawsLBA//794eLiIlVnyZIlGDp0KHx8fFCrVi2sXr0a6urq2Lhxo9R4/fr1w6lTp/Do0SOxbOPGjejXrx+UlEr25fVVq1bh9evX8PPzy/d6586dceDAAbx//75E/Zd3TLgSERERERERUbkWHR0NR0fHQuusXLkSv//+O3bv3o0qVapIXdu8eTM0NDRw/vx5/Pzzz5g1axYiIyPlHv/x48dwdXVFgwYNcO3aNaxatQobNmzA7NmzZcYxMDDAhQsXMHbsWIwcORI9e/ZE06ZNcfnyZbRv3x4DBgxAWlpagWOlpaVh0aJF2LJlC06fPo2HDx9KJd1WrVqF0aNHY9iwYbhx4wYOHDiA6tWryz2XoqSmpsLV1RXHjx/HlStX0KFDB3Tu3BkPHz6Uqrd06VI0a9YMV65cgZubGwYMGICBAweif//+uHz5MqysrDBw4EAIggAASE9Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuCB3bJmZmVi7di10dHTg4OAgd7snT55gz549hW4xceXKFcTExIh1MjMz8ddff6Ft27ZiHQUFBbRt2xbnzp2TamtkZAQXFxds3rwZQO5nuH37dgwaNEjuGD92+/ZtzJo1C7///jsUFPJPDTo5OeHDhw84f/58icYo75hwJSIiIiIiIqJyLSEhAaampgVeP336NHx9ffHbb7+hadOmMtfr1KmDgIAAWFtbY+DAgXBycsLx48flHn/lypUwMzPDr7/+ChsbG3Tt2hVBQUFYvHgxcnJyxHoODg7w9/eHtbU1pk2bBlVVVRgYGGDo0KGwtrbGzJkz8eLFi0JXZ2ZlZWH16tVwcnJC/fr1MWbMGKlYZ8+ejUmTJmH8+PGoUaMGGjRoUKqHgjk4OGD48OGoXbs2rK2tERwcDCsrKxw4cECqnqurK4YPHy7O682bN2jQoAF69uyJGjVqYMqUKYiNjUVSUhIAoHLlyvDz80PdunVRrVo1jB07Fh06dMCOHTuKjOl///sfNDU1oaqqiqVLlyIyMhIGBgZFtvP09IS6ujoqV64MbW1trF+/XqZOlSpVoKKiAicnJ4wePRpDhgwBADx//hzZ2dkwMjKSqm9kZISnT5/K9DNo0CCEhIRAEATs2rULVlZWqFu3bpExfiojIwOenp5YuHAhqlatWmA9dXV16OjoICEhodhjEBOuRERERERERFTOvX//XmqLgI89fPgQPXr0wLBhw8Rk2ac+3QfTxMREZh/OwsTGxqJJkyZSe602a9YMqamp+Pfff/MdR1FREfr6+rC3txfL8pJ3hY2trq4OKyurfGNNTk7GkydP0KZNm3zbjhgxApqamuKrJFJTU+Hn5wdbW1tUrFgRmpqaiI2NlVnh+vFc8+ZV2Fyzs7MRHBwMe3t76OnpQVNTE0ePHhX7DQ0NlYr946/2t2rVClevXkVMTAw6dOiAXr16if127NhRbGNnZycV49KlS3H58mXs378fcXFxmDhxosx8o6OjcenSJaxevRrLli2Ta5uD/Li5uSE1NRWnT5/Gxo0bS7y6ddq0abC1tUX//v2LrKumplboamkqWMk2eiAiIiL6SHxYPBShWGS9arILQoiIiIjKnIGBAV69eiVT/v79e3h4eMDOzg7Lli0rsH2FChWk3kskEnFlat5XtvO++g7krjItifzG+bgsL2H78apYefrIi01NTa3Q8WfNmlXgnp/y8vPzQ2RkJBYtWoTq1atDTU0NPXr0QGZmZoFx5s2rsLkuXLgQy5cvx7Jly2Bvbw8NDQ34+vqK/Xbp0kXqoLPKlSuL/62hoYHq1aujevXqaNy4MaytrbFhwwZMmzYN69evF/cx/fTe5e3ta2NjAz09PTRv3hwzZsyAiYmJWMfS0hJAbrI4KSkJgYGB8PT0hIGBARQVFcUVunmSkpJgbGwsc9+UlJQwYMAABAQE4Pz589i7d2+h97kgJ06cwI0bN7Br1y4A//dcGhgYYPr06QgKChLrvnz5EpUqVSrROOUdE65EREREREREVK7Vq1cPW7dulSkfMmQIXr58iaNHj5b4cKK8hFViYiLq1asHAFIHaAGAra0tdu/eDUEQxETi2bNnoaWlJbNf7JekpaUFCwsLHD9+HK1atZK5bmhoCENDw88a4+zZs/D29oaHhweA3BWv8fHxn9VnXr/u7u7iys2cnBzcvXsXtWrVApA7Ny0tLbn6ysnJQUZGBgDpxGxRbQCI7YrqV1lZGY6Ojjh+/Di6du0qXj9+/HiBh5QNGjQIixYtQu/evaGrqytXXJ/avXu31EFYFy9exKBBgxAdHS218jkuLg7p6eniM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRMLPNToSwkMDMSIESNgaGiIjh074u3btzh79izGjh1baLsHDx7IJJKtra1l6llbW2PPnj3o3LkzJBIJZsyYUeiKXHlZW1tj165diImJga6uLpYsWYKkpCQx4Zqfd+/eYc6cOejSpQtMTEzw/Plz/Pbbb3j8+DF69uxZYLuIiAgkJSWhQYMG0NTUxK1btzB58mQ0a9YMFhYWAIDffvsNVatWhY2NDYDcfYAXLVqEcePGif1MnDgRXl5ecHJyQsOGDbFs2TK8e/cOPj4++Y5ra2uL58+fQ11dvQR3KNfHSVUgdy/ZvL4rVqwolkdHR6NatWoy9Uk+TLgSERERERERUblmb2+P+vXrY8eOHRg+fDiA3IOssrKy0KFDh3zbbNq0Cd7e3nL1v3HjRgwePBiOjo6oWbMmfv75Z7Rv3168XrlyZURERGDy5MlwcHCAnp4eBg8eLJOY/Rq8vLyQnp6OpUuXws/PDwYGBujRo0eR7Qrav/RTS5YswaBBg9C0aVMYGBhgypQpePPmzWfH7e/vj/v378PFxQXq6uoYNmwYunbtipSUlALbKCoq4u+//8bmzZvx/Plz6Ovro0GDBoiOjpbZr/VjampqWLduHSZMmICMjAyYmZmhW7dumDp1qlgnJycH06ZNw4MHD6CkpAQrKyssWLBAfL4AoHfv3nj27BlmzpyJp0+fom7dujhy5IjMQVof09fXL/Q+eHt7Iz4+HlFRUYXWK8q2bdswdOjQz+qjPJMIH28iUk69efMGOjo6SElJgba2dlmHUy5kZWUhIiICrq6uMnug5OveseINYN2+6DpEXxifcyoP8p7zWq9rybmH6z/FG4DPOX0D+PucygM+51/ff/Hvoenp6Xjw4AEsLS0LPIDqW3bo0CFMnjwZN2/e/OqrSolKi7OzM1q1aoXAwMAS93Hr1i20bt0ad+/ehY6OTukF950rzu84rnAlIiIiIiIionLPzc0N9+7dw+PHj2FmZlbW4RAVW0pKCuLi4nDo0KHP6icxMRG///47k62fgQlXIiIiIiIiIiIAvr6+ZR0CUYnp6Ojg33///ex+2rZtWwrRlG/fzBr5+fPnQyKRSP1yS09Px+jRo6Gvrw9NTU10794dSUlJUu0ePnwINzc3qKurw9DQEJMnT8aHDx++cvRERERERERERERE30jC9eLFi1izZg3q1KkjVT5hwgQcPHgQO3fuxKlTp/DkyRN069ZNvJ6dnQ03NzdkZmYiJiYGmzdvRkhICGbOnPm1p0BERERERERERERU9lsKpKamol+/fli3bh1mz54tlqekpGDDhg0ICwtD69atAeSeAGhra4s///wTjRs3xrFjx3D79m388ccfMDIyQt26dREcHIwpU6YgMDAQysrKZTUtIiJRfFi8nIcJfYVgiIiIiArBP7cQERF9vjJPuI4ePRpubm5o27atVML1r7/+QlZWltS+ETY2NqhatSrOnTuHxo0b49y5c7C3t4eRkZFYx8XFBSNHjsStW7dQr169fMfMyMhARkaG+P7NmzcAck/mzMrKKu0pUj7y7rPc9ztbKO4AxYyIqPTlPd/ZyJavPp9z+g7xOafygH9uofKAv8+/Pv7dk4jov6tME67h4eG4fPkyLl68KHPt6dOnUFZWRsWKFaXKjYyM8PTpU7HOx8nWvOt51woyb948BAUFyZQfO3YM6urqxZ0GfYbIyMgv0/GdiC/TL1EJ3Kl4R656t+Wr9lHHfM7p28HnnMoD/rmFygP+Pv960tLSyjoEIiL6Qsos4fro0SOMHz8ekZGRUFVV/apjT5s2DRMnThTfv3nzBmZmZmjfvj20tbW/aizlVVZWFiIjI9GuXTtUqFCh6AZxJ4o3gFXrkgVGVIrynvOar2vK9dU8i0b3izcAn3P6BvA5p/KAf26h8oC/z7++vG9aEhHRf0+ZJVz/+usvJCcno379+mJZdnY2Tp8+jV9//RVHjx5FZmYmXr9+LbXKNSkpCcbGxgAAY2NjXLhwQarfpKQk8VpBVFRUoKKiIlNeoUIF+f4QTaXm8c7Hcu4RJSlex/wc6Rui+P//rygVFPmc0/eLzzmVB3L/WZHPOX3H+Pv86+HfPYmI/rsUymrgNm3a4MaNG7h69ar4cnJyQr9+/cT/rlChAo4fPy62uXPnDh4+fIgmTZoAAJo0aYIbN24gOTlZrBMZGQltbW3UqlXrq8+JiIiIiIiIiL5PL168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJkdkukr5tR44cQd26dZGTk1PWoXy3yizhqqWlhdq1a0u9NDQ0oK+vj9q1a0NHRweDBw/GxIkTcfLkSfz111/w8fFBkyZN0LhxYwBA+/btUatWLQwYMADXrl3D0aNH4e/vj9GjR+e7gpWIiIiIiIiIysDfkq/3KqE5c+bA3d0dFhYWAHITqRKJpMDXqVOnSunmfDlNmzZFYmIidHR0vug4EokE+/bt+6JjfCmBgYGwsbGBhoYGdHV10bZtW5w/f77QNi9evECHDh1gamoKFRUVmJmZYcyYMVJbhZw5cwbNmjWDvr4+1NTUYGNjg6VLl8r09dtvv8HCwgKqqqpo1KiRzDe5LSwsIJFIEB4eLtPWzs4OEokEISEhxZrzxYsX0aZNG1SsWBG6urpwcXHBtWvXxOsdOnRAhQoVEBoaWqx+6f+UWcJVHkuXLkWnTp3QvXt3tGjRAsbGxtizZ494XVFREf/73/+gqKiIJk2aoH///hg4cCBmzZpVhlETERERERER0fckLS0NGzZswODBg8WyPXv2IDExUeqVkJCA2rVrw8nJCY0aNSrDiOWjrKwMY2NjSCQlT0T/19WoUQO//vorbty4gTNnzsDCwgLt27fHs2fPCmyjoKAAd3d3HDhwAHfv3kVISAj++OMPjBgxQqyjoaGBMWPG4PTp04iNjYW/vz/8/f2xdu1asc727dsxceJEBAQE4PLly3BwcICLi4vUN7kBwMzMDJs2bZIq+/PPP/H06VNoaGgUa76pqano0KEDqlativPnz+PMmTPQ0tKCi4sLsrKyxHre3t5YsWJFsfqm//NNJVyjoqKwbNky8b2qqip+++03vHz5Eu/evcOePXtk9mY1NzdHREQE0tLS8OzZMyxatAhKSmW2NS0RERERERERfWciIiKgoqIifqMWAPT09GBsbCz1Cg4OxvPnz7F3716pA8BzcnLw448/im0CAwPFa/Hx8ZBIJLh69apY9vr1a0gkEkRFRYllp06dQsOGDaGiogITExNMnToVHz58EK+3bNkSY8eOha+vL3R1dWFkZIR169bh3bt38PHxgZaWFqpXr47Dhw+LbT7dUiDv6/1Hjx6Fra0tNDU10aFDByQmJkrdj40bN8LOzk6MZcyYMZ95h/9PXFwc3N3dYWRkBE1NTTRo0AB//PGHVB0LCwvMnj0bAwcOhKamJszNzXHgwAE8e/YM7u7u0NTURJ06dXDp0iWxzYsXL+Dp6YnKlStDXV0d9vb22LZtW5Hx9O3bF23btkW1atVgZ2eHJUuW4M2bN7h+/XqBbXR1dTFy5Eg4OTnB3Nwcbdq0wahRoxAdHS3WqVevHjw9PWFnZwcLCwv0798fLi4uUnWWLFmCoUOHwsfHB7Vq1cLq1auhrq6OjRs3So3Xr18/nDp1Co8ePRLLNm7ciH79+hU7B/b333/j5cuXmDVrFmrWrAk7OzsEBAQgKSkJCQkJYr3OnTvj0qVLiIuLK1b/lOubSrgSEREREX2r4sPicX/z/SJfRET0/YmOjoajo2OhdVauXInff/8du3fvRpUqVaSubd68GRoaGjh//jx+/vlnzJo1C5GRkXKP//jxY7i6uqJBgwa4du0aVq1ahQ0bNmD27Nky4xgYGODChQsYO3YsRo4ciZ49e6Jp06a4fPky2rdvjwEDBiAtLa3AsdLS0rBo0SJs2bIFp0+fxsOHD+Hn5ydeX7VqFUaPHo1hw4bhxo0bOHDgAKpXry73XIqSmpoKV1dXHD9+HFeuXEGHDh3QuXNnPHz4UKre0qVL0axZM1y5cgVubm4YMGAABg4ciP79++Py5cuwsrLCwIEDIQgCACA9PR2Ojo44dOgQbt68iWHDhmHAgAEyX9EvTGZmJtauXQsdHR04ODjI3e7JkyfYs2cPnJ2dC6xz5coVxMTEiHUyMzPx119/oW3btmIdBQUFtG3bFufOnZNqa2RkBBcXF2zevBlA7me4fft2DBo0SO4Y89SsWRP6+vrYsGEDMjMz8f79e2zYsAG2trbidhoAULVqVRgZGUkliEl+TLgSERERERERUbmWkJAAU1PTAq+fPn0avr6++O2339C0aVOZ63Xq1EFAQACsra0xcOBAODk5SR0CXpSVK1fCzMwMv/76K2xsbNC1a1cEBQVh8eLFUgcXOTg4wN/fH9bW1pg2bRpUVVVhYGCAoUOHwtraGjNnzsSLFy8KXZ2ZlZWF1atXw8nJCfXr18eYMWOkYp09ezYmTZqE8ePHo0aNGmjQoEGpHgrm4OCA4cOHo3bt2rC2tkZwcDCsrKxw4MABqXqurq4YPny4OK83b96gQYMG6NmzJ2rUqIEpU6YgNjYWSUlJAIDKlSvDz88PdevWRbVq1TB27Fh06NABO3bsKDKm//3vf9DU1ISqqiqWLl2KyMhIGBgYFNnO09MT6urqqFy5MrS1tbF+/XqZOlWqVIGKigqcnJwwevRoDBkyBADw/PlzZGdnw8jISKq+kZERnj59KtPPoEGDEBISAkEQsGvXLlhZWaFu3bpFxvgpLS0tREVFYevWrVBTU4OmpiaOHDmCw4cPy6yWNTU1lVr1SvJjwpWIiIiIiIiIyrX3799LbRHwsYcPH6JHjx4YNmyYmCz7VJ06daTem5iYyOzDWZjY2Fg0adJEaq/VZs2aITU1Ff/++2++4ygqKkJfXx/29vZiWV7yrrCx1dXVYWVllW+sycnJePLkCdq0aZNv2xEjRkBTU1N8lURqair8/Pxga2uLihUrQlNTE7GxsTIrXD+ea968CptrdnY2goODYW9vDz09PWhqauLo0aNiv6GhoVKxf7xys1WrVrh69SpiYmLQoUMH9OrVS+y3Y8eOYhs7OzupGJcuXYrLly9j//79iIuLw8SJE2XmGx0djUuXLmH16tVYtmyZXNsc5MfNzQ2pqak4ffo0Nm7cWKLVrUDusz548GA0a9YMf/75J86ePYvatWvDzc0N79+/l6qrpqZW6GppKhg3OyUiIiIiIiKics3AwACvXr2SKX///j08PDxgZ2cndebMpypUqCD1XiKRiCtTFRRy17rlffUdgNThRMWR3zgfl+UlbD9eFStPH3mxqampFTr+rFmzpLYfKAk/Pz9ERkZi0aJFqF69OtTU1NCjRw9kZmYWGGfevAqb68KFC7F8+XIsW7YM9vb20NDQgK+vr9hvly5dpA46q1y5svjfGhoaqF69OqpXr47GjRvD2toaGzZswLRp07B+/XoxEfnpvcvb29fGxgZ6enpo3rw5ZsyYARMTE7GOpaUlgNxkcVJSEgIDA+Hp6QkDAwMoKiqKK3TzJCUlyZxfBABKSkoYMGAAAgICcP78eezdu7fQ+1yQsLAwxMfH49y5c+KzGRYWBl1dXezfvx99+vQR6758+RKVKlUq0TjlHROuRERERERERFSu1atXD1u3bpUpHzJkCF6+fImjR4+W+IDuvIRVYmIi6tWrBwBSB2gBgK2tLXbv3g1BEMRE4tmzZ6GlpSWzX+yXpKWlBQsLCxw/fhytWrWSuW5oaAhDQ8PPGuPs2bPw9vaGh4cHgNwVr/Hx8Z/VZ16/7u7u6N+/P4DcROzdu3dRq1YtALlz09LSkquvnJwcZGRkAJBOzBbVBoDYrqh+lZWV4ejoiOPHj6Nr167i9ePHjxd4SNmgQYOwaNEi9O7dG7q6unLF9am0tDQoKChIrabOe/9xoj49PR1xcXHiM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRPFVYhfS2BgIEaMGAFDQ0N07NgRb9++xdmzZzF27NhC2z148EAmkWxtbS1Tz9raGnv27EHnzp0hkUgwY8aMQlfkysva2hq7du1CTEwMdHV1sWTJEiQlJYkJ1/y8e/cOc+bMQZcuXWBiYoLnz5/jt99+w+PHj9GzZ88C20VERCApKQkNGjSApqYmbt26hcmTJ6NZs2biwVO//fYbqlatChsbGwC5+wAvWrQI48aNE/uZOHEivLy84OTkhIYNG2LZsmV49+4dfHx88h3X1tYWz58/h7q6egnuUK527dph8uTJGD16NMaOHYucnBzMnz8fSkpKUkn2P//8EyoqKmjSpEmJxyrPmHAlIiIiIiIionLN3t4e9evXx44dOzB8+HAAuQdZZWVloUOHDvm22bRpE7y9veXqf+PGjRg8eDAcHR1Rs2ZN/Pzzz2jfvr14vXLlyoiIiMDkyZPh4OAAPT09DB48WCYx+zV4eXkhPT0dS5cuhZ+fHwwMDNCjR48i2xW0f+mnlixZgkGDBqFp06YwMDDAlClT8ObNm8+O29/fH/fv34eLiwvU1dUxbNgwdO3aFSkpKQW2UVRUxN9//43Nmzfj+fPn0NfXR4MGDRAdHS2zX+vH1NTUsG7dOkyYMAEZGRkwMzNDt27dMHXqVLFOTk4Opk2bhgcPHkBJSQlWVlZYsGCB+HwBQO/evfHs2TPMnDkTT58+Rd26dXHkyBGZg7Q+pq+vX+h98Pb2Rnx8PKKiovK9bmNjg4MHDyIoKAhNmjSBgoIC6tWrhyNHjkhthbBt2zb069fvs5K75ZlE+HgTkXLqzZs30NHRQUpKCrS1tcs6nHIhKysLERERqPW6FhShWGT9ak3/Kd4A1u2LrkP0hfE5p/KAzzmVB3zOqTzgc/71/Rf/Hpqeno4HDx7A0tKywAOovmWHDh3C5MmTcfPmza++qpSotDg7O6NVq1YIDAwscR/Pnz9HzZo1cenSJXEPWire7ziucCUiIiIiIiKics/NzQ337t3D48ePYWZmVtbhEBVbSkoK4uLicOjQoc/qJz4+HitXrmSy9TMw4UpEREREREREBMDX17esQyAqMR0dHfz777+f3Y+TkxOcnJxKIaLyi2vkiYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolDDhSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERFRuffixQsYGhoiPj6+rEORW1RUFCQSCV6/fg0ACAkJQcWKFcs0ppKKj4+HRCLB1atXyzoUKqcaN26M3bt3l0pfSqXSCxERERERERFRAe5vvv/VxqrmVa1E7ebMmQN3d3dYWFiUbkCfiIqKQqtWrfDq1avvNjlaUi1btkTdunWxbNmysg6l2Pbs2YO5c+fin3/+QVZWFqytrTFp0iQMGDCgwDaJiYmYNGkSLl26hH/++Qfjxo2Ta+7Z2dkIDAzE1q1b8fTpU5iamsLb2xv+/v6QSCQAcu/lqVOnxDaGhoZo0aIFFi1aBHNz8wL7Lu3nLzAwEPv27ftqiXILCwv4+vrC19e3xH2Eh4fD09MT7u7u2Ldvn1ju7++PCRMmwMPDAwoKn7dGlStciYiIiIiIiKhcS0tLw4YNGzB48OCyDoW+UXp6epg+fTrOnTuH69evw8fHBz4+Pjh69GiBbTIyMlCpUiX4+/vDwcFB7rEWLFiAVatW4ddff0VsbCwWLFiAn3/+Gb/88otUvaFDhyIxMRFPnjzB/v378ejRI/Tv37/Ec/ySsrKyyjoEALkrqf38/NC8eXOZax07dsTbt29x+PDhzx6HCVciIiIiIiIiKtciIiKgoqKCxo0bi2V5X9c/evQo6tWrBzU1NbRu3RrJyck4fPgwbG1toa2tjb59+yItLU1sl5OTg3nz5sHS0hJqampwcHDArl27AOQme1q1agUA0NXVhUQigbe3NwDgyJEj+OGHH1CxYkXo6+ujU6dOiIuL+6x5xcXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23btn1WrJ/Kzs7G4MGDxftYs2ZNLF++XKqOt7c3unbtirlz58LIyAgVK1bErFmz8OHDB0yePBl6enqoUqUKNm3aJNVuypQpqFGjBtTV1VGtWjXMmDGjyIRgy5Yt4eHhAVtbW1hZWWH8+PGoU6cOzpw5U2AbCwsLLF++HAMHDoSOjo7cc4+JiYG7uzvc3NxgYWGBHj16oH379rhw4YJUPXV1dRgbG8PExASNGzfGmDFjcPnyZbnHAf5vG4qjR4/C1tYWmpqa6NChAxITE8U6UVFRaNiwITQ0NFCxYkU0a9YMCQkJCAkJQVBQEK5duwaJRAKJRIKQkBAAgEQiwapVq9ClSxdoaGhgzpw5+W55sW/fPnHVbp6DBw+iQYMGUFVVhYGBATw8PADkfgYJCQmYMGGCOF5xZGdno1+/fggKCkK1arIr4RUVFeHq6orw8PBi9ZsfJlyJiIiIiIiIqFyLjo6Go6NjvtcCAwPx66+/IiYmBo8ePUKvXr2wbNkyhIWF4dChQzh27JjUysN58+bh999/x+rVq3Hr1i1MmDAB/fv3x6lTp2BmZibuEXnnzh0kJiaKScR3795h4sSJuHTpEo4fPw4FBQV4eHggJyenxPNKTU2Fq6srjh8/jitXrqBDhw7o3LkzHj58KFVv6dKlaNasGa5cuQI3NzcMGDAAAwcORP/+/XH58mVYWVlh4MCBEAQBAJCeng5HR0ccOnQIN2/exLBhwzBgwACZhODnyMnJQZUqVbBz507cvn0bM2fOxE8//YQdO3ZI1Ttx4gSePHmC06dPY8mSJQgICECnTp2gq6uL8+fPY8SIERg+fDj+/fdfsY2WlhZCQkJw+/ZtLF++HOvWrcPSpUvljk0QBBw/fhx37txBixYtSm3OeZo2bYrjx4/j7t27AIBr167hzJkz6NixY4FtXr58iR07dqBRo0bFHi8tLQ2LFi3Cli1bcPr0aTx8+BB+fn4AgA8fPqBr165wdnbG9evXce7cOQwbNgwSiQS9e/fGpEmTYGdnh8TERCQmJqJ3795iv4GBgfDw8MCNGzcwaNAguWI5dOgQPDw84OrqiitXruD48eNo2LAhgNxtHapUqYJZs2aJ4xXHrFmzYGhoWOhK9oYNGyI6OrpY/eaHe7gSERERERERUbmWkJAAU1PTfK/Nnj0bzZo1AwAMHjwY06ZNQ1xcnLhCrkePHjh58iSmTJmCjIwMzJ07F3/88QeaNGkCAKhWrRrOnDmDNWvWwNnZGXp6egBy99z8eLVf9+7dpcbduHEjKlWqhNu3b6N27dolmpeDg4PUV9mDg4Oxd+9eHDhwAGPGjBHLXV1dMXz4cADAzJkzsWrVKjRo0AA9e/YEkLsitEmTJkhKSoKxsTEqV64sJuQAYOzYsTh69Ch27NghJsc+V4UKFRAUFCS+t7S0xLlz57Bjxw706tVLLNfT08OKFSugoKCAmjVr4ueff0ZaWhp++uknAMC0adMwf/58nDlzBn369AGQu1dnHgsLC/j5+SE8PBw//vhjoTGlpKSgcuXKyMjIgKKiIlauXIl27dqVynw/NnXqVLx58wY2NjZQVFREdnY25syZg379+knVW7lyJdavXw9BEJCWloYaNWoUusVBQbKysrB69WpYWVkBAMaMGYNZs2YBAN68eYOUlBR06tRJvG5rayu21dTUhJKSEoyNjWX67du3L3x8fIoVy5w5c9CnTx+pzz7vGdbT04OioiK0tLTyHa8wZ86cwYYNG4rca9bU1BSPHj1CTk7OZ+3jyhWuRERERERERFSuvX//Hqqqqvleq1OnjvjfRkZG4tfQPy5LTk4GAPzzzz9IS0tDu3btoKmpKb5+//33IrcHuHfvHjw9PVGtWjVoa2uLh3flrUbt2LGj2J+dnZ1c80pNTYWfnx9sbW1RsWJFaGpqIjY2VmaF66dzBAB7e3uZsrx5ZmdnIzg4GPb29tDT04OmpiaOHj0q9hsaGio1/5KuGPztt9/g6OiISpUqQVNTE2vXrpWJ3c7OTioxZmRkJBW7oqIi9PX1xdgBYPv27WjWrBmMjY2hqakJf39/sd+HDx9KxT537lyxnZaWFq5evYqLFy9izpw5mDhxIqKioko0NyB3ZfXHY4WGhgIAduzYgdDQUISFheHy5cvYvHkzFi1ahM2bN0u179evH65evSqugK1evTrat2+Pt2/fivcmr+/CVseqq6uLyVQAMDExEe+Xnp4evL294eLigs6dO2P58uVyryx1cnIq1v0AgKtXr6JNmzbFbleYt2/fYsCAAVi3bh0MDAwKraumpoacnBxkZGR81phc4UpERERERERE5ZqBgQFevXqV77UKFSqI/y2RSKTe55Xlfe0/NTUVQO7XoitXrixVT0VFpdAYOnfuDHNzc6xbtw6mpqbIyclB7dq1kZmZCQBYv3493r9/LxNTYfz8/BAZGYlFixahevXqUFNTQ48ePcQ+C5pjQWV581y4cCGWL1+OZcuWwd7eHhoaGvD19RX77dKli9RX2z+9F/IIDw+Hn58fFi9ejCZNmkBLSwsLFy7E+fPnC4w9L9bCPqNz586J+3i6uLhAR0cH4eHhWLx4MYDcFY4fr4LMW5EMAAoKCqhevToAoG7duoiNjcW8efPQsmXLYs8PyE1IfjxWXmJ78uTJmDp1qrgi197eHgkJCZg3bx68vLzE+jo6OmI81atXx4YNG2BiYoLt27djyJAhiIiIEPemVVNTKzCO/O5X3vYRALBp0yaMGzcOR44cwfbt2+Hv74/IyEipPY/zo6GhIfVeQUFBql9A9jCtwuIsqbi4OMTHx6Nz585iWd7zoKSkhDt37ogJ55cvX0JDQ+Oz42DClYiIiIiIiIjKtXr16mHr1q2f3U+tWrWgoqKChw8fwtnZOd86ysrKAHJXieZ58eIF7ty5g3Xr1omnp396GFNJkpZnz56Ft7e3eOhQamoq4uPji91Pfv26u7ujf//+AHKTV3fv3kWtWrUA5K4E1dLS+uwxmjZtilGjRolln3uIGJB7IJW5uTmmT58uliUkJIj/raSkJCYxi/K5KyHV1NTyHSstLU3m6+yKiopF7uerqKgIAGJi3tzcvMSxfapevXqoV68epk2bhiZNmiAsLAyNGzeGsrKy1LNcmEqVKuHt27d49+6dmIz99Cv+derUwfHjxwvciqA44+WxsbHBjRs3pMr8/f3x9u1bLF++HGZmZmL5zZs3Ua9evWL1nx8mXImIiIiIiIioXHNxccG0adPw6tUr6OrqlrgfLS0t+Pn5YcKECcjJycEPP/yAlJQUnD17Ftra2vDy8oK5uTkkEgn+97//wdXVFWpqatDV1YW+vj7Wrl0LExMTPHz4EFOnTv3seVlbW2PPnj3o3LkzJBIJZsyY8VmHcH3c765duxATEwNdXV0sWbIESUlJYsK1MM+ePZNJspmYmOQ7xu+//46jR4/C0tISW7ZswcWLF2FpafnZsT98+BDh4eFo0KABDh06hL179xbZbt68eXBycoKVlRUyMjIQERGBLVu2YNWqVWKdadOm4fHjx/j999/Fsry5pqaminNXVlYu9F517twZc+bMQdWqVWFnZ4crV65gyZIlMgdPpaWl4enTpwCApKQkBAcHQ1VVFe3bty/OLSnUgwcPsHbtWnTp0gWmpqa4c+cO7t27h4EDBwLI3QP3wYMHuHr1KqpUqQItLa0CV3M3atQI6urq+OmnnzBu3DicP38eISEhUnUCAgLQpk0bWFlZoU+fPvjw4QMiIiIwZcoUcbzTp0+jT58+UFFRKXKLAABQVVWV2Qc5b//kT8ujo6NL5f4x4UpEREREREREX1Q1r2pFVypD9vb2qF+/Pnbs2CEeHlVSwcHBqFSpEubNm4f79++jYsWKqF+/vniIU+XKlREUFISpU6fCx8cHAwcOREhICMLDwzFu3DjUrl0bNWvWxIoVK0r8VfU8eUm6pk2bwsDAAFOmTMGbN28+q08gd3Xg/fv34eLiAnV1dQwbNgxdu3ZFSkpKkW3DwsIQFhYmVRYcHCyuls0zfPhwXLlyBb1794ZEIoGnpydGjRqFw4cPf1bsXbp0wYQJEzBmzBhkZGTAzc0NM2bMQGBgYKHt3r17h1GjRuHff/+FmpoabGxssHXrVvTu3Vusk5iYKLPH7MerJf/66y+EhYXB3Ny80JXGv/zyC2bMmIFRo0YhOTkZpqamGD58OGbOnClVb926dVi3bh0AQFdXF3Xq1EFERARq1qwp590omrq6Ov7++29s3rwZL168gImJCUaPHi3+nHTv3h179uxBq1at8Pr1a2zatAne3t759qWnp4etW7di8uTJWLduHdq0aYPAwEAMGzZMrNOyZUvs3LkTwcHBmD9/PrS1tdGiRQvx+qxZszB8+HAx8Z23RYFEIil0bHk8fvwYMTExpbLaXSJ8unlCOfTmzRvo6OggJSUF2traZR1OuZCVlYWIiAjUel0LilAssn61pv8UbwDr0vvXHKKS4nNO5QGfcyoP+JxTecDn/Ov7L/49ND09HQ8ePIClpWWBB1B9yw4dOoTJkyfj5s2bn3U6ORF9XQ8ePECNGjVw+/ZtWFtbl7ifKVOm4NWrV1i7dm2+14vzO44rXImIiIiIiIio3HNzc8O9e/fw+PFjqT0diejbFhERgWHDhn1WshUADA0NMXHixFKJiQlXIiIiIiIiIiIAvr6+ZR0CERXT6NGjS6WfSZMmlUo/AMA18kRERERERERERESlhAlXIiIiIiIiIiIiolLChCsRERERERERERFRKWHClYiIiIiIiIiIiKiUMOFKREREREREREREVEqYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIqNx78eIFDA0NER8fX9ahyC0qKgoSiQSvX78GAISEhKBixYplGlNJxcfHQyKR4OrVq2UdCpVTffr0weLFi0ulLyZciYiIiIiIiOjLunfs671KaM6cOXB3d4eFhUXpzTsfnyZJy5OWLVvC19e3rMMokT179sDJyQkVK1aEhoYG6tatiy1bthTaJjExEX379kWNGjWgoKAg99yzs7MxY8YMWFpaQk1NDVZWVggODoYgCGKdli1bQiKRiC8jIyP07NkTCQkJhfZd2s9fYGAg6tatWyp9ycPCwgLLli0rUdtly5ahZs2aUFNTg5mZGSZMmID09HTxur+/P+bMmYOUlJTPjpMJVyIiIiIiIiIq19LS0rBhwwYMHjy4rEOhb5Senh6mT5+Oc+fO4fr16/Dx8YGPjw+OHj1aYJuMjAxUqlQJ/v7+cHBwkHusBQsWYNWqVfj1118RGxuLBQsW4Oeff8b/a+++43s6//+PP9+JiJAlZIiRWDVjKxEtSimamq29qqVFrcZq7f1pzVZLqQoqDdpq1aymNlW7VT5aM1oxPlYQIuv3h5/z9a4g7+REJHnce3O7eZ9zneu8rncvVU/Xuc7HH39s1e7NN99UVFSUzp49q++//15nzpxRx44dUz3G9BQXF5eh9w8LC9PQoUM1atQoHTlyRPPnz9fSpUv13nvvGW3Kly+v4sWL68svv0zz/QhcAQAAAABAtrZmzRo5OjqqZs2axrF7KwHXr1+vypUry8nJSS+88IIuXLigtWvXqkyZMnJ1dVX79u0VExNjXJeYmKhJkyYZqxMrVqyor7/+WtLdx+br1asnScqbN68sFou6du0qSVq3bp1q164td3d35cuXTy+//LKOHz+epnEdP35czZo1k7e3t5ydnVW9enX99NNPVm38/f01fvx4de7cWc7OzvLz89PKlSt18eJFNWvWTM7OzqpQoYL27NljXHPp0iW1a9dOBQsWVO7cuRUQEKCvvvoqTbX+W0JCgrp37258j6VKldLMmTOt2nTt2lXNmzfXxIkT5e3tLXd3d40dO1bx8fEaNGiQPDw8VKhQIS1YsMDquiFDhuiZZ55R7ty5VaxYMY0YMeKxgWDdunXVokULlSlTRsWLF1e/fv1UoUIFbdu27aHX+Pv7a+bMmercubPc3NxSPPYdO3aoWbNmatq0qfz9/dW6dWs1bNhQv/76q1W73Llzy8fHRwUKFFDNmjXVp08f7du3L8X3kf5vG4r169erTJkycnZ21ksvvaSoqCijzaZNm/Tss88qT548cnd3V1BQkE6fPq3Q0FCNGTNGBw8eNFbahoaGSpIsFotmz56tV155RXny5NGECROS3fLiu+++k8VisTr2ww8/qHr16sqVK5fy58+vFi1aSLr77+D06dMaMGCAcb+U2rFjh4KCgtS+fXv5+/urYcOGateu3QPfaXBwsMLDw234BpNH4AoAAAAAALK1rVu3qmrVqsmeGz16tGbNmqUdO3bozJkzeu211zRjxgyFhYVp9erV+vHHH61WHk6aNEmLFi3SnDlz9Mcff2jAgAHq2LGjNm/erMKFC+ubb76RJB09elRRUVFGiHjz5k0NHDhQe/bsUUREhOzs7NSiRQslJiamelw3btxQkyZNFBERof379+ull15ScHCwIiMjrdpNnz5dQUFB2r9/v5o2bapOnTqpc+fO6tixo/bt26fixYurc+fOxiPtt2/fVtWqVbV69WodOnRIPXr0UKdOnR4Ir9IiMTFRhQoV0vLly3X48GGNHDlS7733npYtW2bV7ueff9bZs2e1ZcsWTZs2TaNGjdLLL7+svHnzateuXXrrrbfUs2dP/f3338Y1Li4uCg0N1eHDhzVz5kzNmzdP06dPT3FtSUlJioiI0NGjR/X888+bNuZ7atWqpYiICP3555+SpIMHD2rbtm1q3LjxQ6+5fPmyli1bpho1ath8v5iYGE2ZMkWLFy/Wli1bFBkZqZCQEElSfHy8mjdvrjp16ui3337Tzp071aNHD1ksFrVp00bvvvuuypUrp6ioKEVFRalNmzZGv6NHj1aLFi30+++/6/XXX09RLatXr1aLFi3UpEkT7d+/XxEREXr22Wcl3d3WoVChQho7dqxxv5SqVauW9u7da8zREydOaM2aNWrSpIlVu2effVa//vqrYmNjU9x3cnKk6WoAAAAAAIBM7vTp0/L19U323Pjx4xUUFCRJ6t69u4YNG6bjx4+rWLFikqTWrVtr48aNGjJkiGJjYzVx4kT99NNPCgwMlCQVK1ZM27Zt02effaY6derIw8NDkuTl5WW12q9Vq1ZW9/3iiy/k6empw4cPq3z58qkaV8WKFa0eZR83bpxWrFihlStXqk+fPsbxJk2aqGfPnpKkkSNHavbs2apevbpeffVVSXdXhAYGBur8+fPy8fFRwYIFjUBOkt555x2tX79ey5YtM8KxtHJwcNCYMWOMz0WLFtXOnTu1bNkyvfbaa8ZxDw8PffTRR7Kzs1OpUqX0wQcfKCYmxnhUfNiwYZo8ebK2bdumtm3bSrq7V+c9/v7+CgkJUXh4uAYPHvzImq5du6aCBQsqNjZW9vb2+vTTT/Xiiy+aMt77DR06VNHR0SpdurTs7e2VkJCgCRMmqEOHDlbtPv30U33++edKSkpSTEyMnnnmmUducfAwcXFxmjNnjooXLy5J6tOnj8aOHStJio6O1rVr1/Tyyy8b58uUKWNc6+zsrBw5csjHx+eBftu3b69u3brZVMuECRPUtm1bq3/39+awh4eH7O3t5eLikuz9HqV9+/b63//+p9q1ayspKUnx8fF66623rLYUkCRfX1/duXNH586dk5+fn033uB8rXAEAAAAAQLZ269Yt5cqVK9lzFSpUMH7u7e1tPIZ+/7ELFy5Iko4dO6aYmBi9+OKLcnZ2Nn4sWrTosdsD/PXXX2rXrp2KFSsmV1dX4+Vd91ajNm7c2OivXLlyKRrXjRs3FBISojJlysjd3V3Ozs46cuTIAytc/z1GSQoICHjg2L1xJiQkaNy4cQoICJCHh4ecnZ21fv16o98lS5ZYjX/r1q0pqvffPvnkE1WtWlWenp5ydnbW3LlzH6i9XLlysrP7v3jL29vbqnZ7e3vly5fPqF2Sli5dqqCgIPn4+MjZ2VnDhw83+o2MjLSqfeLEicZ1Li4uOnDggHbv3q0JEyZo4MCB2rRpU6rGJt1dWX3/vZYsWSJJWrZsmZYsWaKwsDDt27dPCxcu1JQpU7Rw4UKr6zt06KADBw4YK2BLlCihhg0b6vr168Z3c6/vR62OzZ07txGmSlKBAgWM78vDw0Ndu3ZVo0aNFBwcrJkzZ6Z4ZWm1atVs+j4k6cCBA6pfv77N1z3Opk2bNHHiRH366afat2+fvv32W61evVrjxo2zaufk5CRJVtuEpAYrXAEAAAAAQLaWP39+XblyJdlzDg4Oxs8tFovV53vH7j32f+PGDUl3H4suWLCgVTtHR8dH1hAcHCw/Pz/NmzdPvr6+SkxMVPny5XXnzh1J0ueff65bt249UNOjhISEaMOGDZoyZYpKlCghJycntW7d2ujzYWN82LF74/zwww81c+ZMzZgxQwEBAcqTJ4/69+9v9PvKK69YPdr+7+8iJcLDwxUSEqKpU6cqMDBQLi4u+vDDD7Vr166H1n6v1kf9O9q5c6c6dOigMWPGqFGjRnJzc1N4eLimTp0q6e4KxwMHDhjX3luRLEl2dnYqUaKEJKlSpUo6cuSIJk2apLp169o8PuluIHn/ve4F24MGDdLQoUONFbkBAQE6ffq0Jk2apC5duhjt3dzcjHpKlCih+fPnq0CBAlq6dKneeOMNrVmzxtib9l6QmJzkvq9720dI0oIFC9S3b1+tW7dOS5cu1fDhw7VhwwarPY+TkydPHqvPdnZ2Vv1KD75M61F1psWIESPUqVMnvfHGG5Lufqc3b95Ujx499P777xuh/eXLlyVJnp6eabofgSsAAAAAAMjWKleubMqbycuWLStHR0dFRkaqTp06ybbJmTOnpLurRO+5dOmSjh49qnnz5um5556TpAdexpSa0HL79u3q2rWr8dKhGzdu6NSpUzb3k1y/zZo1U8eOHSXdDWL//PNPlS1bVtLdlaAuLi5pvketWrXUq1cv41haXyIm3X15kp+fn95//33j2OnTp42f58iRwwgxHycxMTFNe306OTkle6+YmBirVbvS3ZW6j9vP197eXpKMYD4tj8T/W+XKlVW5cmUNGzZMgYGBCgsLU82aNZUzZ06rufwonp6eun79um7evGmEsfcHztLd1dYREREP3YrAlvvd72HfqSSrEPjQoUMqVKiQ8ufPb/M97kfgCgAAAAAAsrVGjRpp2LBhunLlivLmzZvqflxcXBQSEqIBAwYoMTFRtWvX1rVr17R9+3a5urqqS5cu8vPzk8Vi0apVq9SkSRM5OTkpb968ypcvn+bOnasCBQooMjJSQ4cOTfO4SpYsqW+//VbBwcGyWCwaMWJEml7CdX+/X3/9tXbs2KG8efNq2rRpOn/+vBG4PsrFixcfCNkKFCiQ7D0WLVqk9evXq2jRolq8eLF2796tokWLprn2yMhIhYeHq3r16lq9erVWrFjx2OsmTZqkatWqqXjx4oqNjdWaNWu0ePFizZ4922gzbNgw/fPPP1q0aJFx7N5Yb9y4YYw9Z86cj/yugoODNWHCBBUpUkTlypXT/v37NW3atAdePBUTE6Nz585Jks6fP69x48YpV65catiwoS1fySOdPHlSc+fO1SuvvCJfX18dPXpUf/31lzp37izp7h64J0+e1IEDB1SoUCG5uLg8dDV3jRo1lDt3br333nvq27evdu3apdDQUKs2o0aNUv369VW8eHG1bdtW8fHxWrNmjYYMGWLcb8uWLWrbtq0cHR1THIwGBwdr2rRpqly5smrUqKFjx45pxIgRCg4ONoJX6e42D2Z8fwSuAAAAAAAgfZU0LwBKDwEBAapSpYqWLVtmvDwqtcaNGydPT09NmjRJJ06ckLu7u6pUqWK8nKdgwYIaM2aMhg4dqm7duqlz584KDQ1VeHi4+vbtq/Lly6tUqVL66KOPUv2o+j33QrpatWopf/78GjJkiKKjo9PUp3T3pVMnTpxQo0aNlDt3bvXo0UPNmzfXtWvXHnttWFiYwsLCrI6NGzfOWC17T8+ePbV//361adNGFotF7dq1U69evbR27do01f7KK69owIAB6tOnj2JjY9W0aVONGDFCo0ePfuR1N2/eVK9evfT333/LyclJpUuX1pdffqk2bdoYbaKioh7YY7Zy5crGz/fu3auwsDD5+fk9cqXxxx9/rBEjRqhXr166cOGCfH191bNnT40cOdKq3bx58zRv3jxJUt68eVWhQgWtWbNGpUqVSuG38Xi5c+fWf//7Xy1cuFCXLl1SgQIF1Lt3b+PXSatWrfTtt9+qXr16unr1qhYsWKCuXbsm25eHh4e+/PJLDRo0SPPmzVP9+vU1evRo9ejRw2hTt25dLV++XOPGjdPkyZPl6uqq559/3jg/duxY9ezZ0wi+761OtVgsj7z38OHDZbFYNHz4cP3zzz/y9PQ0gu17bt++re+++07r1q1L47cmWZL+vXlCNhQdHS03Nzddu3ZNrq6uGV1OthAXF6c1a9ao7NWyspf9Y9sXq3XMths85b+ZI3tgniM7YJ4jO2CeIztgnj95WfHPobdv39bJkydVtGjRh76A6mm2evVqDRo0SIcOHXrg0WMAT6+TJ0/qmWee0eHDh1WyZMlU9zN79mytWLFCP/74Y7LnbflvHCtcAQAAAABAtte0aVP99ddf+ueff1S4cOGMLgdACq1Zs0Y9evRIU9gq3X152Mcff2xKTQSuAAAAAAAAkvr375/RJQCwUe/evU3p54033jClH0lijTwAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAACQ7V26dEleXl46depURpeSYps2bZLFYtHVq1clSaGhoXJ3d8/QmlLr1KlTslgsOnDgQEaXgmzozp078vf31549e0zpL4cpvQAAAAAAADxE49/2PrF7ra1QNVXXTZgwQc2aNZO/v7+5Bf3Lpk2bVK9ePV25ciXThqOpVbduXVWqVEkzZszI6FJs9u2332rixIk6duyY4uLiVLJkSb377rvq1KnTI6+ZPXu2Dhw4oNjYWJUrV06jR49Wo0aNHnuvOXPmaO/evbp8+bL279+vSpUqWbXx9/fX6dOnJUl2dnby9vZW48aNNWXKFOXNm/ehfYeGhqp///5GSJ9WXbt21dWrV/Xdd9+Z0t/jWCwWrVixQs2bN091H5MnT9awYcPUr18/Yy7mzJlTISEhGjJkiCIiItJcJytcAQAAAABAthYTE6P58+ere/fuGV0KnlIeHh56//33tXPnTv3222/q1q2bunXrpvXr1z/0mi1btujFF1/UmjVrtHfvXtWrV0/BwcHav3//I+918+ZN1a5dW//5z38e2W7s2LGKiopSZGSklixZoi1btqhv376pGl96i4uLy+gSJEm7d+/WZ599pgoVKjxwrkOHDtq2bZv++OOPNN+HwBUAAAAAAGRra9askaOjo2rWrGkcu/e4/vr161W5cmU5OTnphRde0IULF7R27VqVKVNGrq6uat++vWJiYozrEhMTNWnSJBUtWlROTk6qWLGivv76a0l3H5uvV6+eJClv3ryyWCzq2rWrJGndunWqXbu23N3dlS9fPr388ss6fvx4msZ1/PhxNWvWTN7e3nJ2dlb16tX1008/WbXx9/fX+PHj1blzZzk7O8vPz08rV67UxYsX1axZMzk7O6tChQpWj1pfunRJ7dq1U8GCBZU7d24FBAToq6++SlOt/5aQkKDu3bsb32OpUqU0c+ZMqzZdu3ZV8+bNNXHiRHl7e8vd3V1jx45VfHy8Bg0aJA8PDxUqVEgLFiywum7IkCF65plnlDt3bhUrVkwjRox4bCBYt25dtWjRQmXKlFHx4sXVr18/VahQQdu2bXvoNTNmzNDgwYNVvXp1lSxZUhMnTlTJkiX1ww8/PPJenTp10siRI9WgQYNHtnNxcZGPj48KFiyoevXqqUuXLtq3b98jr/m30aNHq1KlSlq8eLH8/f3l5uamtm3b6vr160abr7/+WgEBAXJyclK+fPnUoEED3bx5U6NHj9bChQv1/fffy2KxyGKxaNOmTcb2EEuXLlWdOnWUK1cuLVmyxLjXv7+jf68q/+KLL1SuXDk5OjqqQIEC6tOnjyQZ7Vq0aCGLxWLzavQbN26oQ4cOmjdvXrKrgPPmzaugoCCFh4fb1G9yCFwBAAAAAEC2tnXrVlWtmvxWBKNHj9asWbO0Y8cOnTlzRq+99ppmzJihsLAwrV69Wj/++KM+/vhjo/2kSZO0aNEizZkzR3/88YcGDBigjh07avPmzSpcuLC++eYbSdLRo0cVFRVlhIg3b97UwIEDtWfPHkVERMjOzk4tWrRQYmJiqsd148YNNWnSRBEREdq/f79eeuklBQcHKzIy0qrd9OnTFRQUpP3796tp06bq1KmTOnfurI4dO2rfvn0qXry4OnfurKSkJEnS7du3VbVqVa1evVqHDh1Sjx491KlTJ/3666+prvXfEhMTVahQIS1fvlyHDx/WyJEj9d5772nZsmVW7X7++WedPXtWW7Zs0bRp0zRq1Ci9/PLLyps3r3bt2qW33npLPXv21N9//21c4+LiotDQUB0+fFgzZ87UvHnzNH369BTXlpSUpIiICB09elTPP/+8TWO6fv26PDw8UnxNSv3zzz/64YcfVKNGDZuvPX78uL777jutWrVKq1at0ubNmzV58mRJUlRUlNq1a6fXX39dR44c0aZNm9SyZUslJSUpJCREr732ml566SVFRUUpKipKtWrVMvodOnSo+vXrpyNHjjx2G4V7Zs+erd69e6tHjx76/ffftXLlSpUoUULS3dWpkrRgwQJFRUUZn1Oqd+/eatq06SOD7GeffVZbt261qd/ksIcrAAAAAADI1k6fPi1fX99kz40fP15BQUGSpO7du2vYsGE6fvy4ihUrJklq3bq1Nm7cqCFDhig2NlYTJ07UTz/9pMDAQElSsWLFtG3bNn322WeqU6eOEbZ5eXlZ7eHaqlUrq/t+8cUX8vT01OHDh1W+fPlUjatixYqqWLGi8XncuHFasWKFVq5caawalKQmTZqoZ8+ekqSRI0dq9uzZql69ul599VVJd1eEBgYG6vz588aKypCQEOP6d955R+vXr9eyZcv07LPPpqrWf3NwcNCYMWOMz0WLFtXOnTu1bNkyvfbaa8ZxDw8PffTRR7Kzs1OpUqX0wQcfKCYmRu+9954kadiwYZo8ebK2bdumtm3bSpKGDx9uXO/v76+QkBCFh4dr8ODBj6zp2rVrKliwoGJjY2Vvb69PP/1UL774YorHNGXKFN24ccOq/rQYMmSIhg8froSEBN2+fVs1atTQtGnTbO4nMTFRoaGhcnFxkXR3hW1ERIQmTJigqKgoxcfHq2XLlvLz85MkBQQEGNc6OTkpNjZWPj4+D/Tbv39/tWzZ0qZaxo8fr3fffVf9+vUzjlWvXl2S5OnpKUlyd3dP9n6PEh4ern379j02pPX19TX2xk0LVrgCAAAAAIBs7datW8qVK1ey5+7f69Hb29t4DP3+YxcuXJAkHTt2TDExMXrxxRfl7Oxs/Fi0aNFjtwf466+/1K5dOxUrVkyurq7G49L3VqM2btzY6K9cuXIpGteNGzcUEhKiMmXKyN3dXc7Ozjpy5MgDK1z/PUbJOlS7d+zeOBMSEjRu3DgFBATIw8NDzs7OWr9+vdHvkiVLrMaf2hWDn3zyiapWrSpPT085Oztr7ty5D9Rerlw52dn9X7zl7e1tVbu9vb3y5ctn1C5JS5cuVVBQkHx8fOTs7Kzhw4cb/UZGRlrVPnHiROM6FxcXHThwQLt379aECRM0cOBAbdq0KUVjCQsL05gxY7Rs2TJ5eXlJSvv3NGjQIB04cEC//fab8aKnpk2bKiEhQZKs+n7rrbce2o+/v78RtkpSgQIFjO+rYsWKql+/vgICAvTqq69q3rx5unLlSorqq1atmk3juXDhgs6ePav69evbdN3jnDlzRv369dOSJUse+uv8HicnJ6stQlKLFa4AAAAAACBby58//0NDJAcHB+PnFovF6vO9Y/ce+79x44YkafXq1SpYsKBVO0dHx0fWEBwcLD8/P82bN0++vr5KTExU+fLldefOHUnS559/rlu3bj1Q06OEhIRow4YNmjJlikqUKCEnJye1bt3a6PNhY3zYsXvj/PDDDzVz5kzNmDFDAQEBypMnj/r372/0+8orr1g92v7v7yIlwsPDFRISoqlTpyowMFAuLi768MMPtWvXrofWfq/WR/072rlzpzp06KAxY8aoUaNGcnNzU3h4uKZOnSrp7grHAwcOGNfe//i/nZ2d8Xh7pUqVdOTIEU2aNEl169Z97FjeeOMNLV++3Opx9rR+T/nz5zfqKVmypGbMmKHAwEBt3LhRDRo0sBqHq6vrQ/t51Pdlb2+vDRs2aMeOHcb2Ge+//7527dqlokWLPrK+PHnyWH22s7MztqW45/69c52cnB7ZX2rt3btXFy5cUJUqVYxjCQkJ2rJli2bNmmWsWJaky5cvGytp04LAFQAAAAAAZGuVK1fWl19+meZ+ypYtK0dHR0VGRqpOnTrJtsmZM6ckGasQpbsvoTp69KjmzZun5557TpIeeBlTakLL7du3q2vXrmrRooWku4HwqVOnbO4nuX6bNWumjh07SrobxP75558qW7aspLsrQe9fMZnae9SqVUu9evUyjqX1JWKStGPHDvn5+en99983jt3/CHmOHDmMEPNxEhMTFRsb+8g2X331lV5//XWFh4eradOmVufM+J7udy80vBfMp3Qcj2OxWBQUFKSgoCCNHDlSfn5+WrFihQYOHKicOXNazeVH8fT01Llz55SUlGSE+PeHwi4uLvL391dERITxcrl/c3BwSPH97qlfv75+//13q2PdunVT6dKlNWTIEON7k6RDhw6pcuXKNvWfHAJXAAAAAACQrTVq1EjDhg3TlStXkn17eUq5uLgoJCREAwYMUGJiomrXrq1r165p+/btcnV1VZcuXeTn5yeLxaJVq1apSZMmcnJyUt68eZUvXz7NnTtXBQoUUGRkpIYOHZrmcZUsWVLffvutgoODZbFYNGLEiDS9hOv+fr/++mvt2LFDefPm1bRp03T+/HkjcH2UixcvWoVs0t1H2JO7x6JFi7R+/XoVLVpUixcv1u7dux+7qjIltUdGRio8PFzVq1fX6tWrtWLFisdeN2nSJFWrVk3FixdXbGys1qxZo8WLF2v27NlGm2HDhumff/7RokWLJN3dRqBLly6aOXOmatSooXPnzkm6u5LTzc3tofe6fPmyIiMjdfbsWUl3X7AmST4+PlZ7l16/ft0IMM+cOaPBgwfL09PT6sVVabVr1y5FRESoYcOG8vLy0q5du3Tx4kWVKVNG0t3tCNavX6+jR48qX758jxxX3bp1dfHiRX3wwQdq3bq11q1bp7Vr11qtvh09erTeeusteXl5qXHjxrp+/bq2b9+ud955x7hfRESEgoKC5OjomKJfry4uLg/sg5wnTx7ly5fvgeNbt27VuHHjUvz9PAyBKwAAAAAASFdrK1TN6BIeKSAgQFWqVNGyZcuMl0el1rhx4+Tp6alJkybpxIkTcnd3V5UqVYyXOBUsWFBjxozR0KFD1a1bN3Xu3FmhoaEKDw9X3759Vb58eZUqVUofffTRYx9Vf5xp06bp9ddfV61atZQ/f34NGTJE0dHRaepTuvvSqRMnTqhRo0bKnTu3evTooebNm+vatWuPvTYsLExhYWFWx8aNG2eslr2nZ8+e2r9/v9q0aSOLxaJ27dqpV69eWrt2bZpqf+WVVzRgwAD16dNHsbGxatq0qUaMGKHRo0c/8rqbN2+qV69e+vvvv+Xk5KTSpUvryy+/VJs2bYw2UVFRVnvMzp07V/Hx8erdu7d69+5tHO/SpYtCQ0Mfeq+VK1eqW7duxud7L/saNWqUVZ0jR47UyJEjJd1dPVq9enX9+OOPypcvX0q+ihRxdXXVli1bNGPGDEVHR8vPz09Tp05V48aNJUlvvvmmNm3apGrVqunGjRvauHGjsf/wv5UpU0affvqpJk6cqHHjxqlVq1YKCQnR3LlzjTZdunTR7du3NX36dIWEhCh//vxq3bq1cX7q1KkaOHCg5s2bp4IFC+rUqVM6deqUihYtqo0bN6bp18zOnTt17do1q/ulliXp35snZEPR0dFyc3PTtWvXHrmnBcwTFxenNWvWqOzVsrKX/WPbF6t1zLYblGyYysoA8zDPkR0wz5EdMM+RHTDPn7ys+OfQ27dv6+TJkypatOhjX0zzNFq9erUGDRqkQ4cOWb2ECcDTbePGjWrZsqVOnDiRphXqbdq0UcWKFY2/HPk3W/4bxwpXAAAAAACQ7TVt2lR//fWX/vnnHxUuXDijywGQQmvWrNF7772XprD1zp07CggI0IABA0ypicAVAAAAAABAUv/+/TO6BAA2+vDDD9PcR86cOTV8+HATqrmLNfIAAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAABAtnfp0iV5eXnp1KlTGV1Kim3atEkWi0VXr16VJIWGhsrd3T1Da0qtU6dOyWKx6MCBAxldCrKpmjVr6ptvvjGlrxym9AIAAAAAAPAQcWPefWL3chg1NVXXTZgwQc2aNZO/v7+5Bf3Lpk2bVK9ePV25ciXThqOpVbduXVWqVEkzZszI6FJs9u2332rixIk6duyY4uLiVLJkSb377rvq1KnTI6+ZPXu2Dhw4oNjYWJUrV06jR49Wo0aNHnuvOXPmaO/evbp8+bL279+vSpUqWbXx9/fX6dOnJUl2dnby9vZW48aNNWXKFOXNm/ehfYeGhqp///5GSJ9WXbt21dWrV/Xdd9+Z0t/jWCwWrVixQs2bN7fpuoSEBI0ePVpffvmlzp07J19fX3Xt2lXDhw+XxWKRJA0fPlwDBgxQixYtZGeXtjWqrHAFAAAAAADZWkxMjObPn6/u3btndCl4Snl4eOj999/Xzp079dtvv6lbt27q1q2b1q9f/9BrtmzZohdffFFr1qzR3r17Va9ePQUHB2v//v2PvNfNmzdVu3Zt/ec//3lku7FjxyoqKkqRkZFasmSJtmzZor59+6ZqfOktLi4uQ+//n//8R7Nnz9asWbN05MgR/ec//9EHH3ygjz/+2GjTuHFjXb9+XWvXrk3z/QhcAQAAAABAtrZmzRo5OjqqZs2axrF7j+uvX79elStXlpOTk1544QVduHBBa9euVZkyZeTq6qr27dsrJibGuC4xMVGTJk1S0aJF5eTkpIoVK+rrr7+WdPex+Xr16kmS8ubNK4vFoq5du0qS1q1bp9q1a8vd3V358uXTyy+/rOPHj6dpXMePH1ezZs3k7e0tZ2dnVa9eXT/99JNVG39/f40fP16dO3eWs7Oz/Pz8tHLlSl28eFHNmjWTs7OzKlSooD179hjXXLp0Se3atVPBggWVO3duBQQE6KuvvkpTrf+WkJCg7t27G99jqVKlNHPmTKs2Xbt2VfPmzTVx4kR5e3vL3d1dY8eOVXx8vAYNGiQPDw8VKlRICxYssLpuyJAheuaZZ5Q7d24VK1ZMI0aMeGwgWLduXbVo0UJlypRR8eLF1a9fP1WoUEHbtm176DUzZszQ4MGDVb16dZUsWVITJ05UyZIl9cMPPzzyXp06ddLIkSPVoEGDR7ZzcXGRj4+PChYsqHr16qlLly7at2/fI6/5t9GjR6tSpUpavHix/P395ebmprZt2+r69etGm6+//loBAQFycnJSvnz51KBBA928eVOjR4/WwoUL9f3338tischisWjTpk3G9hBLly5VnTp1lCtXLi1ZssS417+/o3+vKv/iiy9Urlw5OTo6qkCBAurTp48kGe1atGghi8Vi02r0HTt2qFmzZmratKn8/f3VunVrNWzYUL/++qvRxt7eXk2aNFF4eLhN32FyCFwBAAAAAEC2tnXrVlWtWjXZc6NHj9asWbO0Y8cOnTlzRq+99ppmzJihsLAwrV69Wj/++KPVKrlJkyZp0aJFmjNnjv744w8NGDBAHTt21ObNm1W4cGFjj8ijR48qKirKCBFv3rypgQMHas+ePYqIiJCdnZ1atGihxMTEVI/rxo0batKkiSIiIrR//3699NJLCg4OVmRkpFW76dOnKygoSPv371fTpk3VqVMnde7cWR07dtS+fftUvHhxde7cWUlJSZKk27dvq2rVqlq9erUOHTqkHj16qFOnTlbhVVolJiaqUKFCWr58uQ4fPqyRI0fqvffe07Jly6za/fzzzzp79qy2bNmiadOmadSoUXr55ZeVN29e7dq1S2+99ZZ69uypv//+27jGxcVFoaGhOnz4sGbOnKl58+Zp+vTpKa4tKSlJEREROnr0qJ5//nmbxnT9+nV5eHik+JqU+ueff/TDDz+oRo0aNl97/Phxfffdd1q1apVWrVqlzZs3a/LkyZKkqKgotWvXTq+//rqOHDmiTZs2qWXLlkpKSlJISIhee+01vfTSS4qKilJUVJRq1apl9Dt06FD169dPR44ceew2CvfMnj1bvXv3Vo8ePfT7779r5cqVKlGihCRp9+7dkqQFCxYoKirK+JwStWrVUkREhP78809J0sGDB7Vt2zY1btzYqt2zzz6rrVu3prjfh2EPVwAAkGXYsj9cavd3AwAAWc/p06fl6+ub7Lnx48crKChIktS9e3cNGzZMx48fV7FixSRJrVu31saNGzVkyBDFxsZq4sSJ+umnnxQYGChJKlasmLZt26bPPvtMderUMcI2Ly8vqz1cW7VqZXXfL774Qp6enjp8+LDKly+fqnFVrFhRFStWND6PGzdOK1as0MqVK41Vg5LUpEkT9ezZU5I0cuRIzZ49W9WrV9err74q6e6K0MDAQJ0/f95YURkSEmJc/84772j9+vVatmyZnn322VTV+m8ODg4aM2aM8blo0aLauXOnli1bptdee8047uHhoY8++kh2dnYqVaqUPvjgA8XExOi9996TJA0bNkyTJ0/Wtm3b1LZtW0l39+q8x9/fXyEhIQoPD9fgwYMfWdO1a9dUsGBBxcbGyt7eXp9++qlefPHFFI9pypQpunHjhlX9aTFkyBANHz5cCQkJun37tmrUqKFp06bZ3E9iYqJCQ0Pl4uIi6e4K24iICE2YMEFRUVGKj49Xy5Yt5efnJ0kKCAgwrnVyclJsbKx8fHwe6Ld///5q2bKlTbWMHz9e7777rvr162ccq169uiTJ09NTkuTu7p7s/R5l6NChio6OVunSpWVvb6+EhARNmDBBHTp0sGrn6+urM2fOKDExMU37uLLCFQAAAAAAZGu3bt1Srly5kj1XoUIF4+fe3t7GY+j3H7tw4YIk6dixY4qJidGLL74oZ2dn48eiRYseuz3AX3/9pXbt2qlYsWJydXU1Hpe+txq1cePGRn/lypVL0bhu3LihkJAQlSlTRu7u7nJ2dtaRI0ceWOH67zFK1qHavWP3xpmQkKBx48YpICBAHh4ecnZ21vr1641+lyxZYjX+1K4Y/OSTT1S1alV5enrK2dlZc+fOfaD2cuXKWQVj3t7eVrXb29srX758Ru2StHTpUgUFBcnHx0fOzs4aPny40W9kZKRV7RMnTjSuc3Fx0YEDB7R7925NmDBBAwcO1KZNm1I0lrCwMI0ZM0bLli2Tl5eXpLR/T4MGDdKBAwf022+/KSIiQpLUtGlTJSQkSJJV32+99dZD+/H39zfCVkkqUKCA8X1VrFhR9evXV0BAgF599VXNmzdPV65cSVF91apVs2k8Fy5c0NmzZ1W/fn2brkuJZcuWacmSJQoLC9O+ffu0cOFCTZkyRQsXLrRq5+TkpMTERMXGxqbpfqxwBQAAAAAA2Vr+/PkfGiI5ODgYP7dYLFaf7x2799j/jRs3JEmrV69WwYIFrdo5Ojo+sobg4GD5+flp3rx58vX1VWJiosqXL687d+5Ikj7//HPdunXrgZoeJSQkRBs2bNCUKVNUokQJOTk5qXXr1kafDxvjw47dG+eHH36omTNnasaMGQoICFCePHnUv39/o99XXnnF6tH2f38XKREeHq6QkBBNnTpVgYGBcnFx0Ycffqhdu3Y9tPZ7tT7q39HOnTvVoUMHjRkzRo0aNZKbm5vCw8M1derdp598fX114MAB49r7H/+3s7MzHm+vVKmSjhw5okmTJqlu3bqPHcsbb7yh5cuXW+3LmtbvKX/+/EY9JUuW1IwZMxQYGKiNGzeqQYMGVuNwdXV9aD+P+r7s7e21YcMG7dixw9g+4/3339euXbtUtGjRR9aXJ08eq892dnbGthT33L93rpOT0yP7S4tBgwZp6NChxirngIAAnT59WpMmTVKXLl2MdpcvX1aePHnSXAuBKwAAAJCJsHUGAJivcuXK+vLLL9PcT9myZeXo6KjIyEjVqVMn2TY5c+aUJGMVonT3JVRHjx7VvHnz9Nxzz0nSAy9jSk1ouX37dnXt2lUtWrSQdDcQPnXqlM39JNdvs2bN1LFjR0l3g9g///xTZcuWlXR3Jej9KyZTe49atWqpV69exrG0vkRMuvvyJD8/P73//vvGsdOnTxs/z5EjhxFiPk5KVkJ+9dVXev311xUeHq6mTZtanTPje7qfvb29JBnBfErH8TgWi0VBQUEKCgrSyJEj5efnpxUrVmjgwIHKmTOn1Vx+FE9PT507d05JSUlGiH9/KOzi4iJ/f39FREQYL5f7NwcHhxTf734xMTEPbBFgb2//wB7Jhw4dUuXKlW3u/98IXAEAAAAAQLbWqFEjDRs2TFeuXFHevHlT3Y+Li4tCQkI0YMAAJSYmqnbt2rp27Zq2b98uV1dXdenSRX5+frJYLFq1apWaNGkiJycn5c2bV/ny5dPcuXNVoEABRUZGaujQoWkeV8mSJfXtt98qODhYFotFI0aMSNNLuO7v9+uvv9aOHTuUN29eTZs2TefPnzcC10e5ePGiVcgm3X2EPbl7LFq0SOvXr1fRokW1ePFi7d69+7GrKlNSe2RkpMLDw1W9enWtXr1aK1aseOx1kyZNUrVq1VS8eHHFxsZqzZo1Wrx4sWbPnm20GTZsmP755x8tWrRI0t1tBLp06aKZM2eqRo0aOnfunKS7Kznd3Nweeq/Lly8rMjJSZ8+elXT3BWuS5OPjY7V36fXr140A88yZMxo8eLA8PT2tXlyVVrt27VJERIQaNmwoLy8v7dq1SxcvXlSZMmUk3d2OYP369Tp69Kjy5cv3yHHVrVtXFy9e1AcffKDWrVtr3bp1Wrt2rdXq29GjR+utt96Sl5eXGjdurOvXr2v79u165513jPtFREQoKChIjo6OKf71GhwcrAkTJqhIkSIqV66c9u/fr2nTpun111+3ard161Y1bNjQ1q/pAQSuAAAAAAAgXT3tK+4DAgJUpUoVLVu2zHh5VGqNGzdOnp6emjRpkk6cOCF3d3dVqVLFeIlTwYIFNWbMGA0dOlTdunVT586dFRoaqvDwcPXt21fly5dXqVKl9NFHHz32UfXHuRco1apVS/nz59eQIUMUHR2dpj6luy+dOnHihBo1aqTcuXOrR48eat68ua5du/bYa8PCwhQWFmZ1bNy4ccZq2Xt69uyp/fv3q02bNrJYLGrXrp169eqltWvXpqn2V155RQMGDFCfPn0UGxurpk2basSIERo9evQjr7t586Z69eqlv//+W05OTipdurS+/PJLtWnTxmgTFRVltcfs3LlzFR8fr969e6t3797G8S5duig0NPSh91q5cqW6detmfL73GPyoUaOs6hw5cqRGjhwp6e7q0erVq+vHH39Uvnz5UvJVpIirq6u2bNmiGTNmKDo6Wn5+fpo6daoaN24sSXrzzTe1adMmVatWTTdu3NDGjRuN/Yf/rUyZMvr00081ceJEjRs3Tq1atVJISIjmzp1rtOnSpYtu376t6dOnKyQkRPnz51fr1q2N81OnTtXAgQM1b948FSxYUKdOndKpU6dUtGhRbdy48aG/Zj7++GONGDFCvXr10oULF+Tr66uePXsa358k/fPPP9qxY4cpq90tSf/ePCEbio6Olpubm65du/bIPS1gnri4OK1Zs0Zlr5aVvewf275YrWO23aBk2v82Akgr5jmyg6dtnvOoNdID8xzZwdM2z7ODrPjn0Nu3b+vkyZMqWrToQ19A9TRbvXq1Bg0apEOHDqXp7eQAnqyNGzeqZcuWOnHiRJpWqA8ZMkRXrlyxCoDvZ8t/41jhCgAAAAAAsr2mTZvqr7/+0j///KPChQtndDkAUmjNmjV677330hS2SpKXl5cGDhxoSk0ErgAAAAAAAJL69++f0SUAsNGHH35oSj/vvpvyp4gehzXyAAAAAAAAAGASVrgCAAAAAJ4q7FUMAMjMWOEKAAAAAABMw7u5AWRFtvy3jcAVAAAAAACkmYODgyQpJiYmgysBAPPduXNHkmRvb//YtmwpAACZVOPf9trUfm2FqulUCZB+bJ3nK9OpDgAA8Hj29vZyd3fXhQsXJEm5c+eWxWLJ4KoAIO0SExN18eJF5c6dWzlyPD5OJXAFgGyCvdAAAACQ3nx8fCTJCF0BIKuws7NTkSJFUvQXSQSuAAAAAADAFBaLRQUKFJCXl5fi4uIyuhwAME3OnDllZ5ey3VkJXAGx8g8AAAAAzGRvb5+ifQ4BICvipVkAAAAAAAAAYBJWuAIAAAAZiJfDAQAAZC2scAUAAAAAAAAAkxC4AgAAAAAAAIBJ2FIAAAAAAJCu2DoDAJCdsMIVAAAAAAAAAExC4AoAAAAAAAAAJmFLAWRJPLIEAAAAAACAjMAKVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAAAAAMAkBK4AAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJgkQwPX2bNnq0KFCnJ1dZWrq6sCAwO1du1a4/zt27fVu3dv5cuXT87OzmrVqpXOnz9v1UdkZKSaNm2q3Llzy8vLS4MGDVJ8fPyTHgoAAAAAAAAAZGzgWqhQIU2ePFl79+7Vnj179MILL6hZs2b6448/JEkDBgzQDz/8oOXLl2vz5s06e/asWrZsaVyfkJCgpk2b6s6dO9qxY4cWLlyo0NBQjRw5MqOGBAAAAAAAACAby5GRNw8ODrb6PGHCBM2ePVu//PKLChUqpPnz5yssLEwvvPCCJGnBggUqU6aMfvnlF9WsWVM//vijDh8+rJ9++kne3t6qVKmSxo0bpyFDhmj06NHKmTNnRgwLAAAAAAAAQDaVoYHr/RISErR8+XLdvHlTgYGB2rt3r+Li4tSgQQOjTenSpVWkSBHt3LlTNWvW1M6dOxUQECBvb2+jTaNGjfT222/rjz/+UOXKlZO9V2xsrGJjY43P0dHRkqS4uDjFxcWl0whxv3vfc4ISUtY+Icmm/h0SE22rx87ehsbMEaQM8xzZAfMc2QHzHNkB8/zJ48+eAJB1WZKSkmz7ndJkv//+uwIDA3X79m05OzsrLCxMTZo0UVhYmLp162YVjErSs88+q3r16uk///mPevToodOnT2v9+vXG+ZiYGOXJk0dr1qxR48aNk73n6NGjNWbMmAeOh4WFKXfu3OYOEAAAAACAf4mJiVH79u117do1ubq6ZnQ5AAATZfgK11KlSunAgQO6du2avv76a3Xp0kWbN29O13sOGzZMAwcOND5HR0ercOHCatiwIb/RPSFxcXHasGGDSl0tJXs9/m+v/WucsKn/Vrc9bGof/sPyFLd1GDrBpr6RfTHPkR0wz5EdMM+RHTDPn7x7T1oCALKeDA9cc+bMqRIlSkiSqlatqt27d2vmzJlq06aN7ty5o6tXr8rd3d1of/78efn4+EiSfHx89Ouvv1r1d/78eePcwzg6OsrR0fGB4w4ODnJwcEjrkGAD+///z+M42Fts6jfOzrb3wTkkpuzRKUnMEdiMeY7sgHmO7IB5juyAef7kZNa6AQCPZ9vvek9AYmKiYmNjVbVqVTk4OCgiIsI4d/ToUUVGRiowMFCSFBgYqN9//10XLlww2mzYsEGurq4qW7bsE68dAAAAAAAAQPaWoStchw0bpsaNG6tIkSK6fv26wsLCtGnTJq1fv15ubm7q3r27Bg4cKA8PD7m6uuqdd95RYGCgatasKUlq2LChypYtq06dOumDDz7QuXPnNHz4cPXu3TvZFawAAAAAAAAAkJ4yNHC9cOGCOnfurKioKLm5ualChQpav369XnzxRUnS9OnTZWdnp1atWik2NlaNGjXSp59+alxvb2+vVatW6e2331ZgYKDy5MmjLl26aOzYsRk1JAAAAAAAAADZWIYGrvPnz3/k+Vy5cumTTz7RJ5988tA2fn5+WrNmjdmlAQAAAAAAAIDNnro9XAEAAAAAAAAgsyJwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGCSHLZekJiYqM2bN2vr1q06ffq0YmJi5OnpqcqVK6tBgwYqXLhwetQJAAAAAAAAAE+9FK9wvXXrlsaPH6/ChQurSZMmWrt2ra5evSp7e3sdO3ZMo0aNUtGiRdWkSRP98ssv6VkzAAAAAAAAADyVUrzC9ZlnnlFgYKDmzZunF198UQ4ODg+0OX36tMLCwtS2bVu9//77evPNN00tFgAAAAAAAACeZikOXH/88UeVKVPmkW38/Pw0bNgwhYSEKDIyMs3FAQAAAAAAAEBmkuItBR4Xtt7PwcFBxYsXT1VBAAAAAAAAAJBZpThwvd+6deu0bds24/Mnn3yiSpUqqX379rpy5YppxQEAAAAAAABAZpKqwHXQoEGKjo6WJP3+++9699131aRJE508eVIDBw40tUAAAAAAAAAAyCxSvIfr/U6ePKmyZctKkr755hu9/PLLmjhxovbt26cmTZqYWiAAAAAAAAAAZBapWuGaM2dOxcTESJJ++uknNWzYUJLk4eFhrHwFAAAAAAAAgOwmVStca9eurYEDByooKEi//vqrli5dKkn6888/VahQIVMLBAAAAAAAAIDMIlUrXGfNmqUcOXLo66+/1uzZs1WwYEFJ0tq1a/XSSy+ZWiAAAAAAAAAAZBapWuFapEgRrVq16oHj06dPT3NBAAAAAAAAAJBZpSpwvefChQu6cOGCEhMTrY5XqFAhTUUBAAAAAAAAQGaUqsB179696tKli44cOaKkpCRJksViUVJSkiwWixISEkwtEgAAAAAAAAAyg1QFrq+//rqeeeYZzZ8/X97e3rJYLGbXBQAAAAAAAACZTqoC1xMnTuibb75RiRIlzK4HAAAAAAAAADItu9RcVL9+fR08eNDsWgAAAAAAAAAgU0vVCtfPP/9cXbp00aFDh1S+fHk5ODhYnX/llVdMKQ4AAAAAAAAAMpNUBa47d+7U9u3btXbt2gfO8dIsAAAAAAAAANlVqgLXd955Rx07dtSIESPk7e1tdk0AAAAAgLT6rw0vN05wkvRVupUCAEB2kqrA9dKlSxowYABhKwAAADIngigAAACkk1S9NKtly5bauHGj2bUAAAAAAAAAQKaWqhWuzzzzjIYNG6Zt27YpICDggZdm9e3b15TiAOCpw4ooZAfMcwAAAABItVQFrp9//rmcnZ21efNmbd682eqcxWIhcM2u+AM6AAAAAAAAsrlUBa4nT540uw4AAAAAAAAAyPRStYcrAAAAAAAAAOBBKQ5cJ0+erFu3bqWo7a5du7R69epUFwUAAAAAAAAAmVGKtxQ4fPiwihQpoldffVXBwcGqVq2aPD09JUnx8fE6fPiwtm3bpi+//FJnz57VokWL0q1oPBnXxoyxqb1bm3QqBAAAAAAAAMgkUhy4Llq0SAcPHtSsWbPUvn17RUdHy97eXo6OjoqJiZEkVa5cWW+88Ya6du2qXLlypVvRAAAAAAAAAPA0sumlWRUrVtS8efP02Wef6bffftPp06d169Yt5c+fX5UqVVL+/PnTq04AAADgoXgyBwAAAE8LmwLXe+zs7FSpUiVVqlTJ5HIAAIDZCKIAAAAA4MlJVeAKAFkFQRQAAMgs+P8WAAAyB7uMLgAAAAAAAAAAsgoCVwAAAAAAAAAwCYErAAAAAAAAAJjEpsDV3t5eFy5cSK9aAAAAAAAAACBTsylwTUpKSq86AAAAAAAAACDTY0sBAAAAAAAAADBJDlsv+Pzzz+Xs7PzINn379k11QQAAAAAAAACQWdkcuM6ZM0f29vYPPW+xWAhcAQAAAAAAAGRLNgeue/bskZeXV3rUAgAAAAAAAACZmk17uFoslvSqAwAAAAAAAAAyPZsC16SkpPSqAwAAAAAAAAAyPZsC11GjRj32hVkAAAAAAAAAkF3ZFLjOnz9ft27dMj7PmjVL0dHRphcFAAAAAAAAAJmRTYHr33//rYSEBOPze++9p//973+mFwUAAAAAAAAAmZFNgeu/sacrAAAAAAAAAPyfNAWuAAAAAAAAAID/k8PWCz7//HPjxVnx8fEKDQ1V/vz5rdr07dvXnOoAAAAAAAAAIBOxKXAtUqSI5s2bZ3z28fHR4sWLrdpYLBYCVwAAAAAAAADZkk2B66lTp9KpDAAAAAAAAADI/Gzaw/WFF17Q1atX06kUAAAAAAAAAMjcbApcN23apDt37qRXLQAAAAAAAACQqdkUuAIAAAAAAAAAHs6mPVwl6fDhwzp37twj21SoUCHVBQEAAAAAAABAZmVz4Fq/fn0lJSU9cNxisSgpKUkWi0UJCQmmFAcAAAAAAAAAmYnNgeuuXbvk6emZHrUAAAAAAAAAQKZmc+BapEgReXl5pUctAAAAAAAAAJCp8dIsAAAAAAAAADCJTYFrnTp1dOfOnfSqBQAAAAAAAAAyNZsC1y1btihnzpzpVQsAAAAAAAAAZGo2Ba5JSUnpVQcAAAAAAAAAZHo27+FqsVjSow4AAAAAAAAAyPRy2HrBM88889jQ9fLly6kuCAAAAAAAAAAyK5sD1zFjxsjNzS09agEAAAAAAACATM3mwLVt27by8vJKj1oAAAAAAAAAIFOzaQ9X9m8FAAAAAAAAgIezKXBNSkpKrzoAAAAAAAAAINOzaUuBxMTE9KoDAAAAAAAAADI9m1a4AgAAAAAAAAAejsAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJBkauE6aNEnVq1eXi4uLvLy81Lx5cx09etSqze3bt9W7d2/ly5dPzs7OatWqlc6fP2/VJjIyUk2bNlXu3Lnl5eWlQYMGKT4+/kkOBQAAAAAAAAAyNnDdvHmzevfurV9++UUbNmxQXFycGjZsqJs3bxptBgwYoB9++EHLly/X5s2bdfbsWbVs2dI4n5CQoKZNm+rOnTvasWOHFi5cqNDQUI0cOTIjhgQAAAAAAAAgG8uRkTdft26d1efQ0FB5eXlp7969ev7553Xt2jXNnz9fYWFheuGFFyRJCxYsUJkyZfTLL7+oZs2a+vHHH3X48GH99NNP8vb2VqVKlTRu3DgNGTJEo0ePVs6cOR+4b2xsrGJjY43P0dHRkqS4uDjFxcWl44gzl3g72/L4uASnlLdNvNs2QQkp7DvJplocEhNtah9nZ29DY+ZIVsI8f1hj5nlWwjx/WGPmeVbCPH9YY+Z5VsI8f1jjzDnP+bMnAGRdlqSkJNt+p0xHx44dU8mSJfX777+rfPny+vnnn1W/fn1duXJF7u7uRjs/Pz/1799fAwYM0MiRI7Vy5UodOHDAOH/y5EkVK1ZM+/btU+XKlR+4z+jRozVmzJgHjoeFhSl37tzpMTQAAAAAAAwxMTFq3769rl27JldX14wuBwBgogxd4Xq/xMRE9e/fX0FBQSpfvrwk6dy5c8qZM6dV2CpJ3t7eOnfunNHG29v7gfP3ziVn2LBhGjhwoPE5OjpahQsXVsOGDfmN7j7Rkyfb1N615aQUt41LdNKGE1+o1NVSstfj//bav8YJm2ppddvDpvbhPyxPcVuHoRNs6htPN+Z58pjnWQvzPHnM86yFeZ485nnWwjxPXmad5/eetAQAZD1PTeDau3dvHTp0SNu2bUv3ezk6OsrR0fGB4w4ODnJwcEj3+2cWOWx87MfB/pbN97D///88vm+LTf3G2fi4lUNiyh6dksQcyWKY5w9pyzzPUpjnD2nLPM9SmOcPacs8z1KY5w9pm0nneWatGwDweBn60qx7+vTpo1WrVmnjxo0qVKiQcdzHx0d37tzR1atXrdqfP39ePj4+Rpvz588/cP7eOQAAAAAAAAB4UjI0cE1KSlKfPn20YsUK/fzzzypatKjV+apVq8rBwUERERHGsaNHjyoyMlKBgYGSpMDAQP3++++6cOGC0WbDhg1ydXVV2bJln8xAAAAAAAAAAEAZvKVA7969FRYWpu+//14uLi7Gnqtubm5ycnKSm5ubunfvroEDB8rDw0Ourq565513FBgYqJo1a0qSGjZsqLJly6pTp0764IMPdO7cOQ0fPly9e/dOdtsAAAAAAAAAAEgvGRq4zp49W5JUt25dq+MLFixQ165dJUnTp0+XnZ2dWrVqpdjYWDVq1Eiffvqp0dbe3l6rVq3S22+/rcDAQOXJk0ddunTR2LFjn9QwAAAAAAAAAEBSBgeuSUlJj22TK1cuffLJJ/rkk08e2sbPz09r1qwxszQAAAAAAAAAsNlT8dIsAAAAAAAAAMgKCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQ5MroAAAAAAMiuJu//X4rbvp2OdQAAAPMQuALIcviDC7ID5jkAAAAAPJ3YUgAAAAAAAAAATMIKVwAAADyVWMkNAACAzIjANZvhDy4AAAAAAABA+mFLAQAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAm4aVZAJ56M6/MtPGKDulSBwAAAAAAwOMQuGZyBFEAAAAAAADA04MtBQAAAAAAAADAJKxwBQDgKcATCwCQNfDfcwAAQOAKAACAJ4IgCgAAANkBWwoAAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJiFwBQAAAAAAAACTELgCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAAAAAAAACYhMAVAAAAAAAAAExC4AoAAAAAAAAAJsnQwHXLli0KDg6Wr6+vLBaLvvvuO6vzSUlJGjlypAoUKCAnJyc1aNBAf/31l1Wby5cvq0OHDnJ1dZW7u7u6d++uGzduPMFRAAAAAAAAAMBdGRq43rx5UxUrVtQnn3yS7PkPPvhAH330kebMmaNdu3YpT548atSokW7fvm206dChg/744w9t2LBBq1at0pYtW9SjR48nNQQAAAAAAAAAMOTIyJs3btxYjRs3TvZcUlKSZsyYoeHDh6tZs2aSpEWLFsnb21vfffed2rZtqyNHjmjdunXavXu3qlWrJkn6+OOP1aRJE02ZMkW+vr5PbCwAAAAAAAAAkKGB66OcPHlS586dU4MGDYxjbm5uqlGjhnbu3Km2bdtq586dcnd3N8JWSWrQoIHs7Oy0a9cutWjRItm+Y2NjFRsba3yOjo6WJMXFxSkuLi6dRpQ+LPEWm9rbJcanuG28nW0LoOMSnFLeNvFu2wQlpLDvJJtqcUhMtKl9nJ29DY0z1xzJCpjnyWOeZy3M8+Qxz7MW5nnymOdZC/M8eczzB2W2P3sCAFLOkpSUZNvvlOnEYrFoxYoVat68uSRpx44dCgoK0tmzZ1WgQAGj3WuvvSaLxaKlS5dq4sSJWrhwoY4ePWrVl5eXl8aMGaO333472XuNHj1aY8aMeeB4WFiYcufObd6gAAAAAABIRkxMjNq3b69r167J1dU1o8sBAJjoqV3hmp6GDRumgQMHGp+jo6NVuHBhNWzYMNP9Rjf76myb2t+OfC3FbbuvmWdT364tJ6W4bVyikzac+EKlrpaSvR7/t9f+NU7YVEur2x42tQ//YXmK2zoMnWBT30g75nnymOdZC/M8eczzrIV5njzmedbCPE8e8/xB9560BABkPU9t4Orj4yNJOn/+vNUK1/Pnz6tSpUpGmwsXLlhdFx8fr8uXLxvXJ8fR0VGOjo4PHHdwcJCDg4MJ1T85STlsW6CcaJfyf+U5bHzsx8H+lk3tJcn+///z+L5tezQrzsbHrRwSU/bolKRMN0eyAuZ58pjnWQvzPHnM86yFeZ485nnWwjxPHvP8QZm1bgDA49n2u94TVLRoUfn4+CgiIsI4Fh0drV27dikwMFCSFBgYqKtXr2rv3r1Gm59//lmJiYmqUaPGE68ZAAAAAAAAQPaWoStcb9y4oWPHjhmfT548qQMHDsjDw0NFihRR//79NX78eJUsWVJFixbViBEj5Ovra+zzWqZMGb300kt68803NWfOHMXFxalPnz5q27atfH19M2hUAAAAAAAAALKrDA1c9+zZo3r16hmf7+2r2qVLF4WGhmrw4MG6efOmevTooatXr6p27dpat26dcuXKZVyzZMkS9enTR/Xr15ednZ1atWqljz766ImPBQAAAAAAAAAyNHCtW7eukpIevseRxWLR2LFjNXbs2Ie28fDwUFhYWHqUBwAAAAAAAAA2eWr3cAUAAAAAAACAzIbAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADAJgSsAAAAAAAAAmITAFQAAAAAAAABMQuAKAAAAAAAAACYhcAUAAAAAAAAAkxC4AgAAAAAAAIBJCFwBAAAAAAAAwCQErgAAAAAAAABgEgJXAAAAAAAAADBJlglcP/nkE/n7+ytXrlyqUaOGfv3114wuCQAAAAAAAEA2kyUC16VLl2rgwIEaNWqU9u3bp4oVK6pRo0a6cOFCRpcGAAAAAAAAIBvJkdEFmGHatGl688031a1bN0nSnDlztHr1an3xxRcaOnToA+1jY2MVGxtrfL527Zok6fLly4qLi3syRZsk9lrs4xvd5070lRS3vXLnjk19x1/LleK2cYm5FBMTo2u3rsle9o9tf+naDZtq0W0Hm5pfuhOf4rYOly7ZVgvSjHn+EMzzLIV5/hDM8yyFef4QzPMshXn+EMzzB1y/fl2SlJSUlMGVAADMZknK5P91v3PnjnLnzq2vv/5azZs3N4536dJFV69e1ffff//ANaNHj9aYMWOeYJUAAAAAADzozJkzKlSoUEaXAQAwUaZf4fq///1PCQkJ8vb2tjru7e2t//73v8leM2zYMA0cOND4nJiYqMuXLytfvnyyWCzpWi/uio6OVuHChXXmzBm5urpmdDlAumCeIztgniM7YJ4jO2CeP3lJSUm6fv26fH19M7oUAIDJMn3gmhqOjo5ydHS0Oubu7p4xxWRzrq6u/A8dsjzmObID5jmyA+Y5sgPm+ZPl5uaW0SUAANJBpn9pVv78+WVvb6/z589bHT9//rx8fHwyqCoAAAAAAAAA2VGmD1xz5sypqlWrKiIiwjiWmJioiIgIBQYGZmBlAAAAAAAAALKbLLGlwMCBA9WlSxdVq1ZNzz77rGbMmKGbN2+qW7duGV0aHsLR0VGjRo16YGsHICthniM7YJ4jO2CeIztgngMAYB5LUlJSUkYXYYZZs2bpww8/1Llz51SpUiV99NFHqlGjRkaXBQAAAAAAACAbyTKBKwAAAAAAAABktEy/hysAAAAAAAAAPC0IXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAASDXevQkAAAAA1ghcAQBAqjk6OurIkSMZXQYAIBW2bt2qjh07KjAwUP/8848kafHixdq2bVsGVwYAQOaWI6MLAM6cOaNRo0bpiy++yOhSgDS5deuW9u7dKw8PD5UtW9bq3O3bt7Vs2TJ17tw5g6oD0mbgwIHJHk9ISNDkyZOVL18+SdK0adOeZFlAupg1a5Z+/fVXNWnSRG3bttXixYs1adIkJSYmqmXLlho7dqxy5OB/o5G5ffPNN+rUqZM6dOig/fv3KzY2VpJ07do1TZw4UWvWrMngCgEAyLwsSTwLiAx28OBBValSRQkJCRldCpBqf/75pxo2bKjIyEhZLBbVrl1b4eHhKlCggCTp/Pnz8vX1ZZ4j07Kzs1PFihXl7u5udXzz5s2qVq2a8uTJI4vFop9//jljCgRMMn78eH3wwQdq2LChtm/frv79++vDDz/UgAEDZGdnp+nTp+vtt9/WmDFjMrpUIE0qV66sAQMGqHPnznJxcdHBgwdVrFgx7d+/X40bN9a5c+cyukQAADIt/moe6W7lypWPPH/ixIknVAmQfoYMGaLy5ctrz549unr1qvr376+goCBt2rRJRYoUyejygDSbOHGi5s6dq6lTp+qFF14wjjs4OCg0NPSBVd1AZhUaGqrQ0FC1bNlSBw8eVNWqVbVw4UJ16NBBklS6dGkNHjyYwBWZ3tGjR/X8888/cNzNzU1Xr1598gUBAJCFELgi3TVv3lwWi+WRL1axWCxPsCLAfDt27NBPP/2k/PnzK3/+/Prhhx/Uq1cvPffcc9q4caPy5MmT0SUCaTJ06FDVr19fHTt2VHBwsCZNmiQHB4eMLgsw3dmzZ1WtWjVJUsWKFWVnZ6dKlSoZ56tUqaKzZ89mUHWAeXx8fHTs2DH5+/tbHd+2bZuKFSuWMUUBAJBF8NIspLsCBQro22+/VWJiYrI/9u3bl9ElAml269Ytq/38LBaLZs+ereDgYNWpU0d//vlnBlYHmKN69erau3evLl68qGrVqunQoUP8hRmyHB8fHx0+fFiS9NdffykhIcH4LEl//PGHvLy8Mqo8wDRvvvmm+vXrp127dslisejs2bNasmSJQkJC9Pbbb2d0eQAAZGqscEW6q1q1qvbu3atmzZole/5xq1+BzKB06dLas2ePypQpY3V81qxZkqRXXnklI8oCTOfs7KyFCxcqPDxcDRo0YF9iZDkdOnRQ586d1axZM0VERGjw4MEKCQnRpUuXZLFYNGHCBLVu3TqjywTSbOjQoUpMTFT9+vUVExOj559/Xo6OjgoJCdE777yT0eUBAJCp8dIspLutW7fq5s2beumll5I9f/PmTe3Zs0d16tR5wpUB5pk0aZK2bt360Df69urVS3PmzFFiYuITrgxIP3///bf27t2rBg0asG0GsozExERNnjxZO3fuVK1atTR06FAtXbpUgwcPVkxMjIKDgzVr1izmPLKMO3fu6NixY7px44bKli0rZ2fnjC4JAIBMj8AVAAAAAAAAAEzClgIAAAAAkM3cvHlTkydPVkREhC5cuPDAUzgnTpzIoMoAAMj8CFwBAAAAIJt54403tHnzZnXq1EkFChTgJYgAAJiILQUAAAAAIJtxd3fX6tWrFRQUlNGlAACQ5dhldAEAAAAAgCcrb9688vDwyOgyAADIkghcAQAAACCbGTdunEaOHKmYmJiMLgUAgCyHLQUAAAAAIJupXLmyjh8/rqSkJPn7+8vBwcHq/L59+zKoMgAAMj9emgUAAAAA2Uzz5s0zugQAALIsVrgCAAAAAAAAgEnYwxUAAAAAAAAATMKWAgAAAACQDXh4eOjPP/9U/vz5lTdvXlksloe2vXz58hOsDACArIXAFQAAAACygenTp8vFxUWSNGPGjIwtBgCALIw9XAEAAAAAAADAJKxwBQAAAIBsIDo6OsVtXV1d07ESAACyNla4AgAAAEA2YGdn98h9WyUpKSlJFotFCQkJT6gqAACyHla4AgAAAEA2sHHjxowuAQCAbIEVrgAAAACQDbRs2VKhoaFydXXVokWL1KZNGzk6OmZ0WQAAZDkErgAAAACQDeTMmVOnT59WgQIFZG9vr6ioKHl5eWV0WQAAZDlsKQAAAAAA2UDp0qU1bNgw1atXT0lJSVq2bNlDX47VuXPnJ1wdAABZBytcAQAAACAb2L59u959910dP35cly9flouLS7Iv0bJYLLp8+XIGVAgAQNZA4AoAAAAA2YydnZ3OnTvHlgIAAKQDu4wuAAAAAACQ/lq2bKno6GhJ0oIFC+Ti4pLBFQEAkDWxwhUAAAAAsgFemgUAwJPBS7MAAAAAIBvgpVkAADwZrHAFAAAAgGxgx44dGjhwIC/NAgAgnRG4AgAAAEA2Y2dnp6ioKHl7e2d0KQAAZDkErgAAAACQzZw+fVqurq764osvdOTIEUlSuXLl1L1794duMwAAAFKGwBUAAAAAspk9e/aoUaNGcnJy0rPPPitJ2r17t27duqX169eratWqGVwhAACZF4ErAAAAAGQzzz33nEqUKKF58+YpR46771KOj4/XG2+8oRMnTmjLli0ZXCEAAJkXgSsAAAAAZDNOTk7av3+/SpcubXX88OHDqlatmmJiYjKoMgAAMj+7jC4AAAAAAPBkubq6KjIy8oHjZ86ckYuLSwZUBABA1kHgCgAAAADZTJs2bdS9e3ctXbpUZ86c0ZkzZxQeHq433nhD7dq1y+jyAADI1HJkdAEAAAAAgCdrypQpslgs6ty5s+Lj4yVJDg4OevvttzV58uQMrg4AgMyNPVwBAAAAIJuKiYnR8ePHJUnFixdX7ty5M7giAAAyPwJXAAAAAAAAADAJe7gCAAAAAAAAgEkIXAEAAAAAAADAJASuAAAAAAAAAGASAlcAAAAAAAAAMAmBKwAA2ZDFYtF3331nWn+bNm2SxWLR1atXTesTAAAAADIjAlcAADKJnTt3yt7eXk2bNk1zX1FRUWrcuLEJVaWcv7+/LBaLLBaLcufOrYCAAH3++ec292N2WAwAAAAAZiJwBQAgk5g/f77eeecdbdmyRWfPnn1k26SkJMXHxz9w/M6dO5IkHx8fOTo6pkudjzJ27FhFRUXp0KFD6tixo958802tXbv2idcBAAAAAOmFwBUAgEzgxo0bWrp0qd5++201bdpUoaGhVufvPdK/du1aVa1aVY6Ojtq2bZvq1q2rPn36qH///sqfP78aNWokyXqVaK1atTRkyBCr/i5evCgHBwdt2bJFkrR48WJVq1ZNLi4u8vHxUfv27XXhwgWbx3Hv+mLFimnIkCHy8PDQhg0bjPO7d+/Wiy++qPz588vNzU116tTRvn37jPP+/v6SpBYtWshisRifJen7779XlSpVlCtXLhUrVkxjxoxJNnQGAAAAgPRE4AoAQCawbNkylS5dWqVKlVLHjh31xRdfKCkp6YF2Q4cO1eTJk3XkyBFVqFBBkrRw4ULlzJlT27dv15w5cx64pkOHDgoPD7fqb+nSpfL19dVzzz0nSYqLi9O4ceN08OBBfffddzp16pS6du2a6vEkJibqm2++0ZUrV5QzZ07j+PXr19WlSxdt27ZNv/zyi0qWLKkmTZro+vXrku4GspK0YMECRUVFGZ+3bt2qzp07q1+/fjp8+LA+++wzhYaGasKECamuEQAAAABSw5KU3J/WAADAUyUoKEivvfaa+vXrp/j4eBUoUEDLly9X3bp1Jd1d4VqvXj199913atasmXFd3bp1FR0dbbVKVLq7wnXFihVq3ry5Ll68KF9fX/38889GwFqrVi09//zzmjx5crL17NmzR9WrV9f169fl7Oxs3P/KlStyd3dP9hp/f39FRUXJwcFBsbGxio+Pl4eHh3bt2qUSJUoke01iYqLc3d0VFhaml19++YHa72nQoIHq16+vYcOGGce+/PJLDR48+LHbLwAAAACAmVjhCgDAU+7o0aP69ddf1a5dO0lSjhw51KZNG82fP/+BttWqVXvgWNWqVR/Zv6enpxo2bKglS5ZIkk6ePKmdO3eqQ4cORpu9e/cqODhYRYoUkYuLi+rUqSNJioyMtGksgwYN0oEDB/Tzzz+rRo0amj59ulXYev78eb355psqWbKk3Nzc5Orqqhs3bjz2PgcPHtTYsWPl7Oxs/HjzzTcVFRWlmJgYm2oEAAAAgLTIkdEFAACAR5s/f77i4+Pl6+trHEtKSpKjo6NmzZolNzc343iePHkeuD65Y//WoUMH9e3bVx9//LHCwsIUEBCggIAASdLNmzfVqFEjNWrUSEuWLJGnp6ciIyPVqFEj4yVcKZU/f36VKFFCJUqU0PLlyxUQEKBq1aqpbNmykqQuXbro0qVLmjlzpvz8/OTo6KjAwMDH3ufGjRsaM2aMWrZs+cC5XLly2VQjAAAAAKQFK1wBAHiKxcfHa9GiRZo6daoOHDhg/Dh48KB8fX311VdfmXKfZs2a6fbt21q3bp3CwsKsVrf+97//1aVLlzR58mQ999xzKl26dKpemPVvhQsXVps2bay2Adi+fbv69u2rJk2aqFy5cnJ0dNT//vc/q+scHByUkJBgdaxKlSo6evSoEebe/8POjv/dAQAAAPDksMIVAICn2KpVq3TlyhV1797daiWrJLVq1Urz58/XW2+9leb75MmTR82bN9eIESN05MgRY/sCSSpSpIhy5sypjz/+WG+99ZYOHTqkcePGpfmektSvXz+VL19ee/bsUbVq1VSyZEktXrxY1apVU3R0tAYNGiQnJyera/z9/RUREaGgoCA5Ojoqb968GjlypF5++WUVKVJErVu3lp2dnQ4ePKhDhw5p/PjxptQKAAAAACnBkg8AAJ5i8+fPV4MGDR4IW6W7geuePXv022+/mXKvDh066ODBg3ruuedUpEgR47inp6dCQ0O1fPlylS1bVpMnT9aUKVNMuWfZsmXVsGFDjRw5UtLd8V65ckVVqlRRp06d1LdvX3l5eVldM3XqVG3YsEGFCxdW5cqVJUmNGjXSqlWr9OOPP6p69eqqWbOmpk+fLj8/P1PqBAAAAICUsiQlJSVldBEAAAAAAAAAkBWwwhUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJMQuAIAAAAAAACASQhcAQAAAAAAAMAkBK4AAAAAAAAAYBICVwAAAAAAAAAwCYErAAAAAAAAAJiEwBUAAAAAAAAATELgCgAAAAAAAAAmIXAFAAAAAAAAAJP8P6J8V3JpZ4/JAAAAAElFTkSuQmCC",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAABVwAAALvCAYAAACZeQ7oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADxCklEQVR4nOzdeVyN6f8/8NeptC9atJFKoiSh7EP2KCR7tsq+CxmMqGQdu5mxLxlK9u0jS4OIjGXsNJgohpQ1khZ1//7o1/11nJZTIqbX8/s4j++c676W93Wfuz68Xee6JIIgCCAiIiIiIiIiIiKiz6ZQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUsKEKxEREREREREREVEpYcKViIiIiIiIiIiIqJQw4UpERCJvb29YWFh80TEkEgkCAwO/6BhlYeHChahWrRoUFRVRt27dsg5HRsuWLdGyZUupsqSkJPTo0QP6+vqQSCRYtmwZAODevXto3749dHR0IJFIsG/fvq8e739NfvdfXhYWFvD29i7VeL60z5lvWbOwsECnTp2++DiBgYGQSCRffBwiIiIi+vqYcCWib45EIinVV1RUFOLj4wu83rhxY/EvvkW9vpUEwo8//giJRILevXuXdShfxKefl4KCAvT09NCxY0ecO3euxP2uXLkSISEhpRfo/3fs2DH8+OOPaNasGTZt2oS5c+eW+hgf8/b2lro/mpqaqFatGnr06IHdu3cjJydHrn4mTJiAo0ePYtq0adiyZQs6dOgAAPDy8sKNGzcwZ84cbNmyBU5OTl9yOp9l7ty5cieEP36uZs+enW+dfv36ifeUvjwLCwupZ1lVVRXW1taYPHkyXr58WaI+Y2JiEBgYiNevX5dusHJKTU1FQEAAateuDQ0NDejr66Nu3boYP348njx5UiYxEREREdHXpVTWARARfWrLli1S73///XdERkbKlGdnZ0NRUbHIera2tnj//j0AwNPTE66urlLXK1WqBBMTE1SvXl0sS01NxciRI+Hh4YFu3bqJ5UZGRp83uVIgCAK2bdsGCwsLHDx4EG/fvoWWllap9L1u3Tq5k3VfQ97nlZ2djbt372LlypVo1aoVLl68CHt7+2L3t3LlShgYGJT6asETJ05AQUEBGzZsgLKycqn2XRAVFRWsX78eAPD+/XskJCTg4MGD6NGjB1q2bIn9+/dDW1tbrH/s2LF843Z3d4efn59Y9v79e5w7dw7Tp0/HmDFjvvxEPtPcuXPRo0cPdO3aVe42qqqq2LZtG/z9/aXK3717h/3790NVVbWUo6TC1K1bF5MmTQIApKen46+//sKyZctw6tQpXLhwodj9xcTEICgoCN7e3qhYsWIpR1u4rKwstGjRAn///Te8vLwwduxYpKam4tatWwgLC4OHhwdMTU0BAP7+/pg6depXjY+IiIiIvg4mXInom9O/f3+p93/++SciIyNlyj9VWL34+HgAQP369Qvsp06dOuJ/P3/+HCNHjkSdOnWKHPdri4qKwr///osTJ07AxcUFe/bsgZeXV5Ht0tPToaysDAUF2S83vHv3DhoaGqhQocKXCLnEPv28mjdvjo4dO2LVqlVYuXJlGUYmLTk5GWpqaqWWbBUEAenp6VBTUyuwjpKSksyzOXv2bMyfPx/Tpk3D0KFDsX37dvFafrElJyfLJKSePXsGAKWaqCrs2SsLrq6u2LNnD65duwYHBwexfP/+/cjMzESHDh1w4sSJMoywfKlcubLUszxkyBBoampi0aJFuHfvHqytrcswuuLZt28frly5gtDQUPTt21fqWnp6OjIzM8X3SkpKUFLiH8WJiIiI/ou+jb/5EBF953bt2gWJRIJTp07JXFuzZg0kEglu3rwJAHj69Cl8fHxQpUoVqKiowMTEBO7u7mJSuCihoaGoVasWWrVqhbZt2yI0NFSmTlRUFCQSCcLDw+Hv74/KlStDXV0db968gbe3NzQ1NREXFwdXV1doaWmhX79+AKT3cM3KyoKenh58fHxk+n/z5g1UVVXFlZGZmZmYOXMmHB0doaOjAw0NDTRv3hwnT56Ua07yat68OQAgLi5OqnzTpk1o3bo1DA0NoaKiglq1amHVqlVSdSwsLHDr1i2cOnUq3y0iXr9+DV9fX5iZmUFFRQXVq1fHggULilzxK5FIsGnTJrx7907sN2/bgg8fPiA4OBhWVlZQUVGBhYUFfvrpJ2RkZMjE1qlTJxw9ehROTk5QU1PDmjVrSnSPpk6divbt22Pnzp24e/euWP7xnpohISGQSCQQBAG//fabGHdgYCDMzc0BAJMnT4ZEIpHa0/fx48cYNGgQjIyMoKKiAjs7O2zcuFFq/MKePQA4f/48OnToAB0dHairq8PZ2Rlnz56V6iNvi49//vlHXKWoo6MDHx8fpKWlSd37d+/eYfPmzeIc5Fm93KRJE1haWiIsLEyqPDQ0FB06dICenl6+7VauXAk7OzuoqKjA1NQUo0ePzvdr62vXroWVlRXU1NTQsGFDREdH59tfRkYGAgICUL16daioqMDMzAw//vijzPPxqaysLAQFBcHa2hqqqqrQ19fHDz/8gMjIyELbvXz5En5+frC3t4empia0tbXRsWNHXLt2Tape3me4Y8cOzJkzB1WqVIGqqiratGmDf/75p8TzLQ5jY2MAkEpIXr9+Hd7e3qhWrRpUVVVhbGyMQYMG4cWLF2KdwMBATJ48GQBgaWkpPhcf/37dunUrGjZsCHV1dejq6qJFixb5rgA/c+YMGjZsCFVVVVSrVg2///57kXHn/W5q1qyZzDVVVVWpVeef7uH66VYhH78+3vda3ucmMjISP/zwAypWrAhNTU3UrFkTP/30U5FzICIiIqLPx39WJ6JyJS0tDc+fP5cq09HR+eyVnW5ubtDU1MSOHTvg7OwsdW379u2ws7ND7dq1AQDdu3fHrVu3MHbsWFhYWCA5ORmRkZF4+PBhkQdWZWRkYPfu3eLXbz09PeHj44OnT5+KCYqPBQcHQ1lZGX5+fsjIyBBXOX748AEuLi744YcfsGjRIqirq8u0rVChAjw8PLBnzx6sWbNGaoXkvn37kJGRgT59+gDITcCuX78enp6eGDp0KN6+fYsNGzbAxcUFFy5cKLVDpPKSJrq6ulLlq1atgp2dHbp06QIlJSUcPHgQo0aNQk5ODkaPHg0AWLZsGcaOHQtNTU1Mnz4dwP9tEZGWlgZnZ2c8fvwYw4cPR9WqVRETE4Np06YhMTFRPEwqP1u2bMHatWtx4cIF8Sv+TZs2BZC7Um/z5s3o0aMHJk2ahPPnz2PevHmIjY3F3r17pfq5c+cOPD09MXz4cAwdOhQ1a9Ys8X0aMGAAjh07hsjISNSoUUPmeosWLbBlyxYMGDAA7dq1w8CBAwHkrvKuWLEiJkyYIG7nkLeXaVJSEho3bgyJRIIxY8agUqVKOHz4MAYPHow3b97A19dXaoz8nr0TJ06gY8eOcHR0REBAABQUFMRkeXR0NBo2bCjVR69evWBpaYl58+bh8uXLWL9+PQwNDbFgwQLx3g8ZMgQNGzbEsGHDAABWVlZy3SNPT09s3boV8+fPh0QiwfPnz3Hs2DFs2bIFR44ckakfGBiIoKAgtG3bFiNHjsSdO3ewatUqXLx4EWfPnhV/h2zYsAHDhw9H06ZN4evri/v376NLly7Q09ODmZmZ2F9OTg66dOmCM2fOYNiwYbC1tcWNGzewdOlS3L17t9B9aQMDAzFv3jxx7m/evMGlS5dw+fJltGvXrsB29+/fx759+9CzZ09YWloiKSkJa9asgbOzM27fvi1+1T3P/PnzoaCgAD8/P6SkpODnn39Gv379cP78ebGOvPMtTFZWlvh7OT09HVeuXMGSJUvQokULWFpaivUiIyNx//59+Pj4wNjYGLdu3cLatWtx69Yt/Pnnn5BIJOjWrRvu3r2Lbdu2YenSpTAwMACQu3UMAAQFBSEwMBBNmzbFrFmzoKysjPPnz+PEiRNo3769ONY///yDHj16YPDgwfDy8sLGjRvh7e0NR0dH2NnZFTiXvH+w+P333+Hv71+sQ7GGDx+Otm3bSpUdOXIEoaGhMDQ0BCD/c3Pr1i106tQJderUwaxZs6CiooJ//vlH5h83iIiIiOgLEYiIvnGjR48W5Pl1VVi9Bw8eCADyfZ08eVKm/rNnzwQAQkBAgNxxenp6CoaGhsKHDx/EssTEREFBQUGYNWuWIAiC8OrVKwGAsHDhQrn7/diuXbsEAMK9e/cEQRCEN2/eCKqqqsLSpUul6p08eVIAIFSrVk1IS0uTuubl5SUAEKZOnSrTv5eXl2Bubi6+P3r0qABAOHjwoFQ9V1dXoVq1auL7Dx8+CBkZGVJ1Xr16JRgZGQmDBg2SKpfnvuZ9XkFBQcKzZ8+Ep0+fCtHR0UKDBg0EAMLOnTul6n86R0EQBBcXF6kYBUEQ7OzsBGdnZ5m6wcHBgoaGhnD37l2p8qlTpwqKiorCw4cPC43Xy8tL0NDQkCq7evWqAEAYMmSIVLmfn58AQDhx4oRYZm5uLgAQjhw5Uug4hY33sStXrggAhAkTJohlzs7OMnMHIIwePVqqLO/ef/qMDh48WDAxMRGeP38uVd6nTx9BR0dH/AwKevZycnIEa2trwcXFRcjJyRHL09LSBEtLS6Fdu3ZiWUBAgABA5tnx8PAQ9PX1pco0NDQELy+vAu9FQXO7efOmAECIjo4WBEEQfvvtN0FTU1N49+6dzP1NTk4WlJWVhfbt2wvZ2dli+a+//ioAEDZu3CgIgiBkZmYKhoaGQt26daV+HtauXSsAkLr/W7ZsERQUFMTx86xevVoAIJw9e1YsMzc3l5qjg4OD4ObmJtecP5aeni4Vf949UVFREX9HCcL/fYa2trZS81i+fLkAQLhx40ax51uQvGf/01ezZs1knrX8fs63bdsmABBOnz4tli1cuFAAIDx48ECq7r179wQFBQXBw8ND5j58/EzmxfRxn8nJyYKKioowadKkQueTlpYm1KxZUwAgmJubC97e3sKGDRuEpKQkmbp5z3lB7t27J+jo6Ajt2rUT/3dF3udm6dKlAgDh2bNnhcZLRERERF8GtxQgonJl2LBhiIyMlHp9vIfj5+jduzeSk5MRFRUllu3atQs5OTno3bs3AIj7fEZFReHVq1fFHiM0NBROTk7iAV9aWlpwc3PLd1sBIPe0+YL2AR05cmSR47Vu3RoGBgZSe4G+evUKkZGR4pwAQFFRUVwBm5OTg5cvX+LDhw9wcnLC5cuX5Z7fpwICAlCpUiUYGxujefPmiI2NxeLFi9GjRw+peh/PMSUlBc+fP4ezszPu37+PlJSUIsfZuXMnmjdvDl1dXTx//lx8tW3bFtnZ2Th9+nSxY4+IiAAATJw4Uao8b3XyoUOHpMotLS3h4uJS7HHyk7cq9e3bt6XSnyAI2L17Nzp37gxBEKTukYuLC1JSUmQ+50+fvatXr+LevXvo27cvXrx4IbZ/9+4d2rRpg9OnT8ts3zBixAip982bN8eLFy/E7Qk+h52dHerUqYNt27YBAMLCwuDu7p7vau8//vgDmZmZ8PX1ldqHdujQodDW1hY/y0uXLiE5ORkjRoyQWhHu7e0NHR0dqT537twJW1tb2NjYSN3P1q1bA0Ch23FUrFgRt27dwr1794o1ZxUVFTH+7OxsvHjxQvyqeX4/pz4+PlLzyNvS4/79+8Web2EaNWok/j7+3//+hzlz5uDWrVvo0qWLeOAhIP1znp6ejufPn6Nx48YAINfvmX379iEnJwczZ86U2U/405WotWrVEucL5K6QrVmzpjj3gqipqeH8+fPitgYhISEYPHgwTExMMHbs2CK3i8jz7t07eHh4QFdXF9u2bRMPiJT3ucnbg3n//v3f1EGIREREROUFtxQgonLF2tpa5iubpSVvX8rt27ejTZs2AHK3E6hbt674tW4VFRUsWLAAkyZNgpGRERo3boxOnTph4MCB+W4J8LHXr18jIiICY8aMkdpHsVmzZti9ezfu3r0r8/Xxj7+O+zElJSVUqVKlyDkpKSmhe/fuCAsLQ0ZGBlRUVLBnzx5kZWVJJVwBYPPmzVi8eDH+/vtvZGVlFRmDPIYNG4aePXsiPT0dJ06cwIoVK5CdnS1T7+zZswgICMC5c+ek9vgEchOwRSV/7t27h+vXr4tfO/5UcnJysWNPSEiAgoKCmBzPY2xsjIoVKyIhIUGq/HPu06dSU1MB5CbkS8OzZ8/w+vVrrF27FmvXrs23zqf36NP55CUHCzvgLSUlRWq7iKpVq0pdz7v26tUrqb0wS6pv375YvHgxJkyYgJiYmAL3t8z7rD7d5kFZWRnVqlUTr+f9/08PeapQoQKqVasmVXbv3j3ExsaW6JmbNWsW3N3dUaNGDdSuXRsdOnTAgAEDpA7+y09OTg6WL1+OlStX4sGDB1I/S/r6+jL1C7v/QPHmWxgDAwOp38tubm6oWbMmevTogfXr12Ps2LEAcvegDQoKQnh4uMz9kecfVuLi4qCgoIBatWoVWffTuQO585fnH8p0dHTw888/4+eff0ZCQgKOHz+ORYsW4ddff4WOjg5mz55dZB9Dhw5FXFwcYmJipD4beZ+b3r17Y/369RgyZAimTp2KNm3aoFu3bujRo8c3c3gdERER0X8ZE65ERKVERUUFXbt2xd69e7Fy5UokJSXh7NmzmDt3rlQ9X19fdO7cGfv27cPRo0cxY8YMzJs3DydOnEC9evUK7H/nzp3IyMjA4sWLsXjxYpnroaGhCAoKkioraHXrxyvditKnTx+sWbMGhw8fRteuXbFjxw7Y2NhIrQzeunUrvL290bVrV0yePBmGhoZQVFTEvHnzZA64Ko6PE+SdOnWCoqIipk6dilatWsHJyQlAbhKlTZs2sLGxwZIlS2BmZgZlZWVERERg6dKlcq3uysnJQbt27fDjjz/mez2/fVDlJe8ejgV9ViWRd0Dbp8neksq7h/379y8wYfppsu/T+eT1sXDhwgL39M1bmZsnb1XfpwRBKDJmeXh6emLatGkYOnQo9PX1pfbw/NJycnJgb2+PJUuW5Hu9sP1PW7Rogbi4OOzfvx/Hjh3D+vXrsXTpUqxevRpDhgwpsN3cuXMxY8YMDBo0CMHBwdDT04OCggJ8fX3z/Tn50ve/MHn/aHX69Gkx4dqrVy/ExMRg8uTJqFu3LjQ1NZGTk4MOHTqU+irO0pq7ubk5Bg0aBA8PD1SrVg2hoaFFJlyXL1+Obdu2YevWrTI/K/I+N2pqajh9+jROnjyJQ4cO4ciRI9i+fTtat26NY8eOFTg/IiIiIiodTLgSEZWi3r17Y/PmzTh+/DhiY2MhCILMSlAg92CfSZMmYdKkSbh37x7q1q2LxYsXY+vWrQX2HRoaitq1ayMgIEDm2po1axAWFiaTcC0NLVq0gImJCbZv344ffvgBJ06cEA+dyrNr1y5Uq1YNe/bskUow5hfr55g+fTrWrVsHf39/8WCjgwcPIiMjAwcOHJBalZbfV7ILSn5aWVkhNTW1VFc/m5ubIycnB/fu3YOtra1YnpSUhNevX4uH63wJW7ZsgUQiKfQApeKoVKkStLS0kJ2dXeJ7lHeYlba2dqne5+IcSvSpqlWrolmzZoiKisLIkSOhpJT/H4vyPqs7d+5IrdzMzMzEgwcPxPnk1bt37574FW8g91CoBw8eSP0jhZWVFa5du4Y2bdqUaA56enrw8fGBj48PUlNT0aJFCwQGBhaacN21axdatWqFDRs2SJW/fv1aPFyqOIoz3+L68OEDgP9brf3q1SscP34cQUFBmDlzplgvv20VCvs5z8nJwe3bt0vtID956erqwsrKSvzHkIJER0fDz88Pvr6+6Nevn8z14jw3CgoKaNOmDdq0aYMlS5Zg7ty5mD59Ok6ePPnFvulBRERERLn4nSIiolLUtm1b6OnpYfv27di+fTsaNmwo9dXqtLQ0pKenS7WxsrKClpZWoXv7PXr0CKdPn0avXr3Qo0cPmZePjw/++ecfqdPDS4uCggJ69OiBgwcPYsuWLfjw4YNMEjlvtdTHq7/Onz+Pc+fOlWosFStWxPDhw3H06FFcvXq1wLFTUlKwadMmmfYaGhp4/fq1THmvXr1w7tw5HD16VOba69evxeRPcbi6ugIAli1bJlWetzLNzc2t2H3KY/78+Th27Bh69+4t81XvklJUVET37t2xe/fufBNGz549K7IPR0dHWFlZYdGiRWISrbh95Kegz1Res2fPRkBAgLiKMj9t27aFsrIyVqxYIfWcbdiwASkpKeJn6eTkhEqVKmH16tXIzMwU64WEhMjE2KtXLzx+/Bjr1q2TGe/9+/d49+5dgfG8ePFC6r2mpiaqV69e5P6gioqKMis0d+7cicePHxfariDFmW9xHTx4EADEpG1+P+eA7M8XkPtMAJCJoWvXrlBQUMCsWbNkVsSW1qrda9eu4fnz5zLlCQkJuH37tsy2FB9LTExEr1698MMPP2DhwoX51pH3uXn58qXM9bwks7z7yBIRERFRyXGFKxFRKapQoQK6deuG8PBwvHv3DosWLZK6fvfuXbRp0wa9evVCrVq1oKSkhL179yIpKQl9+vQpsN+wsDAIgoAuXbrke93V1RVKSkoIDQ1Fo0aNSnVOQO7K3V9++QUBAQGwt7eXWrEJ5H7df8+ePfDw8ICbmxsePHiA1atXo1atWvkm1z7H+PHjsWzZMsyfPx/h4eFo3749lJWV0blzZwwfPhypqalYt24dDA0NkZiYKNXW0dERq1atwuzZs1G9enUYGhqidevWmDx5Mg4cOIBOnTrB29sbjo6OePfuHW7cuIFdu3YhPj6+2CsAHRwc4OXlhbVr1+L169dwdnbGhQsXsHnzZnTt2hWtWrX6rPvw4cMHcUV0eno6EhIScODAAVy/fh2tWrUqcK/Vkpo/fz5OnjyJRo0aYejQoahVqxZevnyJy5cv448//sg3wfMxBQUFrF+/Hh07doSdnR18fHxQuXJlPH78GCdPnoS2traYZCsOR0dH/PHHH1iyZAlMTU1haWlZrJ8BZ2dnODs7F1qnUqVKmDZtGoKCgtChQwd06dIFd+7cwcqVK9GgQQP0798fQO7P/+zZszF8+HC0bt0avXv3xoMHD7Bp0yaZPU0HDBiAHTt2YMSIETh58iSaNWuG7Oxs/P3339ixYweOHj0qbpvxqVq1aqFly5ZwdHSEnp4eLl26hF27dmHMmDGFzqNTp06YNWsWfHx80LRpU9y4cQOhoaHF2m/1Y8WZb2EeP34sPsuZmZm4du0a1qxZAwMDAzERrq2tjRYtWuDnn39GVlYWKleujGPHjuHBgwcy/Tk6OgLIXRHfp08fVKhQAZ07d0b16tUxffp0BAcHo3nz5ujWrRtUVFRw8eJFmJqaYt68eSW6Dx+LjIxEQEAAunTpgsaNG0NTUxP379/Hxo0bkZGRgcDAwALbjhs3Ds+ePcOPP/6I8PBwqWt16tRBnTp15H5uZs2ahdOnT8PNzQ3m5uZITk7GypUrUaVKFfzwww+fPU8iIiIiKoJARPSNGz16tCDPr6vC6j148EAAICxcuFCuMZ89eyYAEAICAooTqiAIghAZGSkAECQSifDo0SOpa8+fPxdGjx4t2NjYCBoaGoKOjo7QqFEjYceOHYX2aW9vL1StWrXQOi1bthQMDQ2FrKws4eTJkwIAYefOnTL1vLy8BA0NjXz78PLyEszNzWXKc3JyBDMzMwGAMHv27Hyvz507VzA3NxdUVFSEevXqCf/73//y7U+e+1rU5+Xt7S0oKioK//zzjyAIgnDgwAGhTp06gqqqqmBhYSEsWLBA2LhxowBAePDggdju6dOngpubm6ClpSUAEJydncVrb9++FaZNmyZUr15dUFZWFgwMDISmTZsKixYtEjIzMwuNt6B7mpWVJQQFBQmWlpZChQoVBDMzM2HatGlCenq6VD1zc3PBzc2t0DE+HQ+A+FJXVxcsLCyE7t27C7t27RKys7Nl2jg7O0vNVxByP4vRo0dLlRV275OSkoTRo0cLZmZmQoUKFQRjY2OhTZs2wtq1a8U6hT17giAIV65cEbp16ybo6+sLKioqgrm5udCrVy/h+PHjYp2AgAABgPDs2TOptps2bZL5TP/++2+hRYsWgpqamgBA8PLyKui2yf17oKDP89dffxVsbGyEChUqCEZGRsLIkSOFV69eydRbuXKlYGlpKaioqAhOTk7C6dOn873/mZmZwoIFCwQ7OztBRUVF0NXVFRwdHYWgoCAhJSVFrGdubi41r9mzZwsNGzYUKlasKKipqQk2NjbCnDlzinxO09PThUmTJgkmJiaCmpqa0KxZM+HcuXMysRX0Gebdv02bNpVovvkxNzeXepYVFBQEQ0NDwdPTU/z5zvPvv/8KHh4eQsWKFQUdHR2hZ8+ewpMnT/L9nRIcHCxUrlxZUFBQkHlmNm7cKNSrV0+8587OzkJkZKRUTPn9PMozp/v37wszZ84UGjduLBgaGgpKSkpCpUqVBDc3N+HEiRNSdfOe84/7//hefPz6eH7yPDfHjx8X3N3dBVNTU0FZWVkwNTUVPD09hbt37xYaPxERERGVDokgfIWTD4iIiIiIiIiIiIjKAe7hSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiL6KkJAQSCQSXLp0qcA68fHxkEgkWLRoUaF9WVhYQCKRoG3btvleX7duHSQSSZHjFSYwMBASiQTPnz8vsE5UVBQkEgl27dold7+9evWCRCLBlClTCu1TIpFg69at+dZp1qwZJBIJateune/17OxsmJqaQiKR4PDhw3LHBgATJkxA/fr1oaenB3V1ddja2iIwMBCpqalytV+1ahV69uyJqlWrQiKRwNvbu1jj5322+b2sra1l6m/YsAG2trZQVVWFtbU1fvnlF5k63t7eUv0oKSnBzMwMffr0we3bt+WKS57n93Pcvn0bgYGBiI+P/yL9fy8xEBERERH9FyiVdQBEREQloaqqipMnT+Lp06cwNjaWuhYaGgpVVVWkp6eXUXT5e/PmDQ4ePAgLCwts27YN8+fPh0QiybeuqqoqwsLC0L9/f6ny+Ph4xMTEQFVVtcBxTpw4gcTERFhYWCA0NBQdO3aUO8aLFy+iefPm8PHxgaqqKq5cuYL58+fjjz/+wOnTp6GgUPi/1S5YsABv375Fw4YNkZiYKPe4eZYtWyaT3E1ISIC/vz/at28vVb5mzRqMGDEC3bt3x8SJExEdHY1x48YhLS1NJqGtoqKC9evXAwA+fPiAuLg4rF69GkeOHMHt27dhampa7FhL0+3btxEUFISWLVvCwsKi3MZARERERPRfwIQrERF9l5o1a4aLFy9i+/btGD9+vFj+77//Ijo6Gh4eHti9e3cZRihr9+7dyM7OxsaNG9G6dWucPn0azs7O+dZ1dXXFgQMH8Pz5cxgYGIjlYWFhMDIygrW1NV69epVv261bt6J+/frw8vLCTz/9hHfv3kFDQ0OuGM+cOSNTZmVlBT8/P1y4cAGNGzcutP2pU6fE1a2amppyjfmxrl27ypTNnj0bANCvXz+x7P3795g+fTrc3NzEFcZDhw5FTk4OgoODMWzYMOjq6or1lZSUZJLXjRs3RqdOnXDo0CEMHTq02LGWFUEQkJ6eDjU1tbIOhYiIiIiI8sEtBYiI6LukqqqKbt26ISwsTKp827Zt0NXVhYuLi0ybrKws/P333yVaeVkaQkND0a5dO7Rq1Qq2trYIDQ0tsK67uztUVFSwc+dOqfKwsDD06tULioqK+bZ7//499u7diz59+qBXr154//499u/f/1lx5612fP36dZF1zc3NC1y1W1JhYWGwtLRE06ZNxbKTJ0/ixYsXGDVqlFTd0aNH4927dzh06FCR/eatjFZSKtm/P3t7e0NTUxOPHz9G165doampiUqVKsHPzw/Z2dlSdcPDw+Ho6AgtLS1oa2vD3t4ey5cvB5C7XUHPnj0BAK1atRK3PoiKigKQe/87deqEo0ePwsnJCWpqalizZo24BUdISIhMbBKJBIGBgVJljx8/xuDBg2FqagoVFRVYWlpi5MiRyMzMLDIGIiIiIiKSHxOuRET03erbty8uXLiAuLg4sSwsLAw9evRAhQoVZOo/fvwYtra2mDZt2tcMEwDw5MkTnDx5Ep6engAAT09P7Nq1C5mZmfnWV1dXh7u7O7Zt2yaWXbt2Dbdu3ULfvn0LHOfAgQNITU1Fnz59YGxsjJYtWxaa2M3Phw8f8Pz5czx58gTHjh2Dv78/tLS00LBhw2L1UxquXLmC2NhYmTlfuXIFAODk5CRV7ujoCAUFBfH6x54/f47nz58jKSkJ586dw4QJE6Cvr49OnTqVOL7s7Gy4uLhAX18fixYtgrOzMxYvXoy1a9eKdSIjI+Hp6QldXV0sWLAA8+fPR8uWLXH27FkAQIsWLTBu3DgAwE8//YQtW7Zgy5YtsLW1Ffu4c+cOPD090a5dOyxfvhx169YtVpxPnjxBw4YNER4ejt69e2PFihUYMGAATp06hbS0NLliICIiIiIi+XBLASIi+m61bt0axsbG2LZtG/z9/REbG4urV69i+fLluH//flmHJ2Xbtm1QUVGBu7s7AKBPnz6YOXMmIiIi8v0aPZCbUO7cuTMePXoEMzMzhIaGolq1aoV+rX/r1q1o2rQpzMzMxHFGjRqFZ8+eoVKlSnLFeunSJTRp0kR8X7NmTRw4cAB6enpyzrb05CWLP95OAAASExOhqKgIQ0NDqXJlZWXo6+vjyZMnUuXv3r2TmX/lypVx7Ngxue9LftLT09G7d2/MmDEDADBixAjUr18fGzZswMiRIwEAhw4dgra2No4ePZrvyuRq1aqhefPmWLFiBdq1a4eWLVvK1Pnnn39w5MgRqZXbxTncatq0aXj69CnOnz8vlaSeNWsWBEFAxYoVi4yBiIiIiIjkwxWuRET03VJUVESvXr3EVaChoaEwMzND8+bN861vYWEBQRDy/Qr2lxYaGgo3NzdoaWkBAKytreHo6Fjo6tP27dtDT08P4eHhEAQB4eHh4grZ/Lx48QJHjx6VqtO9e3dIJBLs2LFD7lhr1aqFyMhI7Nu3Dz/++CM0NDRkDrL6GnJychAeHo569erJrLR8//49lJWV822nqqqK9+/fy5RFRkYiMjISR48exZo1a6CpqQlXV1fcvXv3s+IcMWKE1PvmzZtLJfwrVqyId+/eITIyssRjWFpa5rtNhjxycnKwb98+dO7cWWZFMIBS3wKCiIiIiKi84wpXIiL6rvXt2xcrVqzAtWvXEBYWhj59+nxzCaTY2FhcuXIFAwcOxD///COWt2zZEr/99hvevHkDbW1tmXYVKlRAz549ERYWhoYNG+LRo0eFbiewfft2ZGVloV69elLjNGrUCKGhoRg9ejQA4OXLl1JbGaipqUFHR0d8r62tjbZt2wLI3Us2LCwM7u7uuHz5MhwcHEp+I/6/9+/fIyUlRaosbz/Vj506dQqPHz/GhAkTZK6pqakVuB1DfgdKKSoqinPK4+rqCmtra0ybNk080OzZs2dSdfT09ApM7AK5idxPV8jq6upKHWg2atQo7NixAx07dkTlypXRvn179OrVCx06dCiw309ZWlrKXfdTz549w5s3b1C7du0S90FERERERPLjClciIvquNWrUCFZWVvD19cWDBw8KTUiWla1btwIAJkyYAGtra/G1ePFipKenY/fu3QW27du3L65evYrAwEA4ODigVq1aBdbNWy3brFkzqXHOnDmDc+fOiasuu3XrBhMTE/E1fvz4QuPv1q0bgNyDn0rD9u3bpcY3MTEpcD4KCgr5ruo1MTFBdnY2kpOTpcozMzPx4sULmJqaFhlHlSpVULNmTZw+fRoA8OjRI5m4YmJiCu2joMPLPmZoaIirV6/iwIED6NKlC06ePImOHTvCy8uryLZ5Pk0gAwWvTP30wC4iIiIiIvq6uMKViIi+e56enpg9ezZsbW2LfZjQlyYIAsLCwtCqVSuMGjVK5npwcDBCQ0Ph4+OTb/sffvgBVatWRVRUFBYsWFDgOA8ePEBMTAzGjBkDZ2dnqWs5OTkYMGAAwsLC4O/vj8WLF0utwCwqOZmRkYGcnByZVakl5eLiUuTX6zMyMrB79260bNky3/jyPudLly7B1dVVLL906RJycnLkfg4+fPggbpdgbGwsE1dprOgFcveW7dy5Mzp37oycnByMGjUKa9aswYwZM1C9evUSrcrW1dUFALx+/VqqPCEhQep9pUqVoK2tjZs3bxba37e2MpyIiIiI6HvFhCsREX33hgwZAkVFRTRq1KjQellZWYiLi4OOjk6BqypL29mzZxEfH49Zs2ahR48eMtfv3r2LGTNm4MmTJ/kmFiUSCVasWIErV65gwIABBY6Tt7r1xx9/FA/M+tj69esRGhoKf39/ODo65tvH69evoaGhgQoVKsi0BSC1/2daWhoePnwIAwMDGBgYFBhXfgpb1ZonIiICr1+/ljksK0/r1q2hp6eHVatWSSVcV61aBXV1dbi5uRUZx927d3Hnzh3xfqiqqspsO1AaXrx4AX19ffG9goIC6tSpAyA3sQwAGhoaAGSTp4XR1taGgYEBTp8+DV9fX7F85cqVUvUUFBTQtWtXbN26FZcuXZLZx1UQBEgkkhLFQEREREREsphwJSKir2rjxo04cuSITPnHX2s/fvw40tPTZep07do1330ozc3NERgYWOTYjx8/hq2tLby8vOQ+OGvJkiVQV1eXKlNQUMBPP/0kvt+9ezf+/vtvmbZeXl4IDQ2FoqJigQnALl26YPr06QgPD8fEiRPzrePu7g53d/dC4wwNDUXdunXzTbbmjTN27FhcvnwZ9evXz7dOVFQUxo0bhx49esDa2hqZmZmIjo7Gnj174OTkhP79+4t1L1y4gFatWiEgIEDq3h88eBDXrl0DkJvgvn79OmbPni3GkJdoLEpoaChUVFTQvXv3fK+rqakhODgYo0ePRs+ePeHi4oLo6Ghs3boVc+bMgZ6enlT9Dx8+iFs75OTkID4+HqtXr0ZOTg4CAgLkiqmkhgwZgpcvX6J169aoUqUKEhIS8Msvv6Bu3briYWB169aFoqIiFixYgJSUFKioqKB169YwNDQssu/58+djyJAhcHJywunTp/M9BGzu3Lk4duwYnJ2dMWzYMNja2iIxMRE7d+7EmTNnULFixRLHQERERERE0phwJSKir2rVqlX5lnt7e4v/feTIkXyTshYWFl/94J958+bJlCkqKkolXAva29TZ2Rk7d+5E06ZNZRKAeWrXrg1LS0ts3bq1wIRrUS5fvoy///4bM2bMKLBO586dMXbsWGzdurXAhKu9vT1atWqF/fv3IzExEYIgwMrKCjNnzsTkyZMLPTwqz+7du7F582bx/ZUrV3DlyhUAuXumypNwffPmDQ4dOgQ3Nzepw7w+NWrUKFSoUAGLFy/GgQMHYGZmhqVLl+a7J21GRobUCmFtbW00aNAAW7ZsQZs2bYqM6XP0798fa9euxcqVK/H69WsYGxujd+/eCAwMhIJC7nb6xsbGWL16NebNm4fBgwcjOzsbJ0+eLDLZOXPmTDx79gy7du0SD+Y6fPiwTLvKlSvj/PnzmDFjBkJDQ/HmzRtUrlwZHTt2FP9BoaQxEBERERGRNIkgCEJZB0FERERERERERET0X6BQ1gEQERERERERERER/Vcw4UpERERERERERERUSphwJSIiIiIiIiIiIiolTLgSERERERERERERlRImXImIiIiIiIiIiIhKiVJZB/AtyMnJwZMnT6ClpQWJRFLW4RAREREREdF/nCAIePv2LUxNTaGgwLVQRET/JUy4Anjy5AnMzMzKOgwiIiIiIiIqZx49eoQqVaqUdRhERFSKmHAFoKWlBSD3f+i0tbXLOJryISsrC8eOHUP79u1RoUKFsg6H6Ivgc07lAZ9zKg/4nFN5wOf863vz5g3MzMzEv48SEdF/BxOugLiNgLa2NhOuX0lWVhbU1dWhra3NP9DRfxafcyoP+JxTecDnnMoDPudlh9vaERH993CjGCIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolHAPVyIiIiIiIiIqVdnZ2cjKyirrMIiISo2ysjIUFORbu8qEKxERERERERGVCkEQ8PTpU7x+/bqsQyEiKlUKCgqwtLSEsrJykXWZcCUiIiIiIiKiUpGXbDU0NIS6ujokEklZh0RE9NlycnLw5MkTJCYmomrVqkX+bmPClYiIiIiIiIg+W3Z2tphs1dfXL+twiIhKVaVKlfDkyRN8+PABFSpUKLQuD80iIiIiIiIios+Wt2erurp6GUdCRFT68rYSyM7OLrIuE65EREREREREVGq4jQAR/RcV53cbE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrEREREREREVE5FBUVBYlEgtevX8vdxsLCAsuWLftiMRWkJLF+Td7e3ujatWup99uyZUv4+vqWer/0ZTHhSkRERERERET0jfH29oZEIsGIESNkro0ePRoSiQTe3t5fP7BvmIWFBSQSCSQSCRQVFWFqaorBgwfj1atXxernayU5s7OzMX/+fNjY2EBNTQ16enpo1KgR1q9fL9bZs2cPgoODv3gsVLqYcCUiIiIiIiIi+gaZmZkhPDwc79+/F8vS09MRFhaGqlWrlmFk365Zs2YhMTERDx8+RGhoKE6fPo1x48aVdVj5CgoKwtKlSxEcHIzbt2/j5MmTGDZsmNQqXj09PWhpaZVdkFQiTLgSEREREREREX2D6tevDzMzM+zZs0cs27NnD6pWrYp69epJ1c3IyMC4ceNgaGgIVVVV/PDDD7h48aJUnYiICNSoUQNqampo1aoV4uPjZcY8c+YMmjdvDjU1NZiZmWHcuHF49+6d3DFfvHgR7dq1g4GBAXR0dODs7IzLly9L1ZFIJFi/fj08PDygrq4Oa2trHDhwoNix5kdLSwvGxsaoXLkyWrVqBS8vL6nxX7x4AU9PT1SuXBnq6uqwt7fHtm3bxOve3t44deoUli9fLq6WzRv71q1b6NSpE7S1taGlpYXmzZsjLi5OavxFixbBxMQE+vr6GD16NLKysgqM9cCBAxg1ahR69uwJS0tLODg4YPDgwfDz8xPrfLzaNm9bhU9fH6903r9/P+rXrw9VVVVUq1YNQUFB+PDhg1z3jkoPE65ERERERERERN+oQYMGYdOmTeL7jRs3wsfHR6bejz/+iN27d2Pz5s24fPkyqlevDhcXF7x8+RIA8OjRI3Tr1g2dO3fG1atXMWTIEEydOlWqj7i4OHTo0AHdu3fH9evXsX37dpw5cwZjxoyRO963b9/Cy8sLZ86cwZ9//glra2u4urri7du3UvWCgoLQq1cvXL9+Ha6urujXr1+xYpXH48ePcfDgQTRq1EgsS09Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuAAAWL58OZo0aYKhQ4ciMTERiYmJMDMzw+PHj9GiRQuoqKjgxIkT+OuvvzBo0CCpZObJkycRFxeHkydPYvPmzQgJCUFISEiB8RkbG+PEiRN49uyZXPNp2rSpGFNiYiJOnDgBVVVVtGjRAgAQHR2NgQMHYvz48bh9+zbWrFmDkJAQzJkzp9j3jj6TQEJKSooAQEhJSSnrUMqNzMxMYd++fUJmZmZZh0L0xfA5p/KAzzmVB3zOqTzgc/71/Rf/Hvr+/Xvh9u3bwvv378s6lP8ELy8vwd3dXUhOThZUVFSE+Ph4IT4+XlBVVRWePXsmuLu7C15eXoIgCEJqaqpQoUIFITQ0VGyfmZkpmJqaCj///LMgCIIwbdo0oVatWlJjTJkyRQAgvHr1ShAEQRg8eLAwbNgwqTrR0dGCgoKC+Lmam5sLS5culXse2dnZgpaWlnDw4EGxDIDg7+8vvk9NTRUACIcPH5Y71vyYm5sLysrKgoaGhqCqqioAEBo1alRoG0EQBDc3N2HSpEnie2dnZ2H8+PFSdaZNmyZYWloW+HvSy8tLMDc3Fz58+CCW9ezZU+jdu3eB4966dUuwtbUVFBQUBHt7e2H48OFCRESEVJ38YhEEQXj+/LlQrVo1YdSoUWJZmzZthLlz50rV27Jli2BiYlJgDCS/4vyO4wpXIiIiIiIiIqJvVKVKleDm5oaQkBBs2rQJbm5uMDAwkKoTFxeHrKwsNGvWTCyrUKECGjZsiNjYWABAbGys1EpPAGjSpInU+2vXriEkJASampriy8XFBTk5OXjw4IFc8SYlJWHo0KGwtraGjo4OtLW1kZqaiocPH0rVq1OnjvjfGhoa0NbWRnJystyxFmTy5Mm4evUqrl+/juPHjwMA3NzckJ2dDSD3oKrg4GDY29tDT08PmpqaOHr0qEx8n7p69SqaN2+OChUqFFjHzs4OioqK4nsTExNxTvmpVasWbt68iT///BODBg1CcnIyOnfujCFDhhQaS1ZWFrp37w5zc3MsX75cLL927RpmzZol9fnlrdRNS0srtE8qXUplHQARERERERERERVs0KBB4tf6f/vtty82TmpqKoYPH57vIVPyHtLl5eWFFy9eYPny5TA3N4eKigqaNGmCzMxMqXqfJi4lEglycnJKHvz/Z2BggOrVqwMArK2tsWzZMjRp0gQnT55E27ZtsXDhQixfvhzLli2Dvb09NDQ04OvrKxPfp9TU1IocuyRzUlBQQIMGDdCgQQP4+vpi69atGDBgAKZPnw5LS8t824wcORKPHj3ChQsXoKT0f6m91NRUBAUFoVu3bjJtVFVVi4yfSg8TrkRERERERERE37AOHTogMzMTEokELi4uMtetrKygrKyMs2fPwtzcHEDuKsiLFy+KBy7Z2trKHEz1559/Sr2vX78+bt++LSYsS+Ls2bNYuXIlXF1dAeTux/r8+fNi9SFPrPLKW3H6/v17MT53d3f0798fAJCTk4O7d++iVq1aYhtlZWVxRWyeOnXqYPPmzcjKyip0levnyoujoIPKlixZgh07diAmJgb6+vpS1+rXr487d+581udHpYNbChARERERERERfcMUFRURGxuL27dvS31lPY+GhgZGjhyJyZMn48iRI7h9+zaGDh2KtLQ0DB48GAAwYsQI3Lt3D5MnT8adO3cQFhYmc6DTlClTEBMTgzFjxuDq1au4d+8e9u/fX6xDs6ytrbFlyxbExsbi/Pnz6Nevn1yrQz8mT6wFefv2LZ4+fYrExERcuHABkydPRqVKldC0aVMxvsjISMTExCA2NhbDhw9HUlKSVB8WFhY4f/484uPj8fz5c+Tk5GDMmDF48+YN+vTpg0uXLuHevXvYsmUL7ty5U6y5faxHjx5YunQpzp8/j4SEBERFRWH06NGoUaMGbGxsZOr/8ccf+PHHH7Fw4UIYGBjg6dOnePr0KVJSUgAAM2fOxO+//46goCDcunULsbGxCA8Ph7+/f4ljpJJhwpWIiIiIiIiI6Bunra0NbW3tAq/Pnz8f3bt3x4ABA1C/fn38888/OHr0KHR1dQHkbgmwe/du7Nu3Dw4ODli9ejXmzp0r1UedOnVw6tQp3L17F82bN0e9evUwc+ZMmJqayh3nhg0b8OrVK9SvXx8DBgzAuHHjYGhoWKy5yhNrQWbOnAkTExOYmpqiU6dO0NDQwLFjx8TVoP7+/qhfvz5cXFzQsmVLGBsbo2vXrlJ9+Pn5QVFREbVq1UKlSpXw8OFD6Ovr48SJE0hNTYWzszMcHR2xbt26z1rt6uLigoMHD6Jz586oUaMGvLy8YGNjg2PHjkltFZDnzJkzyM7OxogRI2BiYiK+xo8fL/b3v//9D8eOHUODBg3QuHFjLF26VFz1TF+PRBAEoayDKGtv3ryBjo4OUlJSCv3lRaUnKysLERERcHV1/aJL8YnKEp9zKg/4nFN5wOecygM+51/ff/Hvoenp6Xjw4AEsLS25XyQR/ecU53ccV7gSERERERERERERlRImXImIiIiIiIiIiIhKCROuRERERERERERERKWECVciIiIiIiIiIiKiUiJ75BnRt+jeseLVt27/ZeIgIiIiIiIiIiIqBFe4EhEREREREREREZUSJlyJiIiIiIiIiIiISgkTrkRERERERERERESlhAlXIiIiIiIiIiIiolLCQ7OIiIiIiIiI6Ita/mr5VxtrvO74ErV78eIFbG1tceHCBVhYWJRuUN+hFi1aYMSIEejbty8AQCKRYO/evejatWuZxWRhYQFfX1/4+vp+sZgaN26MyZMno3v37qXWJwBkZmaiRo0a2LVrF5ycnEq1b/r2cIUrEREREREREZV7c+bMgbu7u5hsjY+Ph0QiwdWrV796LN7e3pBIJJBIJKhQoQKMjIzQrl07bNy4ETk5OV98/AMHDiApKQl9+vT54mN9jsTERHTs2LFU+/T398fUqVM/6z7Pnz8fEolETAwDgLKyMvz8/DBlypRSiJK+dUy4EhEREREREVG5lpaWhg0bNmDw4MFfddzMzMwCr3Xo0AGJiYmIj4/H4cOH0apVK4wfPx6dOnXChw8fvmhcK1asgI+PDxQUvu20kbGxMVRUVEq1z44dO+Lt27c4fPhwidpfvHgRa9asQZ06dWSu9evXD2fOnMGtW7c+N0z6xn3bPzlERERERERERF9YREQEVFRU0Lhx4wLrREVFQSKR4Pjx43BycoK6ujqaNm2KO3fuSNU7ePAgGjRoAFVVVRgYGMDDw0O8ZmFhgeDgYAwcOBDa2toYNmxYgeOpqKjA2NgYlStXRv369fHTTz9h//79OHz4MEJCQsR6r1+/xpAhQ1CpUiVoa2ujdevWuHbtmtwxferZs2c4ceIEOnfuLHMtb0WpmpoaqlWrhl27dkldnzJlCmrUqAF1dXVUq1YNM2bMQFZWlnj92rVraNWqFbS0tKCtrQ1HR0dcunRJvH7mzBk0b94campqMDMzw7hx4/Du3bsCY5VIJNi3bx+A/1uRvGfPHrRq1Qrq6upwcHDAuXPnpNoUNYaioiJcXV0RHh5e4LgFSU1NRb9+/bBu3Tro6urKXNfV1UWzZs1K1Dd9X5hwJSIiIiIiIqJyLTo6Go6OjnLVnT59OhYvXoxLly5BSUkJgwYNEq8dOnQIHh4ecHV1xZUrV3D8+HE0bNhQqv2iRYvg4OCAK1euYMaMGcWKs3Xr1nBwcMCePXvEsp49eyI5ORmHDx/GX3/9hfr166NNmzZ4+fKl3DF97MyZM1BXV4etra3MtRkzZqB79+64du0a+vXrhz59+iA2Nla8rqWlhZCQENy+fRvLly/HunXrsHTpUvF6v379UKVKFVy8eBF//fUXpk6digoVKgAA4uLi0KFDB3Tv3h3Xr1/H9u3bcebMGYwZM6ZY92j69Onw8/PD1atXUaNGDXh6eoorguUdo2HDhoiOji7WuAAwevRouLm5oW3btgXWKWnf9H3hoVlEREREREREVK4lJCTA1NRUrrpz5syBs7MzAGDq1Klwc3NDeno6VFVVMWfOHPTp0wdBQUFifQcHB6n2rVu3xqRJk0ocq42NDa5fvw4gNzl64cIFJCcni1+tX7RoEfbt24ddu3Zh2LBhcsX0sYSEBBgZGeW7nUDPnj0xZMgQAEBwcDAiIyPxyy+/YOXKlQBy9z/NY2FhAT8/P4SHh+PHH38EADx8+BCTJ0+GjY0NAMDa2lqsP2/ePPTr10/c99Ta2horVqyAs7MzVq1aBVVVVbnuj5+fH9zc3AAAQUFBsLOzwz///AMbGxu5xzA1NcWjR4+Qk5Mj97YK4eHhuHz5Mi5evFhoPVNTUyQkJMjVJ32/uMKViIiIiIiIiMq19+/fy53Q+3hvThMTEwBAcnIyAODq1ato06ZNoe0/94R6QRAgkUgA5H5FPzU1Ffr6+tDU1BRfDx48QFxcnNwxfaywe9GkSROZ9x+vcN2+fTuaNWsGY2NjaGpqwt/fHw8fPhSvT5w4EUOGDEHbtm0xf/58Mca8uYSEhEjNw8XFBTk5OXjw4IHc8Rf2+cg7hpqaGnJycpCRkSHXmI8ePcL48eMRGhpa5HOkpqaGtLQ0uedD3yeucCUiIiIiIiKics3AwACvXr2Sq27eV+ABiInPvBPt1dTUimyvoaFRggj/T2xsLCwtLQHk7hlqYmKCqKgomXoVK1aUO6aPFedefOzcuXPo168fgoKC4OLiAh0dHYSHh2Px4sVincDAQPTt2xeHDh3C4cOHERAQgPDwcHh4eCA1NRXDhw/HuHHjZPquWrWq3HEU9vnIO8bLly+hoaEh973766+/kJycjPr164tl2dnZOH36NH799VdkZGRAUVFR7LtSpUpyz4e+T0y4EhEREREREVG5Vq9ePWzduvWz+6lTpw6OHz8OHx+fUohK1okTJ3Djxg1MmDABAFC/fn08ffoUSkpKsLCwKJWY6tWrh6dPn+LVq1cyBz/9+eefGDhwoNT7evXqAQBiYmJgbm6O6dOni9fz++p8jRo1UKNGDUyYMAGenp7YtGkTPDw8UL9+fdy+fRvVq1eXK86SkHeMmzdvivOSR5s2bXDjxg2pMh8fH9jY2GDKlClisrUkfdP3iVsKEBEREREREVG55uLiglu3bpVoZefHAgICsG3bNgQEBCA2NhY3btzAggULStRXRkYGnj59isePH+Py5cuYO3cu3N3d0alTJzHp2bZtWzRp0gRdu3bFsWPHEB8fj5iYGEyfPh2XLl0qUUz16tWDgYEBzp49K3Nt586d2LhxI+7evYuAgABcuHBBPHDK2toaDx8+RHh4OOLi4rBixQrs3btXbPv+/XuMGTMGUVFRSEhIwNmzZ3Hx4kXxcK4pU6YgJiYGY8aMwdWrV3Hv3j3s37+/2IdmFUbeMaKjo9G+fXu5+9XS0kLt2rWlXhoaGtDX10ft2rU/q2/6PnGFKxERERERERF9UeN1x5d1CIWyt7dH/fr1sWPHDgwfPrzE/bRs2RI7d+5EcHAw5s+fD21tbbRo0aJEfR05cgQmJiZQUlKCrq4uHBwcsGLFCnh5eYkHOUkkEkRERGD69Onw8fHBs2fPYGxsjBYtWsDIyKhEMSkqKsLHxwehoaHo1KmT1LWgoCCEh4dj1KhRMDExwbZt21CrVi0AQJcuXTBhwgSMGTMGGRkZcHNzw4wZMxAYGCj2++LFCwwcOBBJSUkwMDBAt27dxMO86tSpg1OnTmH69Olo3rw5BEGAlZUVevfuXaL7lx95xnj8+DFiYmKkVjzHx8fD0tISJ0+eRMuWLUs8/rlz55CSkoIePXp8zjToOyARBEEo6yDK2ps3b6Cjo4OUlBRoa2uXdTjlQlZWFiIiIuDq6iq1v0qB7h0r3gDW/NciKnvFfs6JvkN8zqk84HNO5QGf86/vv/j30PT0dDx48ACWlpZyH0D1LTl06BAmT56Mmzdvyn0y/X/V06dPYWdnh8uXL8Pc3Lysw/mqpkyZglevXmHt2rVi2cmTJ9GtWzfcv39fZpuF4ujduzccHBzw008/lUao9JUV53ccV7gSERERERERUbnn5uaGe/fu4fHjxzAzMyvrcMqUsbExNmzYgIcPH5a7hKuhoSEmTpwoVRYREYGffvrps5KtmZmZsLe3F/ffpf82JlyJiIiIiIiIiAD4+vqWdQjfjK5du5Z1CGVi0qRJMmULFy787H6VlZXh7+//2f3Q96F8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolCiVdQBERERERERE9N82/8rzrzbW1HoGJWr34sUL2Nra4sKFC7CwsCjdoL5DLVq0wIgRI9C3b18AgEQiwd69e9G1a9cyi8nCwgK+vr7w9fX9YjE1btwYkydPRvfu3Uutz6/RN31buMKViIiIiIiIiMq9OXPmwN3dXUy2xsfHQyKR4OrVq189Fm9vb0gkEkgkElSoUAFGRkZo164dNm7ciJycnC8+/oEDB5CUlIQ+ffp88bE+R2JiIjp27Fiqffr7+2Pq1KnFvs/Z2dmYMWMGLC0toaamBisrKwQHB0MQhM/um74/TLgSERERERERUbmWlpaGDRs2YPDgwV913MzMzAKvdejQAYmJiYiPj8fhw4fRqlUrjB8/Hp06dcKHDx++aFwrVqyAj48PFBS+7bSRsbExVFRUSrXPjh074u3btzh8+HCx2i1YsACrVq3Cr7/+itjYWCxYsAA///wzfvnll8/um74/3/ZPDhERERERERHRFxYREQEVFRU0bty4wDpRUVGQSCQ4fvw4nJycoK6ujqZNm+LOnTtS9Q4ePIgGDRpAVVUVBgYG8PDwEK9ZWFggODgYAwcOhLa2NoYNG1bgeCoqKjA2NkblypVRv359/PTTT9i/fz8OHz6MkJAQsd7r168xZMgQVKpUCdra2mjdujWuXbsmd0yfevbsGU6cOIHOnTvLXMtbUaqmpoZq1aph165dUtenTJmCGjVqQF1dHdWqVcOMGTOQlZUlXr927RpatWoFLS0taGtrw9HREZcuXRKvnzlzBs2bN4eamhrMzMwwbtw4vHv3rsBYJRIJ9u3bB+D/ViTv2bMHrVq1grq6OhwcHHDu3DmpNkWNoaioCFdXV4SHhxc4bn5iYmLg7u4ONzc3WFhYoEePHmjfvj0uXLjw2X3T94cJVyIiIvp67h0r3ouIiIjoK4iOjoajo6NcdadPn47Fixfj0qVLUFJSwqBBg8Rrhw4dgoeHB1xdXXHlyhUcP34cDRs2lGq/aNEiODg44MqVK5gxY0ax4mzdujUcHBywZ88esaxnz55ITk7G4cOH8ddff6F+/fpo06YNXr58KXdMHztz5gzU1dVha2src23GjBno3r07rl27hn79+qFPnz6IjY0Vr2tpaSEkJAS3b9/G8uXLsW7dOixdulS83q9fP1SpUgUXL17EX3/9halTp6JChQoAgLi4OHTo0AHdu3fH9evXsX37dpw5cwZjxowp1j2aPn06/Pz8cPXqVdSoUQOenp7iimB5x2jYsCGio6OLNW7Tpk1x/Phx3L17F0BucvnMmTMyWx6UpG/6/vDQLCIiIiIiIiIq1xISEmBqaipX3Tlz5sDZ2RkAMHXqVLi5uSE9PR2qqqqYM2cO+vTpg6CgILG+g4ODVPvWrVtj0qRJJY7VxsYG169fB5CbHL1w4QKSk5PFr9YvWrQI+/btw65duzBs2DC5YvpYQkICjIyM8t1OoGfPnhgyZAgAIDg4GJGRkfjll1+wcuVKALl7lOaxsLCAn58fwsPD8eOPPwIAHj58iMmTJ8PGxgYAYG1tLdafN28e+vXrJx6IZW1tjRUrVsDZ2RmrVq2CqqqqXPfHz88Pbm5uAICgoCDY2dnhn3/+gY2NjdxjmJqa4tGjR8jJyZF7W4WpU6fizZs3sLGxgaKiIrKzszFnzhz069dPql5J+qbvDz9ZIiIiIiIiIirX3r9/L3dCr06dOuJ/m5iYAACSk5MBAFevXkWbNm0Kbe/k5FTCKHMJggCJRAIgdxVlamoq9PX1oampKb4ePHiAuLg4uWP6WGH3okmTJjLvP17hun37djRr1gzGxsbQ1NSEv78/Hj58KF6fOHEihgwZgrZt22L+/PlijHlzCQkJkZqHi4sLcnJy8ODBA7njL+zzkXcMNTU15OTkICMjQ+5xd+zYgdDQUISFheHy5cvYvHkzFi1ahM2bN0vVK0nf9P3hClciIiIiIiIiKtcMDAzw6tUruermfQUegJj4zDt1Xk1Nrcj2GhoaJYjw/8TGxsLS0hIAkJqaChMTE0RFRcnUq1ixotwxfaw49+Jj586dQ79+/RAUFAQXFxfo6OggPDwcixcvFusEBgaib9++OHToEA4fPoyAgACEh4fDw8MDqampGD58OMaNGyfTd9WqVeWOo7DPR94xXr58CQ0NjWLdu8mTJ2Pq1Kno06cPAMDe3h4JCQmYN28evLy8Pqtv+v4w4UpEREREVJqKu/+wdfsvEwcREcmtXr162Lp162f3U6dOHRw/fhw+Pj6lEJWsEydO4MaNG5gwYQIAoH79+nj69CmUlJRgYWFRKjHVq1cPT58+xatXr6Crqyt17c8//8TAgQOl3terVw9A7qFR5ubmmD59ung9ISFBpv8aNWqgRo0amDBhAjw9PbFp0yZ4eHigfv36uH37NqpXry5XnCUh7xg3b94U5yWvtLQ0mS0CFBUVxWTv5/RN3x9uKUBERERERERE5ZqLiwtu3bpVopWdHwsICMC2bdsQEBCA2NhY3LhxAwsWLChRXxkZGXj69CkeP36My5cvY+7cuXB3d0enTp3EpGfbtm3RpEkTdO3aFceOHUN8fDxiYmIwffp0XLp0qUQx1atXDwYGBjh79qzMtZ07d2Ljxo24e/cuAgICcOHCBfHAKWtrazx8+BDh4eGIi4vDihUrsHfvXrHt+/fvMWbMGERFRSEhIQFnz57FxYsXxcO5pkyZgpiYGIwZMwZXr17FvXv3sH///mIfmlUYeceIjo5G+/bF+wfRzp07Y86cOTh06BDi4+Oxd+9eLFmyBB4eHp/dN31/uMKViIiIiIiIiL6oqfUMyjqEQtnb26N+/frYsWMHhg8fXuJ+WrZsiZ07dyI4OBjz58+HtrY2WrRoUaK+jhw5AhMTEygpKUFXVxcODg5YsWIFvLy8xJWUEokEERERmD59Onx8fPDs2TMYGxujRYsWMDIyKlFMioqK8PHxQWhoKDp16iR1LSgoCOHh4Rg1ahRMTEywbds21KpVCwDQpUsXTJgwAWPGjEFGRgbc3NwwY8YMBAYGiv2+ePECAwcORFJSEgwMDNCtWzfxMK86derg1KlTmD59Opo3bw5BEGBlZYXevXuX6P7lR54xHj9+jJiYGKkVz/Hx8bC0tMTJkyfRsmXLfPv+5ZdfMGPGDIwaNQrJyckwNTXF8OHDMXPmzEL7pv8miSAIQlkHUdbevHkDHR0dpKSkQFtbu6zDKReysrIQEREBV1dXqf1VCsSv5tF3qNjPOdF3iL/PqTzgc07lAf/c8vX9F/8emp6ejgcPHsDS0lLuA6i+JYcOHcLkyZNx8+bNcn96/NOnT2FnZ4fLly/D3Ny8rMP5qqZMmYJXr15h7dq1YtnJkyfRrVs33L9/X2abhc/tm74fxfkdxxWuRERERERERFTuubm54d69e3j8+DHMzMzKOpwyZWxsjA0bNuDhw4flLuFqaGiIiRMnSpVFRETgp59++qxka0F9038TE65ERERERERERAB8fX3LOoRvRteuXcs6hDIxadIkmbKFCxd+sb7pv6l8r5EnIiIiIiIiIiIiKkVMuBIRERERERERERGVEiZciYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSolTWARAR0f9371jx6lu3/zJxEBEREREREVGJcYUrEREREREREZV7L168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJQcWKFctsfCq+1atXo3PnzmUdxneNK1yJiIiIiIiI6ItKCQr6amPpBASUqN2cOXPg7u4OCwsLALmJ1FOnThVYPyoqCs7OziUa62tp2rQpEhMToaOj80XHkUgk2Lt3L7p27fpFx/kSAgMDER4ejkePHkFZWRmOjo6YM2cOGjVqVGCbFy9eoF+/frh+/bqYqHd3d8fcuXOhra0NADhz5gymTJmCv//+G2lpaTA3N8fw4cMxYcIEqb5+++03LFy4EE+fPoWDgwN++eUXNGzYULxuYWGBhIQEbNu2DX369JFqa2dnh9u3b2PTpk3w9vYu9txfvHgBBwcHPH78GK9evRIT44MGDUJwcDCio6PRvHnzYvdLXOFKREREREREROVcWloaNmzYgMGDB4tle/bsQWJiotQrISEBtWvXhpOTU6EJuW+FsrIyjI2NIZFIyjqUb1aNGjXw66+/4saNGzhz5gwsLCzQvn17PHv2rMA2CgoKcHd3x4EDB3D37l2EhITgjz/+wIgRI8Q6GhoaGDNmDE6fPo3Y2Fj4+/vD398fa9euFets374dEydOREBAAC5fvgwHBwe4uLggOTlZajwzMzNs2rRJquzPP//E06dPoaGhUeK5Dx48GHXq1JEpV1ZWRt++fbFixYoS913eMeFKREREREREROVaREQEVFRU0LhxY7FMT08PxsbGUq/g4GA8f/4ce/fuhaqqqlg3JycHP/74o9gmMDBQvBYfHw+JRIKrV6+KZa9fv4ZEIkFUVJRYdurUKTRs2BAqKiowMTHB1KlT8eHDB/F6y5YtMXbsWPj6+kJXVxdGRkZYt24d3r17Bx8fH2hpaaF69eo4fPiw2ObTLQXyvt5/9OhR2NraQlNTEx06dEBiYqLU/di4cSPs7OzEWMaMGfOZd/j/xMXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23bthUZT9++fdG2bVtUq1YNdnZ2WLJkCd68eYPr168X2EZXVxcjR46Ek5MTzM3N0aZNG4waNQrR0dFinXr16sHT0xN2dnawsLBA//794eLiIlVnyZIlGDp0KHx8fFCrVi2sXr0a6urq2Lhxo9R4/fr1w6lTp/Do0SOxbOPGjejXrx+UlEr25fVVq1bh9evX8PPzy/d6586dceDAAbx//75E/Zd3TLgSERERERERUbkWHR0NR0fHQuusXLkSv//+O3bv3o0qVapIXdu8eTM0NDRw/vx5/Pzzz5g1axYiIyPlHv/x48dwdXVFgwYNcO3aNaxatQobNmzA7NmzZcYxMDDAhQsXMHbsWIwcORI9e/ZE06ZNcfnyZbRv3x4DBgxAWlpagWOlpaVh0aJF2LJlC06fPo2HDx9KJd1WrVqF0aNHY9iwYbhx4wYOHDiA6tWryz2XoqSmpsLV1RXHjx/HlStX0KFDB3Tu3BkPHz6Uqrd06VI0a9YMV65cgZubGwYMGICBAweif//+uHz5MqysrDBw4EAIggAASE9Ph6OjIw4dOoSbN29i2LBhGDBgAC5cuCB3bJmZmVi7di10dHTg4OAgd7snT55gz549hW4xceXKFcTExIh1MjMz8ddff6Ft27ZiHQUFBbRt2xbnzp2TamtkZAQXFxds3rwZQO5nuH37dgwaNEjuGD92+/ZtzJo1C7///jsUFPJPDTo5OeHDhw84f/58icYo75hwJSIiIiIiIqJyLSEhAaampgVeP336NHx9ffHbb7+hadOmMtfr1KmDgIAAWFtbY+DAgXBycsLx48flHn/lypUwMzPDr7/+ChsbG3Tt2hVBQUFYvHgxcnJyxHoODg7w9/eHtbU1pk2bBlVVVRgYGGDo0KGwtrbGzJkz8eLFi0JXZ2ZlZWH16tVwcnJC/fr1MWbMGKlYZ8+ejUmTJmH8+PGoUaMGGjRoUKqHgjk4OGD48OGoXbs2rK2tERwcDCsrKxw4cECqnqurK4YPHy7O682bN2jQoAF69uyJGjVqYMqUKYiNjUVSUhIAoHLlyvDz80PdunVRrVo1jB07Fh06dMCOHTuKjOl///sfNDU1oaqqiqVLlyIyMhIGBgZFtvP09IS6ujoqV64MbW1trF+/XqZOlSpVoKKiAicnJ4wePRpDhgwBADx//hzZ2dkwMjKSqm9kZISnT5/K9DNo0CCEhIRAEATs2rULVlZWqFu3bpExfiojIwOenp5YuHAhqlatWmA9dXV16OjoICEhodhjEBOuRERERERERFTOvX//XmqLgI89fPgQPXr0wLBhw8Rk2ac+3QfTxMREZh/OwsTGxqJJkyZSe602a9YMqamp+Pfff/MdR1FREfr6+rC3txfL8pJ3hY2trq4OKyurfGNNTk7GkydP0KZNm3zbjhgxApqamuKrJFJTU+Hn5wdbW1tUrFgRmpqaiI2NlVnh+vFc8+ZV2Fyzs7MRHBwMe3t76OnpQVNTE0ePHhX7DQ0NlYr946/2t2rVClevXkVMTAw6dOiAXr16if127NhRbGNnZycV49KlS3H58mXs378fcXFxmDhxosx8o6OjcenSJaxevRrLli2Ta5uD/Li5uSE1NRWnT5/Gxo0bS7y6ddq0abC1tUX//v2LrKumplboamkqWMk2eiAiIiL6SHxYPBShWGS9arILQoiIiIjKnIGBAV69eiVT/v79e3h4eMDOzg7Lli0rsH2FChWk3kskEnFlat5XtvO++g7krjItifzG+bgsL2H78apYefrIi01NTa3Q8WfNmlXgnp/y8vPzQ2RkJBYtWoTq1atDTU0NPXr0QGZmZoFx5s2rsLkuXLgQy5cvx7Jly2Bvbw8NDQ34+vqK/Xbp0kXqoLPKlSuL/62hoYHq1aujevXqaNy4MaytrbFhwwZMmzYN69evF/cx/fTe5e3ta2NjAz09PTRv3hwzZsyAiYmJWMfS0hJAbrI4KSkJgYGB8PT0hIGBARQVFcUVunmSkpJgbGwsc9+UlJQwYMAABAQE4Pz589i7d2+h97kgJ06cwI0bN7Br1y4A//dcGhgYYPr06QgKChLrvnz5EpUqVSrROOUdE65EREREREREVK7Vq1cPW7dulSkfMmQIXr58iaNHj5b4cKK8hFViYiLq1asHAFIHaAGAra0tdu/eDUEQxETi2bNnoaWlJbNf7JekpaUFCwsLHD9+HK1atZK5bmhoCENDw88a4+zZs/D29oaHhweA3BWv8fHxn9VnXr/u7u7iys2cnBzcvXsXtWrVApA7Ny0tLbn6ysnJQUZGBgDpxGxRbQCI7YrqV1lZGY6Ojjh+/Di6du0qXj9+/HiBh5QNGjQIixYtQu/evaGrqytXXJ/avXu31EFYFy9exKBBgxAdHS218jkuLg7p6eniM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRMLPNToSwkMDMSIESNgaGiIjh074u3btzh79izGjh1baLsHDx7IJJKtra1l6llbW2PPnj3o3LkzJBIJZsyYUeiKXHlZW1tj165diImJga6uLpYsWYKkpCQx4Zqfd+/eYc6cOejSpQtMTEzw/Plz/Pbbb3j8+DF69uxZYLuIiAgkJSWhQYMG0NTUxK1btzB58mQ0a9YMFhYWAIDffvsNVatWhY2NDYDcfYAXLVqEcePGif1MnDgRXl5ecHJyQsOGDbFs2TK8e/cOPj4++Y5ra2uL58+fQ11dvQR3KNfHSVUgdy/ZvL4rVqwolkdHR6NatWoy9Uk+TLgSERERERERUblmb2+P+vXrY8eOHRg+fDiA3IOssrKy0KFDh3zbbNq0Cd7e3nL1v3HjRgwePBiOjo6oWbMmfv75Z7Rv3168XrlyZURERGDy5MlwcHCAnp4eBg8eLJOY/Rq8vLyQnp6OpUuXws/PDwYGBujRo0eR7Qrav/RTS5YswaBBg9C0aVMYGBhgypQpePPmzWfH7e/vj/v378PFxQXq6uoYNmwYunbtipSUlALbKCoq4u+//8bmzZvx/Plz6Ovro0GDBoiOjpbZr/VjampqWLduHSZMmICMjAyYmZmhW7dumDp1qlgnJycH06ZNw4MHD6CkpAQrKyssWLBAfL4AoHfv3nj27BlmzpyJp0+fom7dujhy5IjMQVof09fXL/Q+eHt7Iz4+HlFRUYXWK8q2bdswdOjQz+qjPJMIH28iUk69efMGOjo6SElJgba2dlmHUy5kZWUhIiICrq6uMnug5OveseINYN2+6DpEXxifcyoP8p7zWq9rybmH6z/FG4DPOX0D+PucygM+51/ff/Hvoenp6Xjw4AEsLS0LPIDqW3bo0CFMnjwZN2/e/OqrSolKi7OzM1q1aoXAwMAS93Hr1i20bt0ad+/ehY6OTukF950rzu84rnAlIiIiIiIionLPzc0N9+7dw+PHj2FmZlbW4RAVW0pKCuLi4nDo0KHP6icxMRG///47k62fgQlXIiIiIiIiIiIAvr6+ZR0CUYnp6Ojg33///ex+2rZtWwrRlG/fzBr5+fPnQyKRSP1yS09Px+jRo6Gvrw9NTU10794dSUlJUu0ePnwINzc3qKurw9DQEJMnT8aHDx++cvRERERERERERERE30jC9eLFi1izZg3q1KkjVT5hwgQcPHgQO3fuxKlTp/DkyRN069ZNvJ6dnQ03NzdkZmYiJiYGmzdvRkhICGbOnPm1p0BERERERERERERU9lsKpKamol+/fli3bh1mz54tlqekpGDDhg0ICwtD69atAeSeAGhra4s///wTjRs3xrFjx3D79m388ccfMDIyQt26dREcHIwpU6YgMDAQysrKZTUtIiJRfFi8nIcJfYVgiIiIiArBP7cQERF9vjJPuI4ePRpubm5o27atVML1r7/+QlZWltS+ETY2NqhatSrOnTuHxo0b49y5c7C3t4eRkZFYx8XFBSNHjsStW7dQr169fMfMyMhARkaG+P7NmzcAck/mzMrKKu0pUj7y7rPc9ztbKO4AxYyIqPTlPd/ZyJavPp9z+g7xOafygH9uofKAv8+/Pv7dk4jov6tME67h4eG4fPkyLl68KHPt6dOnUFZWRsWKFaXKjYyM8PTpU7HOx8nWvOt51woyb948BAUFyZQfO3YM6urqxZ0GfYbIyMgv0/GdiC/TL1EJ3Kl4R656t+Wr9lHHfM7p28HnnMoD/rmFygP+Pv960tLSyjoEIiL6Qsos4fro0SOMHz8ekZGRUFVV/apjT5s2DRMnThTfv3nzBmZmZmjfvj20tbW/aizlVVZWFiIjI9GuXTtUqFCh6AZxJ4o3gFXrkgVGVIrynvOar2vK9dU8i0b3izcAn3P6BvA5p/KAf26h8oC/z7++vG9aEhHRf0+ZJVz/+usvJCcno379+mJZdnY2Tp8+jV9//RVHjx5FZmYmXr9+LbXKNSkpCcbGxgAAY2NjXLhwQarfpKQk8VpBVFRUoKKiIlNeoUIF+f4QTaXm8c7Hcu4RJSlex/wc6Rui+P//rygVFPmc0/eLzzmVB3L/WZHPOX3H+Pv86+HfPYmI/rsUymrgNm3a4MaNG7h69ar4cnJyQr9+/cT/rlChAo4fPy62uXPnDh4+fIgmTZoAAJo0aYIbN24gOTlZrBMZGQltbW3UqlXrq8+JiIiIiIiIiL5PL168gKGhIeLj44vVrmXLlvD19f0iMX2uqKgoSCQSvH79usxiCAkJkdkukr5tR44cQd26dZGTk1PWoXy3yizhqqWlhdq1a0u9NDQ0oK+vj9q1a0NHRweDBw/GxIkTcfLkSfz111/w8fFBkyZN0LhxYwBA+/btUatWLQwYMADXrl3D0aNH4e/vj9GjR+e7gpWIiIiIiIiIysDfkq/3KqE5c+bA3d0dFhYWAHITqRKJpMDXqVOnSunmfDlNmzZFYmIidHR0vug4EokE+/bt+6JjfCmBgYGwsbGBhoYGdHV10bZtW5w/f77QNi9evECHDh1gamoKFRUVmJmZYcyYMVJbhZw5cwbNmjWDvr4+1NTUYGNjg6VLl8r09dtvv8HCwgKqqqpo1KiRzDe5LSwsIJFIEB4eLtPWzs4OEokEISEhxZrzxYsX0aZNG1SsWBG6urpwcXHBtWvXxOsdOnRAhQoVEBoaWqx+6f+UWcJVHkuXLkWnTp3QvXt3tGjRAsbGxtizZ494XVFREf/73/+gqKiIJk2aoH///hg4cCBmzZpVhlETERERERER0fckLS0NGzZswODBg8WyPXv2IDExUeqVkJCA2rVrw8nJCY0aNSrDiOWjrKwMY2NjSCQlT0T/19WoUQO//vorbty4gTNnzsDCwgLt27fHs2fPCmyjoKAAd3d3HDhwAHfv3kVISAj++OMPjBgxQqyjoaGBMWPG4PTp04iNjYW/vz/8/f2xdu1asc727dsxceJEBAQE4PLly3BwcICLi4vUN7kBwMzMDJs2bZIq+/PPP/H06VNoaGgUa76pqano0KEDqlativPnz+PMmTPQ0tKCi4sLsrKyxHre3t5YsWJFsfqm//NNJVyjoqKwbNky8b2qqip+++03vHz5Eu/evcOePXtk9mY1NzdHREQE0tLS8OzZMyxatAhKSmW2NS0RERERERERfWciIiKgoqIifqMWAPT09GBsbCz1Cg4OxvPnz7F3716pA8BzcnLw448/im0CAwPFa/Hx8ZBIJLh69apY9vr1a0gkEkRFRYllp06dQsOGDaGiogITExNMnToVHz58EK+3bNkSY8eOha+vL3R1dWFkZIR169bh3bt38PHxgZaWFqpXr47Dhw+LbT7dUiDv6/1Hjx6Fra0tNDU10aFDByQmJkrdj40bN8LOzk6MZcyYMZ95h/9PXFwc3N3dYWRkBE1NTTRo0AB//PGHVB0LCwvMnj0bAwcOhKamJszNzXHgwAE8e/YM7u7u0NTURJ06dXDp0iWxzYsXL+Dp6YnKlStDXV0d9vb22LZtW5Hx9O3bF23btkW1atVgZ2eHJUuW4M2bN7h+/XqBbXR1dTFy5Eg4OTnB3Nwcbdq0wahRoxAdHS3WqVevHjw9PWFnZwcLCwv0798fLi4uUnWWLFmCoUOHwsfHB7Vq1cLq1auhrq6OjRs3So3Xr18/nDp1Co8ePRLLNm7ciH79+hU7B/b333/j5cuXmDVrFmrWrAk7OzsEBAQgKSkJCQkJYr3OnTvj0qVLiIuLK1b/lOubSrgSEREREX2r4sPicX/z/SJfRET0/YmOjoajo2OhdVauXInff/8du3fvRpUqVaSubd68GRoaGjh//jx+/vlnzJo1C5GRkXKP//jxY7i6uqJBgwa4du0aVq1ahQ0bNmD27Nky4xgYGODChQsYO3YsRo4ciZ49e6Jp06a4fPky2rdvjwEDBiAtLa3AsdLS0rBo0SJs2bIFp0+fxsOHD+Hn5ydeX7VqFUaPHo1hw4bhxo0bOHDgAKpXry73XIqSmpoKV1dXHD9+HFeuXEGHDh3QuXNnPHz4UKre0qVL0axZM1y5cgVubm4YMGAABg4ciP79++Py5cuwsrLCwIEDIQgCACA9PR2Ojo44dOgQbt68iWHDhmHAgAEyX9EvTGZmJtauXQsdHR04ODjI3e7JkyfYs2cPnJ2dC6xz5coVxMTEiHUyMzPx119/oW3btmIdBQUFtG3bFufOnZNqa2RkBBcXF2zevBlA7me4fft2DBo0SO4Y89SsWRP6+vrYsGEDMjMz8f79e2zYsAG2trbidhoAULVqVRgZGUkliEl+TLgSERERERERUbmWkJAAU1PTAq+fPn0avr6++O2339C0aVOZ63Xq1EFAQACsra0xcOBAODk5SR0CXpSVK1fCzMwMv/76K2xsbNC1a1cEBQVh8eLFUgcXOTg4wN/fH9bW1pg2bRpUVVVhYGCAoUOHwtraGjNnzsSLFy8KXZ2ZlZWF1atXw8nJCfXr18eYMWOkYp09ezYmTZqE8ePHo0aNGmjQoEGpHgrm4OCA4cOHo3bt2rC2tkZwcDCsrKxw4MABqXqurq4YPny4OK83b96gQYMG6NmzJ2rUqIEpU6YgNjYWSUlJAIDKlSvDz88PdevWRbVq1TB27Fh06NABO3bsKDKm//3vf9DU1ISqqiqWLl2KyMhIGBgYFNnO09MT6urqqFy5MrS1tbF+/XqZOlWqVIGKigqcnJwwevRoDBkyBADw/PlzZGdnw8jISKq+kZERnj59KtPPoEGDEBISAkEQsGvXLlhZWaFu3bpFxvgpLS0tREVFYevWrVBTU4OmpiaOHDmCw4cPy6yWNTU1lVr1SvJjwpWIiIiIiIiIyrX3799LbRHwsYcPH6JHjx4YNmyYmCz7VJ06daTem5iYyOzDWZjY2Fg0adJEaq/VZs2aITU1Ff/++2++4ygqKkJfXx/29vZiWV7yrrCx1dXVYWVllW+sycnJePLkCdq0aZNv2xEjRkBTU1N8lURqair8/Pxga2uLihUrQlNTE7GxsTIrXD+ea968CptrdnY2goODYW9vDz09PWhqauLo0aNiv6GhoVKxf7xys1WrVrh69SpiYmLQoUMH9OrVS+y3Y8eOYhs7OzupGJcuXYrLly9j//79iIuLw8SJE2XmGx0djUuXLmH16tVYtmyZXNsc5MfNzQ2pqak4ffo0Nm7cWKLVrUDusz548GA0a9YMf/75J86ePYvatWvDzc0N79+/l6qrpqZW6GppKhg3OyUiIiIiIiKics3AwACvXr2SKX///j08PDxgZ2cndebMpypUqCD1XiKRiCtTFRRy17rlffUdgNThRMWR3zgfl+UlbD9eFStPH3mxqampFTr+rFmzpLYfKAk/Pz9ERkZi0aJFqF69OtTU1NCjRw9kZmYWGGfevAqb68KFC7F8+XIsW7YM9vb20NDQgK+vr9hvly5dpA46q1y5svjfGhoaqF69OqpXr47GjRvD2toaGzZswLRp07B+/XoxEfnpvcvb29fGxgZ6enpo3rw5ZsyYARMTE7GOpaUlgNxkcVJSEgIDA+Hp6QkDAwMoKiqKK3TzJCUlyZxfBABKSkoYMGAAAgICcP78eezdu7fQ+1yQsLAwxMfH49y5c+KzGRYWBl1dXezfvx99+vQR6758+RKVKlUq0TjlHROuRERERERERFSu1atXD1u3bpUpHzJkCF6+fImjR4+W+IDuvIRVYmIi6tWrBwBSB2gBgK2tLXbv3g1BEMRE4tmzZ6GlpSWzX+yXpKWlBQsLCxw/fhytWrWSuW5oaAhDQ8PPGuPs2bPw9vaGh4cHgNwVr/Hx8Z/VZ16/7u7u6N+/P4DcROzdu3dRq1YtALlz09LSkquvnJwcZGRkAJBOzBbVBoDYrqh+lZWV4ejoiOPHj6Nr167i9ePHjxd4SNmgQYOwaNEi9O7dG7q6unLF9am0tDQoKChIrabOe/9xoj49PR1xcXHiM0vFw4QrEREREREREZVrLi4umDZtGl69eiUmshYuXIidO3fi4MGD+PDhg8y+mjo6OkWuCAVyV402btwY8+fPh6WlJZKTk+Hv7y9VZ9SoUVi2bBnGjh2LMWPG4M6dOwgICMDEiRPFVYhfS2BgIEaMGAFDQ0N07NgRb9++xdmzZzF27NhC2z148EAmkWxtbS1Tz9raGnv27EHnzp0hkUgwY8aMQlfkysva2hq7du1CTEwMdHV1sWTJEiQlJYkJ1/y8e/cOc+bMQZcuXWBiYoLnz5/jt99+w+PHj9GzZ88C20VERCApKQkNGjSApqYmbt26hcmTJ6NZs2biwVO//fYbqlatChsbGwC5+wAvWrQI48aNE/uZOHEivLy84OTkhIYNG2LZsmV49+4dfHx88h3X1tYWz58/h7q6egnuUK527dph8uTJGD16NMaOHYucnBzMnz8fSkpKUkn2P//8EyoqKmjSpEmJxyrPmHAlIiIiIiIionLN3t4e9evXx44dOzB8+HAAuQdZZWVloUOHDvm22bRpE7y9veXqf+PGjRg8eDAcHR1Rs2ZN/Pzzz2jfvr14vXLlyoiIiMDkyZPh4OAAPT09DB48WCYx+zV4eXkhPT0dS5cuhZ+fHwwMDNCjR48i2xW0f+mnlixZgkGDBqFp06YwMDDAlClT8ObNm8+O29/fH/fv34eLiwvU1dUxbNgwdO3aFSkpKQW2UVRUxN9//43Nmzfj+fPn0NfXR4MGDRAdHS2zX+vH1NTUsG7dOkyYMAEZGRkwMzNDt27dMHXqVLFOTk4Opk2bhgcPHkBJSQlWVlZYsGCB+HwBQO/evfHs2TPMnDkTT58+Rd26dXHkyBGZg7Q+pq+vX+h98Pb2Rnx8PKKiovK9bmNjg4MHDyIoKAhNmjSBgoIC6tWrhyNHjkhthbBt2zb069fvs5K75ZlE+HgTkXLqzZs30NHRQUpKCrS1tcs6nHIhKysLERERqPW6FhShWGT9ak3/Kd4A1u2LrkP0hfE5p/KAzzmVB3zOqTzgc/71/Rf/Hpqeno4HDx7A0tKywAOovmWHDh3C5MmTcfPmza++qpSotDg7O6NVq1YIDAwscR/Pnz9HzZo1cenSJXEPWire7ziucCUiIiIiIiKics/NzQ337t3D48ePYWZmVtbhEBVbSkoK4uLicOjQoc/qJz4+HitXrmSy9TMw4UpEREREREREBMDX17esQyAqMR0dHfz777+f3Y+TkxOcnJxKIaLyi2vkiYiIiIiIiIiIiEoJE65EREREREREREREpYQJVyIiIiIiIiIiIqJSwoQrERERERERERERUSlhwpWIiIiIiIiIiIiolDDhSkRERERERERERFRKmHAlIiIiIiIiIiIiKiVMuBIRERERERFRuffixQsYGhoiPj6+rEORW1RUFCQSCV6/fg0ACAkJQcWKFcs0ppKKj4+HRCLB1atXyzoUKqcaN26M3bt3l0pfSqXSCxERERERERFRAe5vvv/VxqrmVa1E7ebMmQN3d3dYWFiUbkCfiIqKQqtWrfDq1avvNjlaUi1btkTdunWxbNmysg6l2Pbs2YO5c+fin3/+QVZWFqytrTFp0iQMGDCgwDaJiYmYNGkSLl26hH/++Qfjxo2Ta+7Z2dkIDAzE1q1b8fTpU5iamsLb2xv+/v6QSCQAcu/lqVOnxDaGhoZo0aIFFi1aBHNz8wL7Lu3nLzAwEPv27ftqiXILCwv4+vrC19e3xH2Eh4fD09MT7u7u2Ldvn1ju7++PCRMmwMPDAwoKn7dGlStciYiIiIiIiKhcS0tLw4YNGzB48OCyDoW+UXp6epg+fTrOnTuH69evw8fHBz4+Pjh69GiBbTIyMlCpUiX4+/vDwcFB7rEWLFiAVatW4ddff0VsbCwWLFiAn3/+Gb/88otUvaFDhyIxMRFPnjzB/v378ejRI/Tv37/Ec/ySsrKyyjoEALkrqf38/NC8eXOZax07dsTbt29x+PDhzx6HCVciIiIiIiIiKtciIiKgoqKCxo0bi2V5X9c/evQo6tWrBzU1NbRu3RrJyck4fPgwbG1toa2tjb59+yItLU1sl5OTg3nz5sHS0hJqampwcHDArl27AOQme1q1agUA0NXVhUQigbe3NwDgyJEj+OGHH1CxYkXo6+ujU6dOiIuL+6x5xcXFwd3dHUZGRtDU1ESDBg3wxx9/SNWxsLDA7NmzMXDgQGhqasLc3BwHDhzAs2fP4O7uDk1NTdSpUweXLl0S27x48QKenp6oXLky1NXVYW9vj23btn1WrJ/Kzs7G4MGDxftYs2ZNLF++XKqOt7c3unbtirlz58LIyAgVK1bErFmz8OHDB0yePBl6enqoUqUKNm3aJNVuypQpqFGjBtTV1VGtWjXMmDGjyIRgy5Yt4eHhAVtbW1hZWWH8+PGoU6cOzpw5U2AbCwsLLF++HAMHDoSOjo7cc4+JiYG7uzvc3NxgYWGBHj16oH379rhw4YJUPXV1dRgbG8PExASNGzfGmDFjcPnyZbnHAf5vG4qjR4/C1tYWmpqa6NChAxITE8U6UVFRaNiwITQ0NFCxYkU0a9YMCQkJCAkJQVBQEK5duwaJRAKJRIKQkBAAgEQiwapVq9ClSxdoaGhgzpw5+W55sW/fPnHVbp6DBw+iQYMGUFVVhYGBATw8PADkfgYJCQmYMGGCOF5xZGdno1+/fggKCkK1arIr4RUVFeHq6orw8PBi9ZsfJlyJiIiIiIiIqFyLjo6Go6NjvtcCAwPx66+/IiYmBo8ePUKvXr2wbNkyhIWF4dChQzh27JjUysN58+bh999/x+rVq3Hr1i1MmDAB/fv3x6lTp2BmZibuEXnnzh0kJiaKScR3795h4sSJuHTpEo4fPw4FBQV4eHggJyenxPNKTU2Fq6srjh8/jitXrqBDhw7o3LkzHj58KFVv6dKlaNasGa5cuQI3NzcMGDAAAwcORP/+/XH58mVYWVlh4MCBEAQBAJCeng5HR0ccOnQIN2/exLBhwzBgwACZhODnyMnJQZUqVbBz507cvn0bM2fOxE8//YQdO3ZI1Ttx4gSePHmC06dPY8mSJQgICECnTp2gq6uL8+fPY8SIERg+fDj+/fdfsY2WlhZCQkJw+/ZtLF++HOvWrcPSpUvljk0QBBw/fhx37txBixYtSm3OeZo2bYrjx4/j7t27AIBr167hzJkz6NixY4FtXr58iR07dqBRo0bFHi8tLQ2LFi3Cli1bcPr0aTx8+BB+fn4AgA8fPqBr165wdnbG9evXce7cOQwbNgwSiQS9e/fGpEmTYGdnh8TERCQmJqJ3795iv4GBgfDw8MCNGzcwaNAguWI5dOgQPDw84OrqiitXruD48eNo2LAhgNxtHapUqYJZs2aJ4xXHrFmzYGhoWOhK9oYNGyI6OrpY/eaHe7gSERERERERUbmWkJAAU1PTfK/Nnj0bzZo1AwAMHjwY06ZNQ1xcnLhCrkePHjh58iSmTJmCjIwMzJ07F3/88QeaNGkCAKhWrRrOnDmDNWvWwNnZGXp6egBy99z8eLVf9+7dpcbduHEjKlWqhNu3b6N27dolmpeDg4PUV9mDg4Oxd+9eHDhwAGPGjBHLXV1dMXz4cADAzJkzsWrVKjRo0AA9e/YEkLsitEmTJkhKSoKxsTEqV64sJuQAYOzYsTh69Ch27NghJsc+V4UKFRAUFCS+t7S0xLlz57Bjxw706tVLLNfT08OKFSugoKCAmjVr4ueff0ZaWhp++uknAMC0adMwf/58nDlzBn369AGQu1dnHgsLC/j5+SE8PBw//vhjoTGlpKSgcuXKyMjIgKKiIlauXIl27dqVynw/NnXqVLx58wY2NjZQVFREdnY25syZg379+knVW7lyJdavXw9BEJCWloYaNWoUusVBQbKysrB69WpYWVkBAMaMGYNZs2YBAN68eYOUlBR06tRJvG5rayu21dTUhJKSEoyNjWX67du3L3x8fIoVy5w5c9CnTx+pzz7vGdbT04OioiK0tLTyHa8wZ86cwYYNG4rca9bU1BSPHj1CTk7OZ+3jyhWuRERERERERFSuvX//Hqqqqvleq1OnjvjfRkZG4tfQPy5LTk4GAPzzzz9IS0tDu3btoKmpKb5+//33IrcHuHfvHjw9PVGtWjVoa2uLh3flrUbt2LGj2J+dnZ1c80pNTYWfnx9sbW1RsWJFaGpqIjY2VmaF66dzBAB7e3uZsrx5ZmdnIzg4GPb29tDT04OmpiaOHj0q9hsaGio1/5KuGPztt9/g6OiISpUqQVNTE2vXrpWJ3c7OTioxZmRkJBW7oqIi9PX1xdgBYPv27WjWrBmMjY2hqakJf39/sd+HDx9KxT537lyxnZaWFq5evYqLFy9izpw5mDhxIqKioko0NyB3ZfXHY4WGhgIAduzYgdDQUISFheHy5cvYvHkzFi1ahM2bN0u179evH65evSqugK1evTrat2+Pt2/fivcmr+/CVseqq6uLyVQAMDExEe+Xnp4evL294eLigs6dO2P58uVyryx1cnIq1v0AgKtXr6JNmzbFbleYt2/fYsCAAVi3bh0MDAwKraumpoacnBxkZGR81phc4UpERERERERE5ZqBgQFevXqV77UKFSqI/y2RSKTe55Xlfe0/NTUVQO7XoitXrixVT0VFpdAYOnfuDHNzc6xbtw6mpqbIyclB7dq1kZmZCQBYv3493r9/LxNTYfz8/BAZGYlFixahevXqUFNTQ48ePcQ+C5pjQWV581y4cCGWL1+OZcuWwd7eHhoaGvD19RX77dKli9RX2z+9F/IIDw+Hn58fFi9ejCZNmkBLSwsLFy7E+fPnC4w9L9bCPqNz586J+3i6uLhAR0cH4eHhWLx4MYDcFY4fr4LMW5EMAAoKCqhevToAoG7duoiNjcW8efPQsmXLYs8PyE1IfjxWXmJ78uTJmDp1qrgi197eHgkJCZg3bx68vLzE+jo6OmI81atXx4YNG2BiYoLt27djyJAhiIiIEPemVVNTKzCO/O5X3vYRALBp0yaMGzcOR44cwfbt2+Hv74/IyEipPY/zo6GhIfVeQUFBql9A9jCtwuIsqbi4OMTHx6Nz585iWd7zoKSkhDt37ogJ55cvX0JDQ+Oz42DClYiIiIiIiIjKtXr16mHr1q2f3U+tWrWgoqKChw8fwtnZOd86ysrKAHJXieZ58eIF7ty5g3Xr1omnp396GFNJkpZnz56Ft7e3eOhQamoq4uPji91Pfv26u7ujf//+AHKTV3fv3kWtWrUA5K4E1dLS+uwxmjZtilGjRolln3uIGJB7IJW5uTmmT58uliUkJIj/raSkJCYxi/K5KyHV1NTyHSstLU3m6+yKiopF7uerqKgIAGJi3tzcvMSxfapevXqoV68epk2bhiZNmiAsLAyNGzeGsrKy1LNcmEqVKuHt27d49+6dmIz99Cv+derUwfHjxwvciqA44+WxsbHBjRs3pMr8/f3x9u1bLF++HGZmZmL5zZs3Ua9evWL1nx8mXImIiIiIiIioXHNxccG0adPw6tUr6OrqlrgfLS0t+Pn5YcKECcjJycEPP/yAlJQUnD17Ftra2vDy8oK5uTkkEgn+97//wdXVFWpqatDV1YW+vj7Wrl0LExMTPHz4EFOnTv3seVlbW2PPnj3o3LkzJBIJZsyY8VmHcH3c765duxATEwNdXV0sWbIESUlJYsK1MM+ePZNJspmYmOQ7xu+//46jR4/C0tISW7ZswcWLF2FpafnZsT98+BDh4eFo0KABDh06hL179xbZbt68eXBycoKVlRUyMjIQERGBLVu2YNWqVWKdadOm4fHjx/j999/Fsry5pqaminNXVlYu9F517twZc+bMQdWqVWFnZ4crV65gyZIlMgdPpaWl4enTpwCApKQkBAcHQ1VVFe3bty/OLSnUgwcPsHbtWnTp0gWmpqa4c+cO7t27h4EDBwLI3QP3wYMHuHr1KqpUqQItLa0CV3M3atQI6urq+OmnnzBu3DicP38eISEhUnUCAgLQpk0bWFlZoU+fPvjw4QMiIiIwZcoUcbzTp0+jT58+UFFRKXKLAABQVVWV2Qc5b//kT8ujo6NL5f4x4UpEREREREREX1Q1r2pFVypD9vb2qF+/Pnbs2CEeHlVSwcHBqFSpEubNm4f79++jYsWKqF+/vniIU+XKlREUFISpU6fCx8cHAwcOREhICMLDwzFu3DjUrl0bNWvWxIoVK0r8VfU8eUm6pk2bwsDAAFOmTMGbN28+q08gd3Xg/fv34eLiAnV1dQwbNgxdu3ZFSkpKkW3DwsIQFhYmVRYcHCyuls0zfPhwXLlyBb1794ZEIoGnpydGjRqFw4cPf1bsXbp0wYQJEzBmzBhkZGTAzc0NM2bMQGBgYKHt3r17h1GjRuHff/+FmpoabGxssHXrVvTu3Vusk5iYKLPH7MerJf/66y+EhYXB3Ny80JXGv/zyC2bMmIFRo0YhOTkZpqamGD58OGbOnClVb926dVi3bh0AQFdXF3Xq1EFERARq1qwp590omrq6Ov7++29s3rwZL168gImJCUaPHi3+nHTv3h179uxBq1at8Pr1a2zatAne3t759qWnp4etW7di8uTJWLduHdq0aYPAwEAMGzZMrNOyZUvs3LkTwcHBmD9/PrS1tdGiRQvx+qxZszB8+HAx8Z23RYFEIil0bHk8fvwYMTExpbLaXSJ8unlCOfTmzRvo6OggJSUF2traZR1OuZCVlYWIiAjUel0LilAssn61pv8UbwDr0vvXHKKS4nNO5QGfcyoP+JxTecDn/Ov7L/49ND09HQ8ePIClpWWBB1B9yw4dOoTJkyfj5s2bn3U6ORF9XQ8ePECNGjVw+/ZtWFtbl7ifKVOm4NWrV1i7dm2+14vzO44rXImIiIiIiIio3HNzc8O9e/fw+PFjqT0diejbFhERgWHDhn1WshUADA0NMXHixFKJiQlXIiIiIiIiIiIAvr6+ZR0CERXT6NGjS6WfSZMmlUo/AMA18kRERERERERERESlhAlXIiIiIiIiIiIiolLChCsRERERERERERFRKWHClYiIiIiIiIiIiKiUMOFKREREREREREREVEqYcCUiIiIiIiIiIiIqJUy4EhEREREREREREZUSJlyJiIiIiIiIqNx78eIFDA0NER8fX9ahyC0qKgoSiQSvX78GAISEhKBixYplGlNJxcfHQyKR4OrVq2UdCpVTffr0weLFi0ulLyZciYiIiIiIiOjLunfs671KaM6cOXB3d4eFhUXpzTsfnyZJy5OWLVvC19e3rMMokT179sDJyQkVK1aEhoYG6tatiy1bthTaJjExEX379kWNGjWgoKAg99yzs7MxY8YMWFpaQk1NDVZWVggODoYgCGKdli1bQiKRiC8jIyP07NkTCQkJhfZd2s9fYGAg6tatWyp9ycPCwgLLli0rUdtly5ahZs2aUFNTg5mZGSZMmID09HTxur+/P+bMmYOUlJTPjpMJVyIiIiIiIiIq19LS0rBhwwYMHjy4rEOhb5Senh6mT5+Oc+fO4fr16/Dx8YGPjw+OHj1aYJuMjAxUqlQJ/v7+cHBwkHusBQsWYNWqVfj1118RGxuLBQsW4Oeff8b/a+/O46qq9v+Pvw+ICDKJCooDOI+opGZIpeaskWM5T5laag6EKeWAs5VjmXY1EydCG+yaY0bOes05Ta+ZE5Y43BwQUUQ4vz/8eb4eQQTZiMDr2cPHw7P32mt/9mYp8XbttT/77DOrdn369FF0dLTOnz+vf//73zp37py6du36xNeYmRISErL0/OHh4RoxYoTGjBmjY8eOacGCBVq+fLk++OADS5uqVauqTJkyWrp0aYbPR+AKAAAAAABytbVr18re3l4vvPCCZdv9mYAbNmyQn5+fHBwc9Morr+jSpUtat26dKlWqJBcXF3Xu3FlxcXGW45KSkjR58mTL7MTq1avr22+/lXTvsfkGDRpIkgoUKCCTyaSePXtKktavX68XX3xRbm5uKliwoF599VWdPHkyQ9d18uRJtWrVSp6ennJyclLt2rX1888/W7Xx8fHRhAkT1L17dzk5Ocnb21urVq3S5cuX1apVKzk5OalatWrau3ev5Zh//vlHnTp1UrFixeTo6ChfX199/fXXGar1YYmJierdu7flPlaoUEGzZs2yatOzZ0+1bt1akyZNkqenp9zc3DRu3DjdvXtXw4YNk7u7u4oXL66FCxdaHTd8+HCVL19ejo6OKl26tEaNGvXYQLB+/fpq06aNKlWqpDJlymjw4MGqVq2atm/f/shjfHx8NGvWLHXv3l2urq5pvvadO3eqVatWatmypXx8fNS+fXs1adJEv/76q1U7R0dHFSlSREWLFtULL7yggQMHav/+/Wk+j/R/y1Bs2LBBlSpVkpOTk5o1a6bo6GhLm82bN+v5559X/vz55ebmpoCAAJ09e1ZhYWEaO3asDh06ZJlpGxYWJkkymUyaO3euXnvtNeXPn18TJ05MccmLH374QSaTyWrbjz/+qNq1aytfvnwqVKiQ2rRpI+ne1+Ds2bMaOnSo5XxptXPnTgUEBKhz587y8fFRkyZN1KlTp2T3NDAwUBEREem4gykjcAUAAAAAALnatm3bVLNmzRT3hYaGavbs2dq5c6fOnTunN954QzNnzlR4eLjWrFmjn376yWrm4eTJk7V48WJ98cUX+v333zV06FB17dpVW7ZsUYkSJfTdd99Jko4fP67o6GhLiHjz5k0FBQVp7969ioyMlI2Njdq0aaOkpKQnvq7Y2Fi1aNFCkZGROnDggJo1a6bAwEBFRUVZtZsxY4YCAgJ04MABtWzZUt26dVP37t3VtWtX7d+/X2XKlFH37t0tj7Tfvn1bNWvW1Jo1a3TkyBH17dtX3bp1SxZeZURSUpKKFy+ub775RkePHtXo0aP1wQcfaMWKFVbtfvnlF50/f15bt27V9OnTNWbMGL366qsqUKCAdu/erbffflv9+vXTX3/9ZTnG2dlZYWFhOnr0qGbNmqX58+drxowZaa7NbDYrMjJSx48f18svv2zYNd9Xt25dRUZG6o8//pAkHTp0SNu3b1fz5s0fecyVK1e0YsUK1alTJ93ni4uL09SpU7VkyRJt3bpVUVFRCg4OliTdvXtXrVu3Vr169fTbb79p165d6tu3r0wmkzp06KD33ntPVapUUXR0tKKjo9WhQwdLv6GhoWrTpo0OHz6sN998M021rFmzRm3atFGLFi104MABRUZG6vnnn5d0b1mH4sWLa9y4cZbzpVXdunW1b98+yxg9deqU1q5dqxYtWli1e/755/Xrr78qPj4+zX2nJE+GjgYAAAAAAMjmzp49Ky8vrxT3TZgwQQEBAZKk3r17KyQkRCdPnlTp0qUlSe3bt9emTZs0fPhwxcfHa9KkSfr555/l7+8vSSpdurS2b9+uf/3rX6pXr57c3d0lSR4eHlaz/dq1a2d13q+++kqFCxfW0aNHVbVq1Se6rurVq1s9yj5+/HitXLlSq1at0sCBAy3bW7RooX79+kmSRo8erblz56p27dp6/fXXJd2bEerv76+LFy+qSJEiKlasmCWQk6R3331XGzZs0IoVKyzhWEbZ2dlp7Nixls+lSpXSrl27tGLFCr3xxhuW7e7u7vr0009lY2OjChUq6OOPP1ZcXJzlUfGQkBBNmTJF27dvV8eOHSXdW6vzPh8fHwUHBysiIkLvv/9+qjVdv35dxYoVU3x8vGxtbTVnzhw1btzYkOt90IgRIxQTE6OKFSvK1tZWiYmJmjhxorp06WLVbs6cOfryyy9lNpsVFxen8uXLp7rEwaMkJCToiy++UJkyZSRJAwcO1Lhx4yRJMTExun79ul599VXL/kqVKlmOdXJyUp48eVSkSJFk/Xbu3Fm9evVKVy0TJ05Ux44drb7298ewu7u7bG1t5ezsnOL5UtO5c2f973//04svviiz2ay7d+/q7bfftlpSQJK8vLx0584dXbhwQd7e3uk6x4OY4QoAAAAAAHK1W7duKV++fCnuq1atmuX3np6elsfQH9x26dIlSdKff/6puLg4NW7cWE5OTpZfixcvfuzyACdOnFCnTp1UunRpubi4WF7edX82avPmzS39ValSJU3XFRsbq+DgYFWqVElubm5ycnLSsWPHks1wffgaJcnX1zfZtvvXmZiYqPHjx8vX11fu7u5ycnLShg0bLP0uW7bM6vq3bduWpnof9vnnn6tmzZoqXLiwnJycNG/evGS1V6lSRTY2/xdveXp6WtVua2urggULWmqXpOXLlysgIEBFihSRk5OTRo4caek3KirKqvZJkyZZjnN2dtbBgwe1Z88eTZw4UUFBQdq8efMTXZt0b2b1g+datmyZJGnFihVatmyZwsPDtX//fi1atEhTp07VokWLrI7v0qWLDh48aJkBW7ZsWTVp0kQ3btyw3Jv7fac2O9bR0dESpkpS0aJFLffL3d1dPXv2VNOmTRUYGKhZs2aleWZprVq10nU/JOngwYNq2LBhuo97nM2bN2vSpEmaM2eO9u/fr++//15r1qzR+PHjrdo5ODhIktUyIU+CGa4AAAAAACBXK1SokK5evZriPjs7O8vvTSaT1ef72+4/9h8bGyvp3mPRxYoVs2pnb2+fag2BgYHy9vbW/Pnz5eXlpaSkJFWtWlV37tyRJH355Ze6detWsppSExwcrI0bN2rq1KkqW7asHBwc1L59e0ufj7rGR227f52ffPKJZs2apZkzZ8rX11f58+fXkCFDLP2+9tprVo+2P3wv0iIiIkLBwcGaNm2a/P395ezsrE8++US7d+9+ZO33a03ta7Rr1y516dJFY8eOVdOmTeXq6qqIiAhNmzZN0r0ZjgcPHrQce39GsiTZ2NiobNmykqQaNWro2LFjmjx5surXr5/u65PuBZIPnut+sD1s2DCNGDHCMiPX19dXZ8+e1eTJk9WjRw9Le1dXV0s9ZcuW1YIFC1S0aFEtX75cb731ltauXWtZm/Z+kJiSlO7X/eUjJGnhwoUaNGiQ1q9fr+XLl2vkyJHauHGj1ZrHKcmfP7/VZxsbG6t+peQv00qtzowYNWqUunXrprfeekvSvXt68+ZN9e3bVx9++KEltL9y5YokqXDhwhk6H4ErAAAAAADI1fz8/Ax5M3nlypVlb2+vqKgo1atXL8U2efPmlXRvluh9//zzj44fP6758+frpZdekqRkL2N6ktByx44d6tmzp+WlQ7GxsTpz5ky6+0mp31atWqlr166S7gWxf/zxhypXrizp3kxQZ2fnDJ+jbt266t+/v2VbRl8iJt17eZK3t7c+/PBDy7azZ89afp8nTx5LiPk4SUlJGVrr08HBIcVzxcXFWc3ale7N1H3cer62traSZAnmM/JI/MP8/Pzk5+enkJAQ+fv7Kzw8XC+88ILy5s1rNZZTU7hwYd24cUM3b960hLEPBs7SvdnWkZGRj1yKID3ne9Cj7qkkqxD4yJEjKl68uAoVKpTuczyIwBUAAAAAAORqTZs2VUhIiK5evaoCBQo8cT/Ozs4KDg7W0KFDlZSUpBdffFHXr1/Xjh075OLioh49esjb21smk0mrV69WixYt5ODgoAIFCqhgwYKaN2+eihYtqqioKI0YMSLD11WuXDl9//33CgwMlMlk0qhRozL0Eq4H+/3222+1c+dOFShQQNOnT9fFixctgWtqLl++nCxkK1q0aIrnWLx4sTZs2KBSpUppyZIl2rNnj0qVKpXh2qOiohQREaHatWtrzZo1Wrly5WOPmzx5smrVqqUyZcooPj5ea9eu1ZIlSzR37lxLm5CQEP39999avHixZdv9a42NjbVce968eVO9V4GBgZo4caJKliypKlWq6MCBA5o+fXqyF0/FxcXpwoULkqSLFy9q/Pjxypcvn5o0aZKeW5Kq06dPa968eXrttdfk5eWl48eP68SJE+revbuke2vgnj59WgcPHlTx4sXl7Oz8yNncderUkaOjoz744AMNGjRIu3fvVlhYmFWbMWPGqGHDhipTpow6duyou3fvau3atRo+fLjlfFu3blXHjh1lb2+f5mA0MDBQ06dPl5+fn+rUqaM///xTo0aNUmBgoCV4le4t82DE/SNwBQAAAAAAmauccQFQZvD19dVzzz2nFStWWF4e9aTGjx+vwoULa/LkyTp16pTc3Nz03HPPWV7OU6xYMY0dO1YjRoxQr1691L17d4WFhSkiIkKDBg1S1apVVaFCBX366adP/Kj6ffdDurp166pQoUIaPny4YmJiMtSndO+lU6dOnVLTpk3l6Oiovn37qnXr1rp+/fpjjw0PD1d4eLjVtvHjx1tmy97Xr18/HThwQB06dJDJZFKnTp3Uv39/rVu3LkO1v/baaxo6dKgGDhyo+Ph4tWzZUqNGjVJoaGiqx928eVP9+/fXX3/9JQcHB1WsWFFLly5Vhw4dLG2io6OTrTHr5+dn+f2+ffsUHh4ub2/vVGcaf/bZZxo1apT69++vS5cuycvLS/369dPo0aOt2s2fP1/z58+XJBUoUEDVqlXT2rVrVaFChTTejcdzdHTUf//7Xy1atEj//POPihYtqgEDBlj+nLRr107ff/+9GjRooGvXrmnhwoXq2bNnin25u7tr6dKlGjZsmObPn6+GDRsqNDRUffv2tbSpX7++vvnmG40fP15TpkyRi4uLXn75Zcv+cePGqV+/fpbg+/7sVJPJlOq5R44cKZPJpJEjR+rvv/9W4cKFLcH2fbdv39YPP/yg9evXZ/CuSSbzw4sn5EIxMTFydXXV9evX5eLiktXl5AoJCQlau3atKl+rLFvZPrZ96bp/pu8Ez/g3c+QOjHPkBoxz5AaMc+QGjPOnLyf+HHr79m2dPn1apUqVeuQLqJ5la9as0bBhw3TkyJFkjx4DeHadPn1a5cuX19GjR1WuXLkn7mfu3LlauXKlfvrppxT3p+fvOGa4AgAAAACAXK9ly5Y6ceKE/v77b5UoUSKrywGQRmvXrlXfvn0zFLZK914e9tlnnxlSE4ErAAAAAACApCFDhmR1CQDSacCAAYb089ZbbxnSjyQxRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORCd+7ckY+Pj/bu3WtIf3kM6QUAAAAAAOARmv+276mda121mk903MSJE9WqVSv5+PgYW9BDNm/erAYNGujq1avZNhx9UvXr11eNGjU0c+bMrC4l3b7//ntNmjRJf/75pxISElSuXDm999576tatW6rHzJ07VwcPHlR8fLyqVKmi0NBQNW3a9LHn+uKLL7Rv3z5duXJFBw4cUI0aNaza+Pj46OzZs5IkGxsbeXp6qnnz5po6daoKFCjwyL7DwsI0ZMgQS0ifUT179tS1a9f0ww8/GNLf45hMJq1cuVKtW7d+4j6mTJmikJAQDR482DIW8+bNq+DgYA0fPlyRkZEZrpMZrgAAAAAAIFeLi4vTggUL1Lt376wuBc8od3d3ffjhh9q1a5d+++039erVS7169dKGDRseeczWrVvVuHFjrV27Vvv27VODBg0UGBioAwcOpHqumzdv6sUXX9RHH32Uartx48YpOjpaUVFRWrZsmbZu3apBgwY90fVltoSEhKwuQZK0Z88e/etf/1K1atWS7evSpYu2b9+u33//PcPnIXAFAAAAAAC52tq1a2Vvb68XXnjBsu3+4/obNmyQn5+fHBwc9Morr+jSpUtat26dKlWqJBcXF3Xu3FlxcXGW45KSkjR58mSVKlVKDg4Oql69ur799ltJ9x6bb9CggSSpQIECMplM6tmzpyRp/fr1evHFF+Xm5qaCBQvq1Vdf1cmTJzN0XSdPnlSrVq3k6ekpJycn1a5dWz///LNVGx8fH02YMEHdu3eXk5OTvL29tWrVKl2+fFmtWrWSk5OTqlWrZvWo9T///KNOnTqpWLFicnR0lK+vr77++usM1fqwxMRE9e7d23IfK1SooFmzZlm16dmzp1q3bq1JkybJ09NTbm5uGjdunO7evathw4bJ3d1dxYsX18KFC62OGz58uMqXLy9HR0eVLl1ao0aNemwgWL9+fbVp00aVKlVSmTJlNHjwYFWrVk3bt29/5DEzZ87U+++/r9q1a6tcuXKaNGmSypUrpx9//DHVc3Xr1k2jR49Wo0aNUm3n7OysIkWKqFixYmrQoIF69Oih/fv3p3rMw0JDQ1WjRg0tWbJEPj4+cnV1VceOHXXjxg1Lm2+//Va+vr5ycHBQwYIF1ahRI928eVOhoaFatGiR/v3vf8tkMslkMmnz5s2W5SGWL1+uevXqKV++fFq2bJnlXA/fo4dnlX/11VeqUqWK7O3tVbRoUQ0cOFCSLO3atGkjk8mU7tnosbGx6tKli+bPn5/iLOACBQooICBAERER6eo3JQSuAAAAAAAgV9u2bZtq1kx5KYLQ0FDNnj1bO3fu1Llz5/TGG29o5syZCg8P15o1a/TTTz/ps88+s7SfPHmyFi9erC+++EK///67hg4dqq5du2rLli0qUaKEvvvuO0nS8ePHFR0dbQkRb968qaCgIO3du1eRkZGysbFRmzZtlJSU9MTXFRsbqxYtWigyMlIHDhxQs2bNFBgYqKioKKt2M2bMUEBAgA4cOKCWLVuqW7du6t69u7p27ar9+/erTJky6t69u8xmsyTp9u3bqlmzptasWaMjR46ob9++6tatm3799dcnrvVhSUlJKl68uL755hsdPXpUo0eP1gcffKAVK1ZYtfvll190/vx5bd26VdOnT9eYMWP06quvqkCBAtq9e7fefvtt9evXT3/99ZflGGdnZ4WFheno0aOaNWuW5s+frxkzZqS5NrPZrMjISB0/flwvv/xyuq7pxo0bcnd3T/MxafX333/rxx9/VJ06ddJ97MmTJ/XDDz9o9erVWr16tbZs2aIpU6ZIkqKjo9WpUye9+eabOnbsmDZv3qy2bdvKbDYrODhYb7zxhpo1a6bo6GhFR0erbt26ln5HjBihwYMH69ixY49dRuG+uXPnasCAAerbt68OHz6sVatWqWzZspLuzU6VpIULFyo6OtryOa0GDBigli1bphpkP//889q2bVu6+k0Ja7gCAAAAAIBc7ezZs/Ly8kpx34QJExQQECBJ6t27t0JCQnTy5EmVLl1aktS+fXtt2rRJw4cPV3x8vCZNmqSff/5Z/v7+kqTSpUtr+/bt+te//qV69epZwjYPDw+rNVzbtWtndd6vvvpKhQsX1tGjR1W1atUnuq7q1aurevXqls/jx4/XypUrtWrVKsusQUlq0aKF+vXrJ0kaPXq05s6dq9q1a+v111+XdG9GqL+/vy5evGiZURkcHGw5/t1339WGDRu0YsUKPf/8809U68Ps7Ow0duxYy+dSpUpp165dWrFihd544w3Ldnd3d3366aeysbFRhQoV9PHHHysuLk4ffPCBJCkkJERTpkzR9u3b1bFjR0nSyJEjLcf7+PgoODhYERERev/991Ot6fr16ypWrJji4+Nla2urOXPmqHHjxmm+pqlTpyo2Ntaq/owYPny4Ro4cqcTERN2+fVt16tTR9OnT091PUlKSwsLC5OzsLOneDNvIyEhNnDhR0dHRunv3rtq2bStvb29Jkq+vr+VYBwcHxcfHq0iRIsn6HTJkiNq2bZuuWiZMmKD33ntPgwcPtmyrXbu2JKlw4cKSJDc3txTPl5qIiAjt37//sSGtl5eXZW3cjGCGKwAAAAAAyNVu3bqlfPnypbjvwbUePT09LY+hP7jt0qVLkqQ///xTcXFxaty4sZycnCy/Fi9e/NjlAU6cOKFOnTqpdOnScnFxsTwufX82avPmzS39ValSJU3XFRsbq+DgYFWqVElubm5ycnLSsWPHks1wffgaJetQ7f62+9eZmJio8ePHy9fXV+7u7nJyctKGDRss/S5btszq+p90xuDnn3+umjVrqnDhwnJyctK8efOS1V6lShXZ2PxfvOXp6WlVu62trQoWLGipXZKWL1+ugIAAFSlSRE5OTho5cqSl36ioKKvaJ02aZDnO2dlZBw8e1J49ezRx4kQFBQVp8+bNabqW8PBwjR07VitWrJCHh4ekjN+nYcOG6eDBg/rtt98sL3pq2bKlEhMTJcmq77fffvuR/fj4+FjCVkkqWrSo5X5Vr15dDRs2lK+vr15//XXNnz9fV69eTVN9tWrVStf1XLp0SefPn1fDhg3TddzjnDt3ToMHD9ayZcse+ef8PgcHB6slQp4UM1wBAAAAAECuVqhQoUeGSHZ2dpbfm0wmq8/3t91/7D82NlaStGbNGhUrVsyqnb29fao1BAYGytvbW/Pnz5eXl5eSkpJUtWpV3blzR5L05Zdf6tatW8lqSk1wcLA2btyoqVOnqmzZsnJwcFD79u0tfT7qGh+17f51fvLJJ5o1a5ZmzpwpX19f5c+fX0OGDLH0+9prr1k92v7wvUiLiIgIBQcHa9q0afL395ezs7M++eQT7d69+5G13681ta/Rrl271KVLF40dO1ZNmzaVq6urIiIiNG3aNEn3ZjgePHjQcuyDj//b2NhYHm+vUaOGjh07psmTJ6t+/fqPvZa33npL33zzjdXj7Bm9T4UKFbLUU65cOc2cOVP+/v7atGmTGjVqZHUdLi4uj+wntftla2urjRs3aufOnZblMz788EPt3r1bpUqVSrW+/PnzW322sbGxLEtx34Nr5zo4OKTa35Pat2+fLl26pOeee86yLTExUVu3btXs2bMtM5Yl6cqVK5aZtBlB4AoAAAAAAHI1Pz8/LV26NMP9VK5cWfb29oqKilK9evVSbJM3b15JssxClO69hOr48eOaP3++XnrpJUlK9jKmJwktd+zYoZ49e6pNmzaS7gXCZ86cSXc/KfXbqlUrde3aVdK9IPaPP/5Q5cqVJd2bCfrgjMknPUfdunXVv39/y7aMvkRMknbu3Clvb299+OGHlm0PPkKeJ08eS4j5OElJSYqPj0+1zddff60333xTERERatmypdU+I+7Tg+6HhveD+bRex+OYTCYFBAQoICBAo0ePlre3t1auXKmgoCDlzZvXaiynpnDhwrpw4YLMZrMlxH8wFHZ2dpaPj48iIyMtL5d7mJ2dXZrPd1/Dhg11+PBhq229evVSxYoVNXz4cMt9k6QjR47Iz88vXf2nhMAVAAAAAADkak2bNlVISIiuXr2a4tvL08rZ2VnBwcEaOnSokpKS9OKLL+r69evasWOHXFxc1KNHD3l7e8tkMmn16tVq0aKFHBwcVKBAARUsWFDz5s1T0aJFFRUVpREjRmT4usqVK6fvv/9egYGBMplMGjVqVIZewvVgv99++6127typAgUKaPr06bp48aIlcE3N5cuXrUI26d4j7CmdY/HixdqwYYNKlSqlJUuWaM+ePY+dVZmW2qOiohQREaHatWtrzZo1Wrly5WOPmzx5smrVqqUyZcooPj5ea9eu1ZIlSzR37lxLm5CQEP39999avHixpHvLCPTo0UOzZs1SnTp1dOHCBUn3ZnK6uro+8lxXrlxRVFSUzp8/L+neC9YkqUiRIlZrl964ccMSYJ47d07vv/++ChcubPXiqozavXu3IiMj1aRJE3l4eGj37t26fPmyKlWqJOnecgQbNmzQ8ePHVbBgwVSvq379+rp8+bI+/vhjtW/fXuvXr9e6deusZt+Ghobq7bffloeHh5o3b64bN25ox44devfddy3ni4yMVEBAgOzt7dP059XZ2TnZOsj58+dXwYIFk23ftm2bxo8fn+b78ygErgAAAAAAIFOtq1Yzq0tIla+vr5577jmtWLHC8vKoJzV+/HgVLlxYkydP1qlTp+Tm5qbnnnvO8hKnYsWKaezYsRoxYoR69eql7t27KywsTBERERo0aJCqVq2qChUq6NNPP33so+qPM336dL355puqW7euChUqpOHDhysmJiZDfUr3Xjp16tQpNW3aVI6Ojurbt69at26t69evP/bY8PBwhYeHW20bP368Zbbsff369dOBAwfUoUMHmUwmderUSf3799e6desyVPtrr72moUOHauDAgYqPj1fLli01atQohYaGpnrczZs31b9/f/31119ycHBQxYoVtXTpUnXo0MHSJjo62mqN2Xnz5unu3bsaMGCABgwYYNneo0cPhYWFPfJcq1atUq9evSyf77/sa8yYMVZ1jh49WqNHj5Z0b/Zo7dq19dNPP6lgwYJpuRVp4uLioq1bt2rmzJmKiYmRt7e3pk2bpubNm0uS+vTpo82bN6tWrVqKjY3Vpk2bLOsPP6xSpUqaM2eOJk2apPHjx6tdu3YKDg7WvHnzLG169Oih27dva8aMGQoODlahQoXUvn17y/5p06YpKChI8+fPV7FixXTmzBmdOXNGpUqV0qZNmzL0Z2bXrl26fv261fmelMn88OIJuVBMTIxcXV11/fr1VNe0gHESEhK0du1aVb5WWbayfWz70nX/TN8JyjV5wsoA4zDOkRswzpEbMM6RGzDOn76c+HPo7du3dfr0aZUqVeqxL6Z5Fq1Zs0bDhg3TkSNHrF7CBODZtmnTJrVt21anTp3K0Az1Dh06qHr16pZ/HHlYev6OY4YrAAAAAADI9Vq2bKkTJ07o77//VokSJbK6HABptHbtWn3wwQcZClvv3LkjX19fDR061JCaCFwBAAAAAAAkDRkyJKtLAJBOn3zySYb7yJs3r0aOHGlANfcwRx4AAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAADI9f755x95eHjozJkzWV1Kmm3evFkmk0nXrl2TJIWFhcnNzS1La3pSZ86ckclk0sGDB7O6FORSL7zwgr777jtD+spjSC8AAAAAAACPkDD2vad2Lrsx057ouIkTJ6pVq1by8fExtqCHbN68WQ0aNNDVq1ezbTj6pOrXr68aNWpo5syZWV1Kun3//feaNGmS/vzzTyUkJKhcuXJ677331K1bt1SPmTt3rg4ePKj4+HhVqVJFoaGhatq06WPP9cUXX2jfvn26cuWKDhw4oBo1ali18fHx0dmzZyVJNjY28vT0VPPmzTV16lQVKFDgkX2HhYVpyJAhlpA+o3r27Klr167phx9+MKS/xzGZTFq5cqVat26druMSExMVGhqqpUuX6sKFC/Ly8lLPnj01cuRImUwmSdLIkSM1dOhQtWnTRjY2GZujygxXAAAAAACQq8XFxWnBggXq3bt3VpeCZ5S7u7s+/PBD7dq1S7/99pt69eqlXr16acOGDY88ZuvWrWrcuLHWrl2rffv2qUGDBgoMDNSBAwdSPdfNmzf14osv6qOPPkq13bhx4xQdHa2oqCgtW7ZMW7du1aBBg57o+jJbQkJClp7/o48+0ty5czV79mwdO3ZMH330kT7++GN99tlnljbNmzfXjRs3tG7dugyfj8AVAAAAAADkamvXrpW9vb1eeOEFy7b7j+tv2LBBfn5+cnBw0CuvvKJLly5p3bp1qlSpklxcXNS5c2fFxcVZjktKStLkyZNVqlQpOTg4qHr16vr2228l3XtsvkGDBpKkAgUKyGQyqWfPnpKk9evX68UXX5Sbm5sKFiyoV199VSdPnszQdZ08eVKtWrWSp6ennJycVLt2bf38889WbXx8fDRhwgR1795dTk5O8vb21qpVq3T58mW1atVKTk5Oqlatmvbu3Ws55p9//lGnTp1UrFgxOTo6ytfXV19//XWGan1YYmKievfubbmPFSpU0KxZs6za9OzZU61bt9akSZPk6ekpNzc3jRs3Tnfv3tWwYcPk7u6u4sWLa+HChVbHDR8+XOXLl5ejo6NKly6tUaNGPTYQrF+/vtq0aaNKlSqpTJkyGjx4sKpVq6bt27c/8piZM2fq/fffV+3atVWuXDlNmjRJ5cqV048//pjqubp166bRo0erUaNGqbZzdnZWkSJFVKxYMTVo0EA9evTQ/v37Uz3mYaGhoapRo4aWLFkiHx8fubq6qmPHjrpx44alzbfffitfX185ODioYMGCatSokW7evKnQ0FAtWrRI//73v2UymWQymbR582bL8hDLly9XvXr1lC9fPi1btsxyrofv0cOzyr/66itVqVJF9vb2Klq0qAYOHChJlnZt2rSRyWRK12z0nTt3qlWrVmrZsqV8fHzUvn17NWnSRL/++qulja2trVq0aKGIiIh03cOUELgCAAAAAIBcbdu2bapZs2aK+0JDQzV79mzt3LlT586d0xtvvKGZM2cqPDxca9as0U8//WQ1S27y5MlavHixvvjiC/3+++8aOnSounbtqi1btqhEiRKWNSKPHz+u6OhoS4h48+ZNBQUFae/evYqMjJSNjY3atGmjpKSkJ76u2NhYtWjRQpGRkTpw4ICaNWumwMBARUVFWbWbMWOGAgICdODAAbVs2VLdunVT9+7d1bVrV+3fv19lypRR9+7dZTabJUm3b99WzZo1tWbNGh05ckR9+/ZVt27drMKrjEpKSlLx4sX1zTff6OjRoxo9erQ++OADrVixwqrdL7/8ovPnz2vr1q2aPn26xowZo1dffVUFChTQ7t279fbbb6tfv37666+/LMc4OzsrLCxMR48e1axZszR//nzNmDEjzbWZzWZFRkbq+PHjevnll9N1TTdu3JC7u3uaj0mrv//+Wz/++KPq1KmT7mNPnjypH374QatXr9bq1au1ZcsWTZkyRZIUHR2tTp066c0339SxY8e0efNmtW3bVmazWcHBwXrjjTfUrFkzRUdHKzo6WnXr1rX0O2LECA0ePFjHjh177DIK982dO1cDBgxQ3759dfjwYa1atUply5aVJO3Zs0eStHDhQkVHR1s+p0XdunUVGRmpP/74Q5J06NAhbd++Xc2bN7dq9/zzz2vbtm1p7vdRWMMVAADkGOlZH+5J13cDAAA5z9mzZ+Xl5ZXivgkTJiggIECS1Lt3b4WEhOjkyZMqXbq0JKl9+/batGmThg8frvj4eE2aNEk///yz/P39JUmlS5fW9u3b9a9//Uv16tWzhG0eHh5Wa7i2a9fO6rxfffWVChcurKNHj6pq1apPdF3Vq1dX9erVLZ/Hjx+vlStXatWqVZZZg5LUokUL9evXT5I0evRozZ07V7Vr19brr78u6d6MUH9/f128eNEyozI4ONhy/LvvvqsNGzZoxYoVev7555+o1ofZ2dlp7Nixls+lSpXSrl27tGLFCr3xxhuW7e7u7vr0009lY2OjChUq6OOPP1ZcXJw++OADSVJISIimTJmi7du3q2PHjpLurdV5n4+Pj4KDgxUREaH3338/1ZquX7+uYsWKKT4+Xra2tpozZ44aN26c5muaOnWqYmNjrerPiOHDh2vkyJFKTEzU7du3VadOHU2fPj3d/SQlJSksLEzOzs6S7s2wjYyM1MSJExUdHa27d++qbdu28vb2liT5+vpajnVwcFB8fLyKFCmSrN8hQ4aobdu26aplwoQJeu+99zR48GDLttq1a0uSChcuLElyc3NL8XypGTFihGJiYlSxYkXZ2toqMTFREydOVJcuXazaeXl56dy5c0pKSsrQOq7McAUAAAAAALnarVu3lC9fvhT3VatWzfJ7T09Py2PoD267dOmSJOnPP/9UXFycGjduLCcnJ8uvxYsXP3Z5gBMnTqhTp04qXbq0XFxcLI9L35+N2rx5c0t/VapUSdN1xcbGKjg4WJUqVZKbm5ucnJx07NixZDNcH75GyTpUu7/t/nUmJiZq/Pjx8vX1lbu7u5ycnLRhwwZLv8uWLbO6/iedMfj555+rZs2aKly4sJycnDRv3rxktVepUsUqGPP09LSq3dbWVgULFrTULknLly9XQECAihQpIicnJ40cOdLSb1RUlFXtkyZNshzn7OysgwcPas+ePZo4caKCgoK0efPmNF1LeHi4xo4dqxUrVsjDw0NSxu/TsGHDdPDgQf3222+KjIyUJLVs2VKJiYmSZNX322+//ch+fHx8LGGrJBUtWtRyv6pXr66GDRvK19dXr7/+uubPn6+rV6+mqb5atWql63ouXbqk8+fPq2HDhuk6Li1WrFihZcuWKTw8XPv379eiRYs0depULVq0yKqdg4ODkpKSFB8fn6HzMcMVAAAAAADkaoUKFXpkiGRnZ2f5vclksvp8f9v9x/5jY2MlSWvWrFGxYsWs2tnb26daQ2BgoLy9vTV//nx5eXkpKSlJVatW1Z07dyRJX375pW7dupWsptQEBwdr48aNmjp1qsqWLSsHBwe1b9/e0uejrvFR2+5f5yeffKJZs2Zp5syZ8vX1Vf78+TVkyBBLv6+99prVo+0P34u0iIiIUHBwsKZNmyZ/f385Ozvrk08+0e7dux9Z+/1aU/sa7dq1S126dNHYsWPVtGlTubq6KiIiQtOm3Xv6ycvLSwcPHrQc++Dj/zY2NpbH22vUqKFjx45p8uTJql+//mOv5a233tI333xjtS5rRu9ToUKFLPWUK1dOM2fOlL+/vzZt2qRGjRpZXYeLi8sj+0ntftna2mrjxo3auXOnZfmMDz/8ULt371apUqVSrS9//vxWn21sbCzLUtz34Nq5Dg4OqfaXEcOGDdOIESMss5x9fX119uxZTZ48WT169LC0u3LlivLnz5/hWghcAQAAgGyEpTMAwHh+fn5aunRphvupXLmy7O3tFRUVpXr16qXYJm/evJJkmYUo3XsJ1fHjxzV//ny99NJLkpTsZUxPElru2LFDPXv2VJs2bSTdC4TPnDmT7n5S6rdVq1bq2rWrpHtB7B9//KHKlStLujcT9MEZk096jrp166p///6WbRl9iZh07+VJ3t7e+vDDDy3bzp49a/l9njx5LCHm46RlJuTXX3+tN998UxEREWrZsqXVPiPu04NsbW0lyRLMp/U6HsdkMikgIEABAQEaPXq0vL29tXLlSgUFBSlv3rxWYzk1hQsX1oULF2Q2my0h/oOhsLOzs3x8fBQZGWl5udzD7Ozs0ny+B8XFxSVbIsDW1jbZGslHjhyRn59fuvt/GIErAAAAAADI1Zo2baqQkBBdvXpVBQoUeOJ+nJ2dFRwcrKFDhyopKUkvvviirl+/rh07dsjFxUU9evSQt7e3TCaTVq9erRYtWsjBwUEFChRQwYIFNW/ePBUtWlRRUVEaMWJEhq+rXLly+v777xUYGCiTyaRRo0Zl6CVcD/b77bffaufOnSpQoICmT5+uixcvWgLX1Fy+fNkqZJPuPcKe0jkWL16sDRs2qFSpUlqyZIn27Nnz2FmVaak9KipKERERql27ttasWaOVK1c+9rjJkyerVq1aKlOmjOLj47V27VotWbJEc+fOtbQJCQnR33//rcWLF0u6t4xAjx49NGvWLNWpU0cXLlyQdG8mp6ur6yPPdeXKFUVFRen8+fOS7r1gTZKKFClitXbpjRs3LAHmuXPn9P7776tw4cJWL67KqN27dysyMlJNmjSRh4eHdu/ercuXL6tSpUqS7i1HsGHDBh0/flwFCxZM9brq16+vy5cv6+OPP1b79u21fv16rVu3zmr2bWhoqN5++215eHioefPmunHjhnbs2KF3333Xcr7IyEgFBATI3t4+zX9eAwMDNXHiRJUsWVJVqlTRgQMHNH36dL355ptW7bZt26YmTZqk9zYlQ+AKAAAAAAAy1bM+497X11fPPfecVqxYYXl51JMaP368ChcurMmTJ+vUqVNyc3PTc889Z3mJU7FixTR27FiNGDFCvXr1Uvfu3RUWFqaIiAgNGjRIVatWVYUKFfTpp58+9lH1x7kfKNWtW1eFChXS8OHDFRMTk6E+pXsvnTp16pSaNm0qR0dH9e3bV61bt9b169cfe2x4eLjCw8Otto0fP94yW/a+fv366cCBA+rQoYNMJpM6deqk/v37a926dRmq/bXXXtPQoUM1cOBAxcfHq2XLlho1apRCQ0NTPe7mzZvq37+//vrrLzk4OKhixYpaunSpOnToYGkTHR1ttcbsvHnzdPfuXQ0YMEADBgywbO/Ro4fCwsIeea5Vq1apV69els/3H4MfM2aMVZ2jR4/W6NGjJd2bPVq7dm399NNPKliwYFpuRZq4uLho69atmjlzpmJiYuTt7a1p06apefPmkqQ+ffpo8+bNqlWrlmJjY7Vp0ybL+sMPq1SpkubMmaNJkyZp/PjxateunYKDgzVv3jxLmx49euj27duaMWOGgoODVahQIbVv396yf9q0aQoKCtL8+fNVrFgxnTlzRmfOnFGpUqW0adOmR/6Z+eyzzzRq1Cj1799fly5dkpeXl/r162e5f5L0999/a+fOnYbMdjeZH148IReKiYmRq6urrl+/nuqaFjBOQkKC1q5dq8rXKstWto9tX7run+k7QbmM/2sEkFGMc+QGz9o451FrZAbGOXKDZ22c5wY58efQ27dv6/Tp0ypVqtQjX0D1LFuzZo2GDRumI0eOZOjt5ACerk2bNqlt27Y6depUhmaoDx8+XFevXrUKgB+Unr/jmOEKAAAAAAByvZYtW+rEiRP6+++/VaJEiawuB0AarV27Vh988EGGwlZJ8vDwUFBQkCE1EbgCAAAAAABIGjJkSFaXACCdPvnkE0P6ee+9tD9F9DjMkQcAAAAAAAAAgzDDFQAAAADwTGGtYgBAdsYMVwAAAAAAYBjezQ0gJ0rP320ErgAAAAAAIMPs7OwkSXFxcVlcCQAY786dO5IkW1vbx7ZlSQEAyKaa/7YvXe3XVauZSZUAmSe943xVJtUBAAAez9bWVm5ubrp06ZIkydHRUSaTKYurAoCMS0pK0uXLl+Xo6Kg8eR4fpxK4AkAuwVpoAAAAyGxFihSRJEvoCgA5hY2NjUqWLJmmf0gicAUAAAAAAIYwmUwqWrSoPDw8lJCQkNXlAIBh8ubNKxubtK3OSuAKiJl/AAAAAGAkW1vbNK1zCAA5ES/NAgAAAAAAAACDMMMVAAAAyEK8HA4AACBnYYYrAAAAAAAAABiEwBUAAAAAAAAADMKSAgAAAACATMXSGQCA3IQZrgAAAAAAAABgEAJXAAAAAAAAADAISwogR+KRJQAAAAAAAGQFZrgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIFkauM6dO1fVqlWTi4uLXFxc5O/vr3Xr1ln23759WwMGDFDBggXl5OSkdu3a6eLFi1Z9REVFqWXLlnJ0dJSHh4eGDRumu3fvPu1LAQAAAAAAAICsDVyLFy+uKVOmaN++fdq7d69eeeUVtWrVSr///rskaejQofrxxx/1zTffaMuWLTp//rzatm1rOT4xMVEtW7bUnTt3tHPnTi1atEhhYWEaPXp0Vl0SAAAAAAAAgFwsT1aePDAw0OrzxIkTNXfuXP3nP/9R8eLFtWDBAoWHh+uVV16RJC1cuFCVKlXSf/7zH73wwgv66aefdPToUf3888/y9PRUjRo1NH78eA0fPlyhoaHKmzdvVlwWAAAAAAAAgFwqSwPXByUmJuqbb77RzZs35e/vr3379ikhIUGNGjWytKlYsaJKliypXbt26YUXXtCuXbvk6+srT09PS5umTZvqnXfe0e+//y4/P78UzxUfH6/4+HjL55iYGElSQkKCEhISMukK8aD79zlRiWlrn2hOV/92SUnpq8fGNh2NGSNIG8Y5cgPGOXIDxjlyA8b508fPngCQc5nMZnP6vlMa7PDhw/L399ft27fl5OSk8PBwtWjRQuHh4erVq5dVMCpJzz//vBo0aKCPPvpIffv21dmzZ7VhwwbL/ri4OOXPn19r165V8+bNUzxnaGioxo4dm2x7eHi4HB0djb1AAAAAAAAeEhcXp86dO+v69etycXHJ6nIAAAbK8hmuFSpU0MGDB3X9+nV9++236tGjh7Zs2ZKp5wwJCVFQUJDlc0xMjEqUKKEmTZrwje4pSUhI0MaNG1XhWgXZ6vH/eu1T51S6+m932z1d7SN+/CbNbe1GTExX38i9GOfIDRjnyA0Y58gNGOdP3/0nLQEAOU+WB6558+ZV2bJlJUk1a9bUnj17NGvWLHXo0EF37tzRtWvX5ObmZml/8eJFFSlSRJJUpEgR/frrr1b9Xbx40bLvUezt7WVvb59su52dnezs7DJ6SUgH2///3+PY2ZrS1W+CTfreB2eXlLZHpyQxRpBujHPkBoxz5AaMc+QGjPOnJ7vWDQB4vPR913sKkpKSFB8fr5o1a8rOzk6RkZGWfcePH1dUVJT8/f0lSf7+/jp8+LAuXbpkabNx40a5uLiocuXKT712AAAAAAAAALlbls5wDQkJUfPmzVWyZEnduHFD4eHh2rx5szZs2CBXV1f17t1bQUFBcnd3l4uLi9599135+/vrhRdekCQ1adJElStXVrdu3fTxxx/rwoULGjlypAYMGJDiDFYAAAAAAAAAyExZGrheunRJ3bt3V3R0tFxdXVWtWjVt2LBBjRs3liTNmDFDNjY2ateuneLj49W0aVPNmTPHcrytra1Wr16td955R/7+/sqfP7969OihcePGZdUlAQAAAAAAAMjFsjRwXbBgQar78+XLp88//1yff/75I9t4e3tr7dq1RpcGAAAAAAAAAOn2zK3hCgAAAAAAAADZFYErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIMQuAIAAAAAAACAQQhcAQAAAAAAAMAgBK4AAAAAAAAAYBACVwAAAAAAAAAwCIErAAAAAAAAABiEwBUAAAAAAAAADELgCgAAAAAAAAAGIXAFAAAAAAAAAIPkSe8BSUlJ2rJli7Zt26azZ88qLi5OhQsXlp+fnxo1aqQSJUpkRp0AAAAAAAAA8MxL8wzXW7duacKECSpRooRatGihdevW6dq1a7K1tdWff/6pMWPGqFSpUmrRooX+85//ZGbNAAAAAAAAAPBMSvMM1/Lly8vf31/z589X48aNZWdnl6zN2bNnFR4ero4dO+rDDz9Unz59DC0WAAAAAAAAAJ5laQ5cf/rpJ1WqVCnVNt7e3goJCVFwcLCioqIyXBwAAAAAAAAAZCdpXlLgcWHrg+zs7FSmTJknKggAAAAAAAAAsqs0B64PWr9+vbZv3275/Pnnn6tGjRrq3Lmzrl69alhxAAAAAAAAAJCdPFHgOmzYMMXExEiSDh8+rPfee08tWrTQ6dOnFRQUZGiBAAAAAAAAAJBdpHkN1wedPn1alStXliR99913evXVVzVp0iTt379fLVq0MLRAAAAAAAAAAMgunmiGa968eRUXFydJ+vnnn9WkSRNJkru7u2XmKwAAAAAAAADkNk80w/XFF19UUFCQAgIC9Ouvv2r58uWSpD/++EPFixc3tEAAAAAAAAAAyC6eaIbr7NmzlSdPHn377beaO3euihUrJklat26dmjVrZmiBAAAAAAAAAJBdPNEM15IlS2r16tXJts+YMSPDBQEAAAAAAABAdvVEget9ly5d0qVLl5SUlGS1vVq1ahkqCgAAAAAAAACyoycKXPft26cePXro2LFjMpvNkiSTySSz2SyTyaTExERDiwQAAAAAAACA7OCJAtc333xT5cuX14IFC+Tp6SmTyWR0XQAAAAAAAACQ7TxR4Hrq1Cl99913Klu2rNH1AAAAAAAAAEC2ZfMkBzVs2FCHDh0yuhYAAAAAAAAAyNaeaIbrl19+qR49eujIkSOqWrWq7OzsrPa/9tprhhQHAAAAAAAAANnJEwWuu3bt0o4dO7Ru3bpk+3hpFgAAAAAAAIDc6okC13fffVddu3bVqFGj5OnpaXRNAAAAAICM+m86Xm6c6CDp60wrBQCA3OSJAtd//vlHQ4cOJWwFAABA9kQQBQAAgEzyRC/Natu2rTZt2mR0LQAAAAAAAACQrT3RDNfy5csrJCRE27dvl6+vb7KXZg0aNMiQ4gDgmcOMKOQGjHMAAAAAeGJPFLh++eWXcnJy0pYtW7RlyxarfSaTicA1t+IHdAAAAAAAAORyTxS4nj592ug6AAAAAAAAACDbe6I1XAEAAAAAAAAAyaU5cJ0yZYpu3bqVpra7d+/WmjVrnrgoAAAAAAAAAMiO0rykwNGjR1WyZEm9/vrrCgwMVK1atVS4cGFJ0t27d3X06FFt375dS5cu1fnz57V48eJMKxpPx/WxY9PV3rVDJhUCAAAAAAAAZBNpDlwXL16sQ4cOafbs2ercubNiYmJka2sre3t7xcXFSZL8/Pz01ltvqWfPnsqXL1+mFQ0AAAAAAAAAz6J0vTSrevXqmj9/vv71r3/pt99+09mzZ3Xr1i0VKlRINWrUUKFChTKrTgAAAOCReDIHAAAAz4p0Ba732djYqEaNGqpRo4bB5QAAAKMRRAEAAADA0/NEgSsA5BQEUQAAILvg/1sAAMgebLK6AAAAAAAAAADIKQhcAQAAAAAAAMAgBK4AAAAAAAAAYJB0Ba62tra6dOlSZtUCAAAAAAAAANlaugJXs9mcWXUAAAAAAAAAQLbHkgIAAAAAAAAAYJA86T3gyy+/lJOTU6ptBg0a9MQFAQAAAAAAAEB2le7A9YsvvpCtre0j95tMJgJXAAAAAAAAALlSugPXvXv3ysPDIzNqAQAAAAAAAIBsLV1ruJpMpsyqAwAAAAAAAACyvXQFrmazObPqAAAAAAAAAIBsL12B65gxYx77wiwAAAAAAAAAyK3SFbguWLBAt27dsnyePXu2YmJiDC8KAAAAAAAAALKjdAWuf/31lxITEy2fP/jgA/3vf/8zvCgAAAAAAAAAyI7SFbg+jDVdAQAAAAAAAOD/ZChwBQAAAAAAAAD8nzzpPeDLL7+0vDjr7t27CgsLU6FChazaDBo0yJjqAAAAAAAAACAbSVfgWrJkSc2fP9/yuUiRIlqyZIlVG5PJROAKAAAAAAAAIFdKV+B65syZTCoDAAAAAAAAALK/dK3h+sorr+jatWuZVAoAAAAAAAAAZG/pClw3b96sO3fuZFYtAAAAAAAAAJCtpStwBQAAAAAAAAA8WrrWcJWko0eP6sKFC6m2qVat2hMXBAAAAAAAAADZVboD14YNG8psNifbbjKZZDabZTKZlJiYaEhxAAAAAAAAAJCdpDtw3b17twoXLpwZtQAAAAAAAABAtpbuwLVkyZLy8PDIjFoAAAAAAAAAIFvjpVkAAAAAAAAAYJB0Ba716tXTnTt3MqsWAAAAAAAAAMjW0hW4bt26VXnz5s2sWgAAAAAAAAAgW0tX4Go2mzOrDgAAAAAAAADI9tK9hqvJZMqMOgAAAAAAAAAg28uT3gPKly//2ND1ypUrT1wQAAAAAAAAAGRX6Q5cx44dK1dX18yoBQAAAAAAAACytXQHrh07dpSHh0dm1AIAAAAAAAAA2Vq61nBl/VYAAAAAAAAAeLR0Ba5mszmz6gAAAAAAAACAbC9dSwokJSVlVh0AAAAAAAAAkO2la4YrAAAAAAAAAODRCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMkqWB6+TJk1W7dm05OzvLw8NDrVu31vHjx63a3L59WwMGDFDBggXl5OSkdu3a6eLFi1ZtoqKi1LJlSzk6OsrDw0PDhg3T3bt3n+alAAAAAAAAAEDWBq5btmzRgAED9J///EcbN25UQkKCmjRpops3b1raDB06VD/++KO++eYbbdmyRefPn1fbtm0t+xMTE9WyZUvduXNHO3fu1KJFixQWFqbRo0dnxSUBAAAAAAAAyMXyZOXJ169fb/U5LCxMHh4e2rdvn15++WVdv35dCxYsUHh4uF555RVJ0sKFC1WpUiX95z//0QsvvKCffvpJR48e1c8//yxPT0/VqFFD48eP1/DhwxUaGqq8efMmO298fLzi4+Mtn2NiYiRJCQkJSkhIyMQrzl7u2qQvj09IdEh726R7bROVmMa+zemqxS4pKV3tE2xs09GYMZKTMM4f1ZhxnpMwzh/VmHGekzDOH9WYcZ6TMM4f1Th7jnN+9gSAnMtkNpvT950yE/35558qV66cDh8+rKpVq+qXX35Rw4YNdfXqVbm5uVnaeXt7a8iQIRo6dKhGjx6tVatW6eDBg5b9p0+fVunSpbV//375+fklO09oaKjGjh2bbHt4eLgcHR0z49IAAAAAALCIi4tT586ddf36dbm4uGR1OQAAA2XpDNcHJSUlaciQIQoICFDVqlUlSRcuXFDevHmtwlZJ8vT01IULFyxtPD09k+2/vy8lISEhCgoKsnyOiYlRiRIl1KRJE77RPSBmypR0tXdpOznNbROSHLTx1FeqcK2CbPX4f732qXMqXbW0u+2ervYRP36T5rZ2Iyamq2882xjnKWOc5yyM85QxznMWxnnKGOc5C+M8Zdl1nN9/0hIAkPM8M4HrgAEDdOTIEW3fvj3Tz2Vvby97e/tk2+3s7GRnZ5fp588u8qTzsR8721vpPoft///v8X2b0tVvQjoft7JLStujU5IYIzkM4/wRbRnnOQrj/BFtGec5CuP8EW0Z5zkK4/wRbbPpOM+udQMAHi9LX5p138CBA7V69Wpt2rRJxYsXt2wvUqSI7ty5o2vXrlm1v3jxoooUKWJpc/HixWT77+8DAAAAAAAAgKclSwNXs9msgQMHauXKlfrll19UqlQpq/01a9aUnZ2dIiMjLduOHz+uqKgo+fv7S5L8/f11+PBhXbp0ydJm48aNcnFxUeXKlZ/OhQAAAAAAAACAsnhJgQEDBig8PFz//ve/5ezsbFlz1dXVVQ4ODnJ1dVXv3r0VFBQkd3d3ubi46N1335W/v79eeOEFSVKTJk1UuXJldevWTR9//LEuXLigkSNHasCAASkuGwAAAAAAAAAAmSVLA9e5c+dKkurXr2+1feHCherZs6ckacaMGbKxsVG7du0UHx+vpk2bas6cOZa2tra2Wr16td555x35+/srf/786tGjh8aNG/e0LgMAAAAAAAAAJGVx4Go2mx/bJl++fPr888/1+eefP7KNt7e31q5da2RpAAAAAAAAAJBuz8RLswAAAAAAAAAgJyBwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACDELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABiFwBQAAAAAAAACD5MnqAgAAAAAgt5py4H9pbvtOJtYBAACMQ+AKIMfhBxfkBoxzAAAAAHg2saQAAAAAAAAAABiEGa4AAAB4JjGTGwAAANkRgWsuww8uAAAAAAAAQOZhSQEAAAAAAAAAMAiBKwAAAAAAAAAYhMAVAAAAAAAAAAxC4AoAAAAAAAAABuGlWQCeebOuzkrnEV0ypQ4AAAAAAIDHIXDN5giiAAAAAAAAgGcHSwoAAAAAAAAAgEGY4QoAwDOAJxYAIGfg73MAAEDgCgAAgKeCIAoAAAC5AUsKAAAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAbJ0sB169atCgwMlJeXl0wmk3744Qer/WazWaNHj1bRokXl4OCgRo0a6cSJE1Ztrly5oi5dusjFxUVubm7q3bu3YmNjn+JVAAAAAAAAAMA9WRq43rx5U9WrV9fnn3+e4v6PP/5Yn376qb744gvt3r1b+fPnV9OmTXX79m1Lmy5duuj333/Xxo0btXr1am3dulV9+/Z9WpcAAAAAAAAAABZ5svLkzZs3V/PmzVPcZzabNXPmTI0cOVKtWrWSJC1evFienp764Ycf1LFjRx07dkzr16/Xnj17VKtWLUnSZ599phYtWmjq1Kny8vJ6atcCAAAAAAAAAFkauKbm9OnTunDhgho1amTZ5urqqjp16mjXrl3q2LGjdu3aJTc3N0vYKkmNGjWSjY2Ndu/erTZt2qTYd3x8vOLj4y2fY2JiJEkJCQlKSEjIpCvKHKa7pnS1t0m6m+a2d23SNwE6IdEh7W2T7rVNVGIa+zanqxa7pKR0tU+wsU1H4+w1RnICxnnKGOc5C+M8ZYzznIVxnjLGec7COE8Z4zy57PazJwAg7Uxmszl93ykziclk0sqVK9W6dWtJ0s6dOxUQEKDz58+raNGilnZvvPGGTCaTli9frkmTJmnRokU6fvy4VV8eHh4aO3as3nnnnRTPFRoaqrFjxybbHh4eLkdHR+MuCgAAAACAFMTFxalz5866fv26XFxcsrocAICBntkZrpkpJCREQUFBls8xMTEqUaKEmjRpku2+0c29Njdd7W9HvZHmtr3Xzk9X3y5tJ6e5bUKSgzae+koVrlWQrR7/r9c+dU6lq5Z2t93T1T7ix2/S3NZuxMR09Y2MY5ynjHGeszDOU8Y4z1kY5yljnOcsjPOUMc6Tu/+kJQAg53lmA9ciRYpIki5evGg1w/XixYuqUaOGpc2lS5esjrt7966uXLliOT4l9vb2sre3T7bdzs5OdnZ2BlT/9JjzpG+CcpJN2r/kedL52I+d7a10tZck2///3+P7Tt+jWQnpfNzKLiltj05JynZjJCdgnKeMcZ6zMM5TxjjPWRjnKWOc5yyM85QxzpPLrnUDAB4vfd/1nqJSpUqpSJEiioyMtGyLiYnR7t275e/vL0ny9/fXtWvXtG/fPkubX375RUlJSapTp85TrxkAAAAAAABA7palM1xjY2P1559/Wj6fPn1aBw8elLu7u0qWLKkhQ4ZowoQJKleunEqVKqVRo0bJy8vLss5rpUqV1KxZM/Xp00dffPGFEhISNHDgQHXs2FFeXl5ZdFUAAAAAAAAAcqssDVz37t2rBg0aWD7fX1e1R48eCgsL0/vvv6+bN2+qb9++unbtml588UWtX79e+fLlsxyzbNkyDRw4UA0bNpSNjY3atWunTz/99KlfCwAAAAAAAABkaeBav359mc2PXuPIZDJp3LhxGjdu3CPbuLu7Kzw8PDPKAwAAAAAAAIB0eWbXcAUAAAAAAACA7IbAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgEAJXAAAAAAAAADBIjglcP//8c/n4+ChfvnyqU6eOfv3116wuCQAAAAAAAEAukyMC1+XLlysoKEhjxozR/v37Vb16dTVt2lSXLl3K6tIAAAAAAAAA5CJ5sroAI0yfPl19+vRRr169JElffPGF1qxZo6+++kojRoxI1j4+Pl7x8fGWz9evX5ckXblyRQkJCU+naIPEX49/fKMH3Im5mua2V+/cSVffd6/nS3PbhKR8iouL0/Vb12Ur28e2/+d6bLpq0W27dDX/587dNLe1++ef9NWCDGOcPwLjPEdhnD8C4zxHYZw/AuM8R2GcPwLjPJkbN25IksxmcxZXAgAwmsmczf92v3PnjhwdHfXtt9+qdevWlu09evTQtWvX9O9//zvZMaGhoRo7duxTrBIAAAAAgOTOnTun4sWLZ3UZAAADZfsZrv/73/+UmJgoT09Pq+2enp7673//m+IxISEhCgoKsnxOSkrSlStXVLBgQZlMpkytF/fExMSoRIkSOnfunFxcXLK6HCBTMM6RGzDOkRswzpEbMM6fPrPZrBs3bsjLyyurSwEAGCzbB65Pwt7eXvb29lbb3NzcsqaYXM7FxYX/oUOOxzhHbsA4R27AOEduwDh/ulxdXbO6BABAJsj2L80qVKiQbG1tdfHiRavtFy9eVJEiRbKoKgAAAAAAAAC5UbYPXPPmzauaNWsqMjLSsi0pKUmRkZHy9/fPwsoAAAAAAAAA5DY5YkmBoKAg9ejRQ7Vq1dLzzz+vmTNn6ubNm+rVq1dWl4ZHsLe315gxY5It7QDkJIxz5AaMc+QGjHPkBoxzAACMYzKbzeasLsIIs2fP1ieffKILFy6oRo0a+vTTT1WnTp2sLgsAAAAAAABALpJjAlcAAAAAAAAAyGrZfg1XAAAAAAAAAHhWELgCAAAAAAAAgEEIXAEAAAAAAADAIASuAAAAAAAAAGAQAlcAAPDEePcmAAAAAFgjcAUAAE/M3t5ex44dy+oyAABPYNu2beratav8/f31999/S5KWLFmi7du3Z3FlAABkb3myugDg3LlzGjNmjL766qusLgXIkFu3bmnfvn1yd3dX5cqVrfbdvn1bK1asUPfu3bOoOiBjgoKCUtyemJioKVOmqGDBgpKk6dOnP82ygEwxe/Zs/frrr2rRooU6duyoJUuWaPLkyUpKSlLbtm01btw45cnD/0Yje/vuu+/UrVs3denSRQcOHFB8fLwk6fr165o0aZLWrl2bxRUCAJB9mcw8C4gsdujQIT333HNKTEzM6lKAJ/bHH3+oSZMmioqKkslk0osvvqiIiAgVLVpUknTx4kV5eXkxzpFt2djYqHr16nJzc7PavmXLFtWqVUv58+eXyWTSL7/8kjUFAgaZMGGCPv74YzVp0kQ7duzQkCFD9Mknn2jo0KGysbHRjBkz9M4772js2LFZXSqQIX5+fho6dKi6d+8uZ2dnHTp0SKVLl9aBAwfUvHlzXbhwIatLBAAg2+Kf5pHpVq1aler+U6dOPaVKgMwzfPhwVa1aVXv37tW1a9c0ZMgQBQQEaPPmzSpZsmRWlwdk2KRJkzRv3jxNmzZNr7zyimW7nZ2dwsLCks3qBrKrsLAwhYWFqW3btjp06JBq1qypRYsWqUuXLpKkihUr6v333ydwRbZ3/Phxvfzyy8m2u7q66tq1a0+/IAAAchACV2S61q1by2QypfpiFZPJ9BQrAoy3c+dO/fzzzypUqJAKFSqkH3/8Uf3799dLL72kTZs2KX/+/FldIpAhI0aMUMOGDdW1a1cFBgZq8uTJsrOzy+qyAMOdP39etWrVkiRVr15dNjY2qlGjhmX/c889p/Pnz2dRdYBxihQpoj///FM+Pj5W27dv367SpUtnTVEAAOQQvDQLma5o0aL6/vvvlZSUlOKv/fv3Z3WJQIbdunXLaj0/k8mkuXPnKjAwUPXq1dMff/yRhdUBxqhdu7b27duny5cvq1atWjpy5Aj/YIYcp0iRIjp69Kgk6cSJE0pMTLR8lqTff/9dHh4eWVUeYJg+ffpo8ODB2r17t0wmk86fP69ly5YpODhY77zzTlaXBwBAtsYMV2S6mjVrat++fWrVqlWK+x83+xXIDipWrKi9e/eqUqVKVttnz54tSXrttdeyoizAcE5OTlq0aJEiIiLUqFEj1iVGjtOlSxd1795drVq1UmRkpN5//30FBwfrn3/+kclk0sSJE9W+ffusLhPIsBEjRigpKUkNGzZUXFycXn75Zdnb2ys4OFjvvvtuVpcHAEC2xkuzkOm2bdummzdvqlmzZinuv3nzpvbu3at69eo95coA40yePFnbtm175Bt9+/fvry+++EJJSUlPuTIg8/z111/at2+fGjVqxLIZyDGSkpI0ZcoU7dq1S3Xr1tWIESO0fPlyvf/++4qLi1NgYKBmz57NmEeOcefOHf3555+KjY1V5cqV5eTklNUlAQCQ7RG4AgAAAAAAAIBBWFIAAAAAAHKZmzdvasqUKYqMjNSlS5eSPYVz6tSpLKoMAIDsj8AVAAAAAHKZt956S1u2bFG3bt1UtGhRXoIIAICBWFIAAAAAAHIZNzc3rVmzRgEBAVldCgAAOY5NVhcAAAAAAHi6ChQoIHd396wuAwCAHInAFQAAAABymfHjx2v06NGKi4vL6lIAAMhxWFIAAAAAAHIZPz8/nTx5UmazWT4+PrKzs7Pav3///iyqDACA7I+XZgEAAABALtO6deusLgEAgByLGa4AAAAAAAAAYBDWcAUAAAAAAAAAg7CkAAAAAADkAu7u7vrjjz9UqFAhFShQQCaT6ZFtr1y58hQrAwAgZyFwBQAAAIBcYMaMGXJ2dpYkzZw5M2uLAQAgB2MNVwAAAAAAAAAwCDNcAQAAACAXiImJSXNbFxeXTKwEAICcjRmuAAAAAJAL2NjYpLpuqySZzWaZTCYlJiY+paoAAMh5mOEKAAAAALnApk2bsroEAAByBWa4AgAAAEAu0LZtW4WFhcnFxUWLFy9Whw4dZG9vn9VlAQCQ4xC4AgAAAEAukDdvXp09e1ZFixaVra2toqOj5eHhkdVlAQCQ47CkAAAAAADkAhUrVlRISIgaNGggs9msFStWPPLlWN27d3/K1QEAkHMwwxUAAAAAcoEdO3bovffe08mTJ3XlyhU5Ozun+BItk8mkK1euZEGFAADkDASuAAAAAJDL2NjY6MKFCywpAABAJrDJ6gIAAAAAAJmvbdu2iomJkSQtXLhQzs7OWVwRAAA5EzNcAQAAACAX4KVZAAA8Hbw0CwAAAAByAV6aBQDA08EMVwAAAADIBXbu3KmgoCBemgUAQCYjcAUAAACAXMbGxkbR0dHy9PTM6lIAAMhxCFwBAAAAIJc5e/asXFxc9NVXX+nYsWOSpCpVqqh3796PXGYAAACkDYErAAAAAOQye/fuVdOmTeXg4KDnn39ekrRnzx7dunVLGzZsUM2aNbO4QgAAsi8CVwAAAADIZV566SWVLVtW8+fPV548996lfPfuXb311ls6deqUtm7dmsUVAgCQfRG4AgAAAEAu4+DgoAMHDqhixYpW248ePapatWopLi4uiyoDACD7s8nqAgAAAAAAT5eLi4uioqKSbT937pycnZ2zoCIAAHIOAlcAAAAAyGU6dOig3r17a/ny5Tp37pzOnTuniIgIvfXWW+rUqVNWlwcAQLaWJ6sLAAAAAAA8XVOnTpXJZFL37t119+5dSZKdnZ3eeecdTZkyJYurAwAge2MNVwAAAADIpeLi4nTy5ElJUpkyZeTo6JjFFQEAkP0RuAIAAAAAAACAQVjDFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAEgDk8mkH374wbD+Nm/eLJPJpGvXrhnW59Nw584dlS1bVjt37szqUnKF9evXq0aNGkpKSsrqUgAAAACkEYErACDH2rVrl2xtbdWyZcsM9xUdHa3mzZsbUFXa+fj4yGQyyWQyydHRUb6+vvryyy/T3Y+RYfEXX3yhUqVKqW7duob0l9V8fHw0c+bMdB939uxZOTg4KDY21viiHtCsWTPZ2dlp2bJlmXoeAAAAAMYhcAUA5FgLFizQu+++q61bt+r8+fOptjWbzbp7926y7Xfu3JEkFSlSRPb29plSZ2rGjRun6OhoHTlyRF27dlWfPn20bt26p16HdO8ezZ49W7179061XUJCwlOqKOv8+9//VoMGDeTk5JTp5+rZs6c+/fTTTD8PAAAAAGMQuAIAcqTY2FgtX75c77zzjlq2bKmwsDCr/fcf6V+3bp1q1qwpe3t7bd++XfXr19fAgQM1ZMgQFSpUSE2bNpVkPUu0bt26Gj58uFV/ly9flp2dnbZu3SpJWrJkiWrVqiVnZ2cVKVJEnTt31qVLl9J9HfePL126tIYPHy53d3dt3LjRsn/Pnj1q3LixChUqJFdXV9WrV0/79++37Pfx8ZEktWnTRiaTyfJZuhcaPvfcc8qXL59Kly6tsWPHphg637dv3z6dPHnSasbwmTNnZDKZtHz5ctWrV0/58uWzzMb88ssvValSJeXLl08VK1bUnDlzrPr79ddf5efnp3z58qlWrVpauXKlTCaTDh48KEkKCwuTm5ub1TE//PCDTCaT1bbUrsNsNis0NFQlS5aUvb29vLy8NGjQIElS/fr1dfbsWQ0dOtQyk1i6N3s1MDBQBQoUUP78+VWlShWtXbs22Tlfe+01SffG0vPPP6/8+fPLzc1NAQEBOnv2bJrv87Vr19SvXz95enoqX758qlq1qlavXm3ZHxgYqL179+rkyZOP/NoAAAAAeHbkyeoCAADIDCtWrFDFihVVoUIFde3aVUOGDFFISEiysG7EiBGaOnWqSpcurQIFCkiSFi1apHfeeUc7duxIse8uXbro448/1pQpUyz9LV++XF5eXnrppZck3ZvlOX78eFWoUEGXLl1SUFCQevbsmSy4S6ukpCStXLlSV69eVd68eS3bb9y4oR49euizzz6T2WzWtGnT1KJFC504cULOzs7as2ePPDw8tHDhQjVr1ky2traSpG3btql79+769NNP9dJLL+nkyZPq27evJGnMmDEp1rBt2zaVL19ezs7OyfaNGDFC06ZNswSoy5Yt0+jRozV79mz5+fnpwIED6tOnj/Lnz68ePXooNjZWr776qho3bqylS5fq9OnTGjx4cLrvy+Ou47vvvtOMGTMUERGhKlWq6MKFCzp06JAk6fvvv1f16tXVt29f9enTx9LngAEDdOfOHW3dulX58+fX0aNHrWayXrt2Tdu3b9eSJUt09+5dtW7dWn369NHXX3+tO3fu6Ndff7WMi8fVl5SUpObNm+vGjRtaunSpypQpo6NHj1q+TpJUsmRJeXp6atu2bSpTpky67xEAAACAp8wMAEAOVLduXfPMmTPNZrPZnJCQYC5UqJB506ZNlv2bNm0ySzL/8MMPVsfVq1fP7Ofnl6w/SeaVK1eazWaz+dKlS+Y8efKYt27datnv7+9vHj58+CPr2bNnj1mS+caNG1bnv3r16iOP8fb2NufNm9ecP39+c548ecySzO7u7uYTJ0488pjExESzs7Oz+ccff0yx9vsaNmxonjRpktW2JUuWmIsWLfrIvgcPHmx+5ZVXrLadPn3aLMlyr+8rU6aMOTw83Grb+PHjzf7+/maz2Wz+17/+ZS5YsKD51q1blv1z5841SzIfOHDAbDabzQsXLjS7urpa9bFy5Urzg//78rjrmDZtmrl8+fLmO3fupHhN3t7e5hkzZlht8/X1NYeGhqbY3mw2m5ctW2auVauW2Ww2m//55x+zJPPmzZtTbPu4+jZs2GC2sbExHz9+/JHnM5vNZj8/v1RrAgAAAPDsYEkBAECOc/z4cf3666/q1KmTJClPnjzq0KGDFixYkKxtrVq1km2rWbNmqv0XLlxYTZo0sTw6f/r0ae3atUtdunSxtNm3b58CAwNVsmRJOTs7q169epKkqKiodF3LsGHDdPDgQf3yyy+qU6eOZsyYobJly1r2X7x4UX369FG5cuXk6uoqFxcXxcbGPvY8hw4d0rhx4+Tk5GT51adPH0VHRysuLi7FY27duqV8+fKluO/B+3jz5k2dPHlSvXv3tup/woQJlsfijx07pmrVqln15+/vn+b7ktbreP3113Xr1i2VLl1affr00cqVK1NdNkGSBg0apAkTJiggIEBjxozRb7/9ZrX/weUE3N3d1bNnTzVt2lSBgYGaNWuWoqOj01zfwYMHVbx4cZUvXz7VmhwcHB75dQEAAADwbCFwBQDkOAsWLNDdu3fl5eWlPHnyKE+ePJo7d66+++47Xb9+3apt/vz5kx2f0raHdenSRd9++60SEhIUHh4uX19f+fr6SroXODZt2lQuLi5atmyZ9uzZo5UrV0r6v5dwpVWhQoVUtmxZvfTSS/rmm280aNAgHT161LK/R48eOnjwoGbNmqWdO3fq4MGDKliw4GPPExsbq7Fjx+rgwYOWX4cPH9aJEyceGaoWKlRIV69eTXHfg/csNjZWkjR//nyr/o8cOaL//Oc/ab52Gxsbmc1mq20Pv5DrcddRokQJHT9+XHPmzJGDg4P69++vl19+OdUXe7311ls6deqUunXrpsOHD6tWrVr67LPPJN37+q1fv94SuErSwoULtWvXLtWtW1fLly9X+fLlLdf5uPocHBzSdC+uXLmiwoULp6ktAAAAgKxF4AoAyFHu3r2rxYsXa9q0aVYh16FDh+Tl5aWvv/7akPO0atVKt2/f1vr16xUeHm41u/W///2v/vnnH02ZMkUvvfSSKlas+EQvzHpYiRIl1KFDB4WEhFi27dixQ4MGDVKLFi1UpUoV2dvb63//+5/VcXZ2dkpMTLTa9txzz+n48eMqW7Zssl82Nin/74Gfn5/++9//JgtBH+bp6SkvLy+dOnUqWd+lSpWSJFWqVEm//fabbt++bTnu4TC2cOHCunHjhm7evGnZdv+FWum5DgcHBwUGBurTTz/V5s2btWvXLh0+fFiSlDdv3mT3Rrp3r99++219//33eu+99zR//nxJ916QVaBAAVWvXj3ZvQkJCdHOnTtVtWpVhYeHp6m+atWq6a+//tIff/zxyPt5+/ZtnTx5Un5+fqnedwAAAADPBl6aBQDIUVavXq2rV6+qd+/ecnV1tdrXrl07LViwQG+//XaGz5M/f361bt1ao0aN0rFjxyzLF0j3XnKUN29effbZZ3r77bd15MgRjR8/PsPnlKTBgweratWq2rt3r2rVqqVy5cppyZIlqlWrlmJiYjRs2LBksyZ9fHwUGRmpgIAA2dvbq0CBAho9erReffVVlSxZUu3bt5eNjY0OHTqkI0eOaMKECSmeu0GDBoqNjdXvv/+uqlWrplrn2LFjNWjQILm6uqpZs2aKj4/X3r17dfXqVQUFBalz58768MMP1adPH4WEhOjMmTOaOnWqVR916tSRo6OjPvjgAw0aNEi7d+9WWFiYVZvHXUdYWJgSExMtfS1dulQODg7y9va23JutW7eqY8eOsre3V6FChTRkyBA1b95c5cuX19WrV7Vp0yZVqlRJkrRq1Sqr2a2nT5/WvHnz9Nprr8nLy0vHjx/XiRMn1L179zTVV69ePb388stq166dpk+frrJly+q///2vTCaTmjVrJuleEG1vb/9ESy4AAAAAePqY4QoAyFEWLFigRo0aJQtbpXuB6969e5OtyfmkunTpokOHDumll15SyZIlLdsLFy6ssLAwffPNN6pcubKmTJmSLEx8UpUrV1aTJk00evRoSfeu9+rVq3ruuefUrVs3DRo0SB4eHlbHTJs2TRs3blSJEiUssySbNm2q1atX66efflLt2rX1wgsvaMaMGZYgMiUFCxZUmzZtLGvXpuatt97Sl19+qYULF8rX11f16tVTWFiYZYark5OTfvzxRx0+fFh+fn768MMP9dFHH1n14e7urqVLl2rt2rXy9fXV119/rdDQUKs2j7sONzc3zZ8/XwEBAapWrZp+/vln/fjjjypYsKAkady4cTpz5ozKlCljeWQ/MTFRAwYMUKVKldSsWTOVL19ec+bMkZQ8cHV0dNR///tftWvXTuXLl1ffvn01YMAA9evXL833+bvvvlPt2rXVqVMnVa5cWe+//77VrNuvv/5aXbp0kaOj42PvOwAAAICsZzI/7rlAAACA/++3335T48aNdfLkSTk5ORna95kzZ1SqVCkdOHBANWrUMLRvI+zfv1+vvPKKLl++LDs7u6dyzv/973+qUKGC9u7dawmrAQAAADzbmOEKAADSrFq1avroo490+vTprC7lqbt7964+++yzpxa2SvdC6Dlz5hC2AgAAANkIM1wBAMAz4Vmf4QoAAAAAaUHgCgAAAAAAAAAGYUkBAAAAAAAAADAIgSsAAAAAAAAAGITAFQAAAAAAAAAMQuAKAAAAAAAAAAYhcAUAAAAAAAAAgxC4AgAAAAAAAIBBCFwBAAAAAAAAwCAErgAAAAAAAABgkP8H5wq5TazU+vsAAAAASUVORK5CYII=",
       "text/plain": [
        "<Figure size 1200x800 with 1 Axes>"
       ]
@@ -489,13 +489,13 @@
     "pivot_df.plot(kind='bar', ax=ax, color=colors)\n",
     "\n",
     "ax.set_title('TTFT vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n",
-    "ax.set_xlabel('Arrival Rate')\n",
+    "ax.set_xlabel('Arrival Rate (requests/sec)')\n",
     "ax.set_ylabel('TTFT (ms)')\n",
     "ax.grid(True)\n",
     "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
     "\n",
     "# Save the plot as a PDF\n",
-    "plt.savefig('/usr/FlexFlow/wildchat/ttft_vs_arrival_rate.pdf')\n",
+    "plt.savefig('/usr/FlexFlow/benchmarking/ttft_vs_arrival_rate.pdf', bbox_inches='tight')\n",
     "\n",
     "plt.show()\n"
    ]
@@ -740,13 +740,13 @@
     "pivot_df.plot(kind='bar', ax=ax, color=colors)\n",
     "\n",
     "ax.set_title('Queueing Time vs Arrival Rate for Different Models and Batch Sizes\\nLLM: LLAMA-3.1-70B-Instruct')\n",
-    "ax.set_xlabel('Arrival Rate')\n",
+    "ax.set_xlabel('Arrival Rate (requests/sec)')\n",
     "ax.set_ylabel('Queueing Time (sec)')\n",
     "ax.grid(True)\n",
     "ax.legend(title='Model and Batch Size', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
     "\n",
     "# Save the plot as a PDF\n",
-    "plt.savefig('/usr/FlexFlow/wildchat/queueing_time_vs_arrival_rate.pdf')\n",
+    "plt.savefig('/usr/FlexFlow/benchmarking/queueing_time_vs_arrival_rate.pdf', bbox_inches='tight')\n",
     "\n",
     "plt.show()"
    ]
diff --git a/benchmarking/queueing_time_vs_arrival_rate.pdf b/benchmarking/queueing_time_vs_arrival_rate.pdf
index e77da10bad2cce12d587cb6ea92c4d3eada9af5d..a552ebceae828b0ead4e669f2c27f21348b35997 100644
GIT binary patch
delta 4353
zcmZuxc_37K8;)iyV;Ot4IGU1>nKS2{S+Y~s5SmOXne4KKvUD{W*($`8hT__HvgD?;
zlC<c?kVK>_p`^MgrRbZfd%wQS_xtC(zvsN?dEe*#J-;)5TqrqHsJ=@GbzuKP9J^b{
zmf7k7m957{+mEhF8Srjbr-uak8=F~#OQ_r<_irICHmjXEPYN3fsBOs{Zm;?h74=j+
zq~<_dG#>FlH7|HQKa)JPu)fvV{ssEc(Ic#r(Q-Dz=i#04uEdS2hc^}|+-jYPj3<ca
z>J93-*v+gK*R~v_&*Xt3tNNeX*7tT6pJz^RGouw^{gKqQmV@qftSvHe52ECzXW#kb
zsVKP&=>|pqi*?fyqJ!2o*Vvxr8J$H}GY2<l=pm1`8mVwKS~Thx9a;C&tVN_oyK$G_
zw#C$%T~u~D;aPKmo1^|<CVRMIoc@usZl$R8o+(*zR)UNp_F_D+7F$9y))q!Hqs`^b
zv9{wa;-eO))Wr_WN2@;cT2-GUQIJ}YTw=9(tPj10D#PmsuF6QgUfge$3Yc@1(=}D9
zo}2Lm^j2ud$7en14F@dcT5at$!xq)Emf!Hb)%3x6V6ySBC%e4n>g3qt-bS@%@3gmM
z$s^B_{xno^;p8SFBS~7OGLy{N9+;c)7zhitehiqqqe8Cladsu5Za8W>;o`bPD4_0!
z*WNbviFdJsEm>OM`<n{o$foytcjI)-LK0}4qU5i<PON*yG$Z0g3<6|xwbBx#>;JI1
zI$<bI($h!@&>hzah`gSNiFE|Evog|hokt6z3*Or%0n4z$^2~gE(b|D~p9X9cXw@s{
zBI({XtO<9z-mw#VCw+{9HYRVl$@;UW=69{iEfL$+1so%Ze<YfuCm!Jnx38~#NnG7L
zx-I1d*Mf<Q_qf$=rS6R9gtpt@#g7iJSM1NJ@=x)EKD-HDJO&v#P7J1uCi*@<Q}WH2
zZ<v0Jcyr>WQfA3xs^z|#zgQM%omkw!)K~z&WKsC}GRKtj(t5bk<5efOdzELD$?60u
z+1nL7^R|hd?TpzQR63?=7-z&+LwCWrSNVuZY(DE5DRxONE-lX_-)f#CC7BMmSNqtA
z9Lw>bzojKrG<;T($g%0Y*m>+^>0gQ^&pXPl`qnaUJI`SfN}J4+fb0d&^O0*-$V_wc
zGsce#Ur-gL(xS12%$*(U*HA0+%{pgWWkdRij`NI?ag>mW>H5K)J>(ndQ!zN_t?QMT
zx&6CUg#G?1J3UvZljGSrds!Cu5(*)5tMn~L5z(`pxYF&c`(0$N{i*)vM3IKtORgJx
zF8<CW`J+{kqbtHodQi6d`^_<VFOUAV_D1^axMDknDs~I8(Bicg+;PJWdn#J|K+qet
zfDq;E3C_2vd?&1JT3TL{xguJAdSHb7_r{Jr!PEY<(^{#P!~U9UTg*F9hJVbZZFFvS
zjL3a<aJ95z-(|(dNsecP+ki*K`5j8P^n&u{m|WUO@X&XE)_Oa4(1Uby=OB~?Bi;JW
z-J&2Cf0Q>X_Ue-q^Q4F<OnuEqI!Uoso_`-2txCNb)>*f<8l&ictQ*@}zi9!*KauX@
zkIO|N?jhE7Ww@*qKUI+AsCl3MiGvBV#aUe}NDgZpRXXYLW@uhb9M;QaXNVA(hJI?K
zR`2cw&)qp@@)BLYuRJ{7LV5>`4KZf6e)YS^IPEt5b$5kT&#vK;2(+8#W6fbb%B#ic
zx<g*yF7J8cae}jJ@86R5*Z8*etp>THg!iKIxD3m#y79QA7jh6IFiSZRT_-ClU(5&N
zSpQ~fhuprKkovAui|P9)$@1USy{p|Wt+gt$6&-N^-;clWjf=Wg_4)JNqQXsZWj!9?
z+%zx<J=pXmZI{v6XMtJe)>mU&z8tzTbIA17>bIz!rnQ%_=QJ?&^Yg(UKFvliwzwan
z7&>BCM)titu^1p#=ojnlH)NRT)cm^j)1TCeJ9V<1B`79pFl!ZRJEgLoS|a0Wtk>sa
zdF=R^nSK5pWDAeLvpRX5^O2_H7cI}}@#D%mTX$=fdp6y!bnppJDa}4>-a$XXSXbTb
zcFP^1QB#V!^4MuQwyxhb@qS@4=jBvkw)SLZ!|2?gS|M!UtvyMdnaot;Vj8lyCEhP8
zC!Lzo%4U{Lf(9Yt_F)BO_o~OzHPCdifXco5Xanl8DV6yefdMRyHR=XThgdh?X$9lX
zmByGyHZAZRciXH&kdkq8<z{RHyeT79%~xBQkuWDeTfyUy${Gt)4E$e?{OBrcvFhsq
zSjN$OUP7><n_Z6kPDRqOeCFWRvSfO0k!Ue=*Zo{+-p=dYH(bx<AyJhcS=?e;9%nFg
za`?i-3lE1Ef};}0wYfPj@3%VWo1F=SvUHOQnA`9(l!y#<679L~M0{LOj6ZtN$nV^y
z%&%_@rClxz_g`5itrzuswWA8QH|-)@G0O0^#2|X|X_YqSn(2YcZfmA!sr*&b2Sz8q
z=zol*PMob;JcGv$<D<yrYWmmSb8d4^!$WvYw^yAr^NU_lF$?Fv_FcL5<>^b=)|A)u
ztlNj}<ua34vWM(%=KB*k<j|4DD+k}dsM+B;P4}J()>#pXR}ED<+TNCWujAZYh7<N_
zw$ld+_1YF^TziIjSj(&_6U~q=-&W{CL`bnp8HSzjy?dHjw=`4441%xQma%Xx!Bu}v
z$xEdl-ay{j$O<fBD5H?$SBex(3aj%=6NX3zhiIeWYUYWxvd}yIovW+caJlZ%SXCL<
zA;=gPn-X_8Nx527fuw41dY|9htTbrrpAHXizvbY}N;3<uj0;aQXSrml)}D&uh2d`-
zC%U1p?M_FsLLWv2{r&vfu{P^@_0cxtJ5OXicP+by?h?Cgx^9)E`KcRv*=M7gKP=4t
z7X9JXg53UA_dVRNZ$6Ca&dC2p`Ld%tU|t_yd7v$3b8py+GbSspNNv!tcr$kpLqqEY
zJ)`Jx0vi0nwrRYV9Q>-m9F#Va+)z~eq@yh3M2O`h&S$hw@L^2AJO671XKCG#j#@t0
zF~Bs5`_g!^b+^Gt%{lTTiRLYKmxsyS2Ud#ITzvd4j$yu`Za(g`h2;$l*3OO3PH^V2
z>wc8VU;e-%O~mPQx<h`!pjA4p9y!vq-bFcq0Qep^N~nFkyo}j0Sd(A3;?+%8UkSg^
z-P&*N$e#Z;U<YxM%4(=>Sza`XY0~0c-g0ZjZFIRyZJUqX_p6HS+C3KdROXBcmY}-5
z6da0)_PlqxGjX|$ijw~>Wd$@*YF4r?$*-%#&R}$+{pILf+1OT-Cy3*C@rBwQ()!s~
zHq=&Q4x;sDsjdu#vW|Rn&zP>=USaUSNW-_~KFQ5O?5h$hg+WTwumm1hPJ|(LkLoI>
zgL7-`a!~S>aiS9C`uyXoF$U4m9X{VkC1+8U8!Z)V@3gyK^1jQ6v$L{EH^2s~+6P@S
zB3rm7A7xeUO?`Q?I;X75Ro|ie{mAyZ7s{gtY2?1Avnd|Jna(U4BVPK1VT<Rd$9uJd
zoF!I)Nn4=90d1Iw7C8{5a=h73`J}ES)A8X0UAHgza-RX+`e;JlD$~YX|MK0J)T3Xm
z6SWw4a;h~}%Ry`=O~k89e`i!>W8O0L&5=2&$LW`)I(xMwr95&)YQp-{I6#9Mt`zGb
zs?u8@wMk?D1?6{FcicMJ`tU)VbKHwvuQkPPsIC2UQ0T7OE)R@Ph<8L}xDO+k7qOq{
ztFR^Y``IxD@=YJ~j|j0v@$&5B1_#*;-jlvsn@DmaEVI~Uk3)L_Lr-qIXmy;pce@YO
z)iur%e-XA_Ie|7FreYSrDN?VKtb><*IU=peRwrn)Rq@vBE>NBwiEm}gK=Mr%grhue
zSib2gNDyM@zzS>}G{#<!D6m!G!|Vz(Np=~WA_TF)6}();Ls$wQ5yXh_$KYdq!b1Ue
zCD~FGgWm{{A%Oj!yh#Rw4+iid06r7|K?uO_YuZF<5Mk~E5D*{$E&x6pK!`u02mpcj
z5e6UObHFDAK;VCHK0SaS|LnpTUOoUq`mu`y5b{5ff4C11r|@?ed}LH8fKY$D*$<Ei
zKcWKw=|@#Ld@Mlvr|kE(Nl;U$DOE@aW>=aepvgo8hS{LGq7cHiHYfN|$-Hv>1%v>Y
z?*T|6Lp=JW7@0@E1f%j)3t$jW+R{7(VUL(^5Jt#sRSRt<Bv26ni2@i*1qHf*A^xbA
zu!H2Kc?5_+=8f)`JOV_7d86Vj=1-{v2>CyW6mY2o0z`!b2w?*G2jTzl!9<8R`d<sf
z2>*<JVoZJrQ%Fmd5D=oE5(EYX3L^+f5Tn8Z7>P&{5JEyoOo4Hc$byPV6uw7)cFm8-
zAm4*aFqra-y6<^ZSRjuA^4-i|%>O!#K!F7nQ%JDDWT*tb;rZ+5w^Sl^sb>gOm?$uQ
zDnj}-k3WAZPu>zjkU#)`P2>RtBL4>eQWzv4uz>9eB<fPdAc0C0xH2FJBh00QJo5{D
z%$xMm@q-{mzzZOeNEFB;LQ53$gb-osze1S*z34=Q@-O%kBU1#LfrtbFB@jXsm>@(Z
z3-Cb#*I1PR!UO>agJc3kV(VI|^G+Y{5*iu_{P>;WZ9{xR0ltxWe^e6_OhhCjJTf}m
T#}@;VK?)hOX3ZuiE6jfYXV-uO

delta 4014
zcmZWpc_38#7k@liW~3t78Cj|!b7$_IJ2S!~Srf8ETCJgMlMs)~NcJo@@+e!@ED_Qp
ziHba>1|dm`h(vxyipp>1)qAhZ_pfu$J>PTA=kxuX6P?1dl)zK<fQRo|!xR~P%wvKa
zzADv|E-)>6LvS<v6c9+yl6F{^oIJF%jovP8+SVN4jP~(+@2C)`Tnn*3wK6laqb2c)
zx#VnNTYT<yB;ogY-^ONFif6~o5r-7ZWKj!VlkoJjYQCmtV+zdqOdW5&HJ1=XbN|e@
zFfi@7q}<|SEYsNpw&~to%I4b<s%092XvoDH<wfs9gGdDxuknIwn2ST`(c&bdkPgM<
zApe*OV=bv}^OrTgxuM_hY+6X{?VGbN{j=y*NAic;3*igeLNG;M3JTXxdf$qjsN;R@
zq@~BG9yUEj*J?Zva5>%isdJIPtZZzdk+Pu3POA18KjWfS^1g)(!Hpxxnmv5l%4wsj
z;aFQbZeViM!zL;AwzNjLDnmY1nwV%Uaehp<s;9lDL`Ri3$w5Yi!Dr%34Ra|;{k6qh
z!QyhT8J{WLqBBC+<aaZNrSSQQWLZHN%&tgkHN3EfFH6gH!9d|rcKY)YmE&ic9dW`G
z|KX10)?W4UIav5pgl;jCRFo3-fIs5+j=%VYx;Oj&x+y+0RZ%ctDM)(K8NMC6m+7%Z
z-vMtE_P8xgSdY_)F>)ut_o8Z~ydDo-ro45d_Lu<Ul1`@K#|$jwYZCuh*hAJoyw2c%
z+Kb8q<w3Zj(({)Km^>2rH?(^h#yYU4VMOGsh^{SCj9;__&4Q3yG1*56qs~fuZ7KqW
zdK^;}g|d%3R;cf+%)2mGwLxxbJyz4GChTm%&WHO83$_m`4|IN{ZW1-c3fI?O=}hir
z98lb8iVZd5ZAyqvq~;$6UYt!iEV@yl-p2mem3&3U!`|llPlT@b)Jw;e7!sK&KB-?9
zYo`sj?^jdq-Lxq!6>*yozE>TG#fay=No2o!F{fnwU(8HR#cs{=g!7~;R&I$Lc)2C}
z)we~QMHMHR3KJF8Y3Ghlv*sW+8fTMj?xpF9>VO}Esrbf}MgGnmKxMdBq$^6Py9GP8
z*h%HLjUQ8Y*}P{$o?w+8XP2BRk+qarw;3^dt|pw`#V^IM{}9RSPCKYu<s7V&4q}Ep
zWyfq@YM=S)CtiPkEH{`}u4mTp#@<fqWOPcp7+=szt@}xX7koR4CK7k;t+W5^;efab
zUX0B;JRF_x!jS8kH%usbbfd$#?3!>yzpSs*hcq`K7<^$&uysH?CGTFJGT&e@5jKeA
zio5h~J$ygKP)xbHcg;5cD{6ELhTOAd!?dk7Tl`#iMh)t_$3>>e<L5-D2Zkx#LxB1t
z;!2Np4qKiwJB~%-F{%UAO+I6>arXuXm{&CPMWY*-I_AESaY6wnW@TdYJui(zcNUFh
z@g=9#T$sd`@ye<uOOJZ>1iJyv*+Qi3=CkX?oC$X5$d9WYuz_Ze37%<{O-fBOc*+^k
z#F)MP_9yU~GA@D}kM4h0eFE>Wg)=5DiKgvN*5?8OLj<c%ShtlPjb9StFrR1iPC&%1
z;0s>_(cNtKk#4=U?ANP)PZS?XHB6xEt$XRxvVCjTaQ%Xu5TuqlkEZYw5QQ%IL_kOG
za?34JgzBb@-*)`c>F4!IudV*hQ0d1<^g9|?2R=1hCgwLbm`q?Qb*}x^VCUQS<>{d>
zZC_8{8*ai88~etjGIu{XQ)=Q=(Be40PSsF8I^$61vhYI{x7J=|A_jX<Nkr99f$d!8
z`zO+DC^=0)Y&1z9c~H4F`k&A~<4S*vgk7k}%`B(!&W8_|X2NXe=hF=I<K0EEdIhur
z_g^p6H$DB*rXk<uOYZ2|enfjHwE1vgbIvKzAl}NHeeI`(1uhk9jw#ljeGF<}s|n?c
zg6cwb=kC!h(a-VDR=m7kCL0*)7i|V)tzt^E#XU=rJcG7N3)HYz#}x?VuCrav8ya$C
zP9CI}-_FEtG%ZiIuG&A=TSTJWwc6c<vn_9^I{ai%&AB2IzwYkd$=G6gxq#i<($?|E
z`eq9(%iUtE`}R*l-X&T}+o_4MWCK~oSPv=t-DRzAOzW(_SnGFNvm_-JLsh7jJ>*_L
zqa@yPB%rpp?gsvZi1u9@i*<F7UXrLt{|(iq^<Y3;_UJkJD6HZ^B*a547z4*=584z(
z+h^vt-#tVWOQ>pD77FGh;qUM>!TglNLaRaPjO|EmG+^+E9dJEPt<Y?HB__+j!VBM{
zufB&F$&_-f=<hbKv_jsDluF#sY_uVmOfs*So4up3W}Zzwo0|E0y|Hz(*rb{5_;a@>
zp+O5b9Kxt&v!W*A`j3ArlL(Y8HxkLh(`)8!bgpaF@_NnG^yeFYU0f&9?c7+A!%J0v
z6k4h2szs!Xo9`QXha<R4<RYgsw|@RQQ0OTyci#Nt*T?^}?5X*P^maOq+Ybg(j>tD0
zneq1CH{iuunEF~8viy3~^UJ`Y7s#~6)tvO4hQ+{@1KO{YoObldktEN(E-*dr)vV%V
z0ZBf{qWG)~H=RD&w;c#jzs7>k$Ja|FQn2?NU*3KFs(-n?INGPPIQZhlBPAs%FC`SH
z2bW%i0MXWEmfQZ2mO-@Rk7bVLRljl28@;N_cL5$zwr<%l1l!KV#PS>IUXsz+uSf0%
zgc;Ip#rfjHd1W(&k<Va_FxwHjj9yg525|270p+q*rA!AAiu8KxXP~xHa7<|US;;b)
zjj*)F1;6Fl)I`vxYtNJW?=zHi3sdr%Uh__~mf*FG%#zEG8j%&bt~`N-=odUHFRsiE
zjLn5fL?y9kx1XF&{Zr|7!!Kdjj19>MWtSuaA8DsfoArFVJo)MURR6NdR6yO`f=@5r
zw%ZN~fAsxOZ{EY&@2)v5IiFFKCy}M|`Q}>9Y-pBp3S+xQ?Ziu}T82mUG2eaO^TLCl
z<p)Ky=4km9ET<l1Vpo0M@duw7*I?bK+Muu_mR<NaQs>t$W;jj%mUAog>Ckh6gil|8
z*4rqDtt!JaGVYD$pPvj(f4o(Nle7DBYj<AW(I|mJwFk8WV8+(^)BNJg{Es38vJE4P
ze{WY!GVxMs#}7!iKHan}K~Hi$Xqes7_krK~pw~W&{`ga`w$&g}HT48^aZGR!feWR~
z^sW0YT6a|AFDmG-D0M|##;B<a9dazXHxAl9AHV8X=M%H1nWd0e-ktGIOR?&8=V&WW
zj4-xVLpYB25B^|&yF1!$Rw@$j=N(VI?_~Pn6@nA3KfOttO8+(9^{V;XE3JoHFri&G
z94gk^d(Zbv$_lN$9yq-cm{XyJ@=(Rl=U5Cc0M|@n2{_bddmpN|U93i1;|vdq*APQ{
z2_YzRgBUsj7@-z`7^+EZL>~fTHG-r#9&`&)tfmkIcu*E`BRUC<qAB1;bc7s^_CaFk
zNjL(1NEWLRpd8^9f~Yt`uq)jMgPx@63*ZR5a1;=OHqi{$;s{<if;WcXgCi258d1$^
zeuR!A6LA2}3P(7JBa^;ct^uyzI5Noj!VpgSIy$@J$k4Zc{x~xEyX8)IW&7dpUx7Y2
zGUc0n3P+}XvjT8r8ppyAf^jhL&BtjC2Y)N7Q{$iq^6-$*b2}qtV1Psh9JXO}bTF>o
zE<R44m>{~V8wS9^AgZnpAP|jxiF0t?ILzM$5hii^TJ58<ds^j#NbHVQ`yh62t9=x9
zmp}Q~A21s+0HDwGH}gVdbVXkkA#qiO;BQesb_YQm9)8M31Ze;l3NnSbIt~E8SN$O$
z0Fu6=K>niu0Lfg10gwu?pYo@|07xVK4dcgz5Wv+S0Fg*s6hLIQrhdqWaBPUmQ3&V!
z`yK$91hX};%Ezw4<s)-cv?`7Ua{1Wr%LN1`lM${WFohd2Oog~EQ;5)NaR3wu{s|qY
zEeZ|filY*t?+qgVA*Mpyh^a8h#S9IAxSFPssH=4Z&>$ig{WS6_m2>!~!IafSh-~-j
zghYS@txiY;$RL*p0hoJi02;tGB19qxBCFnxNF;Ig0O$P6412>+xi3>`TyZ3}Tvy*E
z5xM7;1cLv7L;hU=BnlU;L=d2Gp#wp#5hH>S*W?gEn8?K{TW~+*^QSv{>^$L0$7pHc
z2-Y6KuI%-MBUt(P_~Y17oJDHl?Z#ek-*zc`m!0%?r2GH9$%q`4VpLQNEDSOK2Z`PN
AdjJ3c

diff --git a/benchmarking/throughput_vs_tpot.pdf b/benchmarking/throughput_vs_tpot.pdf
index d17ec837758d25c127c21785e653e3386e33c13a..064bfb661cbdd158052970b88e25d7d7e6064c77 100644
GIT binary patch
delta 7168
zcmZWtbzD?y*A)~Ak(881LUKC55u}xpE<qY3g-ZzxM?ylnOOTWX8KfN=5d`TD;erZC
zr%3p5@At-inK}Q=?|GiH)?RzBz0dEQ;Ta<Q4AI-GL~LB*s#cay%w(LsxQw_!=NA|T
z193qSATDEycwr(-pwoETiA^yMyIE1FNTJWT>$jc$ZVqnEWj0pxoXGkPwIP*zDtCUR
z(qxP8X=b6awWX&!4%fX}%@xkx8}$;!kjmmBByqfYoc`sm@t2dm_L2=LV9xlf-uR*9
z<mqKV$}Q)Z^z3Y|dxP}k^ew%Ydu{xO0qWXC>Z!r#-DXZtpz*M=B$?Sanp<kEbUAP{
zVlB5_vU&hJVCRiZHxah+;U>Pd`18bj;LyHnRIp@=dM?jRTxGdw&JEbO>!~NsV65#9
z1QmfAsX>qS#3)!Ywpv@#hCXHeqRHQ%Y!c~aT=Sbdoj8>!VUm7PL@E}e4PP0!x#NCF
z5p&jb%<!8Dn9YCAxanPzNY^7sTI*Ypp;gn^G>!LeZ2aXsd4CX(m3n@5u-Vw`!8Fyh
zd^UzZUEFA%+wfy6ow~ww+t^uUL|4EwrSN)cL;2!_(rs^A^Syh_1>?^xI-a%o>vF%>
z21_Q$N6u@z>$HFyJU{qcQ_&@fLTk-mnKiP>0ZqOZ0qi>76qf`<9^&c7iVFkw?OQ6p
zle6(*#S>k*kK*|p2S#A*qzYyV&kCw9Js&G2{&q(rH*3@^%_#Y_J4eB`xIt*1dDx!o
zMv#tfIZ;=}ZM%S#M5+g$)*dp#1GGkUh)A`Be$te-2RGY2sONG0xi(VcIk79^7ci}G
zoR(Jmdlx9EofT*DJk+Cpe0G#GT&Ktkxa=M!GuaRUn#JJSyzxu{(ES>!U$UxK1%GMw
z6KxqD3_W<B&zm!RMIdAM+i&3Z%|vfO29x{UPrGr9kLupgCd>4@JbJVoD`>l}9Rzf_
zY=1ZCR>;>J;1c<8XVI#S(UQWL7nc9#MPlip#?@XRemN#!ic5Y3qTS-(_D4>xxwCIf
zRLFIDM0UHo>hg^=p=B=Lx1vh7$dzteUgvNho3G5)Q<HQ=-!YiDLOl*|)v_&awYXk$
z`8)e$iT&0q_db8ehxFlaflp~IW@-Emb(yR%CE10fA$~4zyMb5klNv+BR2E<^i}oPV
z`5O%ifZGFA2CoEd{J>net*H~qIfHu$<W7k4)W?PHHh!<Va`H?Wt?L|@89vqq(9+7A
zkZ@Ed%f)6Y4++X{@>$7b5)Vqn$(^oDw>LLo+?=KSu>L;7@z<FiH(`G&wLKO)pes2p
zIZpra{bRYMbI5VP+6VIz&AE*$qJoP6NAq?YaP^YEOxf{Dh*R?|Y1;1@I@whppGHz-
z@@SwZB}?!vd5Wl2+tOFWM)M6PcC*_h*!-J~Q#BQwZ;nwjA1sqS2KlGo(Z3vu-dK%X
zZu%V;Okc_|nX#*{{c5>7@iSRrqQllOP12GrrHmqKae>sX(M2doRzE}L8%M%;`-T9m
zKOiggQiNh3M@g%Pz)I&^S}9b&m(hD<7J$1Ow1B)R4W!Ug-K=Eij%AY?w<wFGTed^4
zt(h>3N4g<!gX&Zo75sk?aptt@6FpYhPL~yt&8inrd?R#~_XTI2=&Yis2*X61RwjS7
zFb}Ncj$b8Pw!MllO=d6MpuqypQCeSx7AQuwC0u#tNXbt(!eg-~=&uuM_yqB*h%AH~
z1R90S#H8pbJm}?dwRrWs?xc-}#xlCz;tGG-D)wz5sV!|N8BZDQy{FW?zFQF2(B097
zMLlJj3S8NvD{;p9t?o6~ojJh6Y-Te=H!%-P?jxyI^g3{yc*Vg!h2I}*Ip7UoM?f;1
zZ$-Olv+8S0R^G%Wqo4<6an;U~mXl0g!>e@#j;@!gjvs2)rCd@V%b~bIA>qwA{YJZ9
z72k1i%b1ga=ix9g+c~b@jBi}`Ypap3eR5XO=ymvX*ZA=3>G8DsYiEfj9b^1R&|_~`
zBFe7sjE%M$41%~I95O`HaFRP|CIOJ)Ul0u{6sqi-*Hlp(auL*Ze@Wjd=fUBQPk%vC
z>n$gw{IK)peeX%W1p4;6$%LbQc-6C{*^QhPK4Qc-7mWf911Y9pgN)BK6KkSO0{uHy
z%*5f$Cd2e;dX1-fPaY(D!A!@L3|B(f1osrugkJ1@`}D=x>8OvLF)^?-2iFg9iU<x0
zXu}R%as^jf(?pY&Y9(KO;XC>;sqiFv!9#OB*gyDF<p}TWRotR&_!ahgy8DiKSI9M$
z$3J8SrOV$W?tQDIr?k=bkRlFd9Lw=JYvYf6*pbw#(vEcf-I|p0fr#f!`mY?HkhTq9
ze@5^2w1n3h^R$&AHAE>sK7=J&1mI(*N>64QO3ZcS%%?I_ugYJ$Cb`?huGA~L)svu`
z&PQFJu-N*Q1yhdZ?OUHed|}kLVSW(vWjp_83V#msSX29C!H#*3`)0u*TLu<5`nmpT
zwxEAJxx)F&EWU9h*)SaWsj4#RWb{pzCSHDNQ?<pwP_0=6Ta)*$t~g>c31ADTN+1=s
z<mUPvA#ZCD2;ryuq}vNhFbcqhnG4<SF;EG`C8%*E4PTpNNK$!Jlwu;tA4^%{nk2hy
z>OYnKsBTl~Mr2IqQh?h|gqcG;iOAZ{N4Bg-HdNo+p0geI>Q^l>C0kS0!4^A5`EBdT
z*AO6|Bh07FuZ=#;q(21c=K$_tPqvH)o=ECVG3SscFx59E#yxt;z)#=bTl>@d?SmL6
z)%=I)rdz@tw+9#0te>H~>wFiz^WSwnmHZ>d8cl6GqsJHdCEQO5<7!vg+_m6m&wiWY
z=exGf{DB)JK0_<sUvCB77h{DK4Y2uGU{X8Qv&u<WR=QkSH5q7b;41-rQ@Mi}l5A6t
zt>;#`22Y`V&-fEOmY%eiA(S~>Z+GP;^e?;gu{^O>ba|XIdLm{`eZ>CwW%T|q-R9u1
zk0Jk?_mt7<y*m3=R^|`#W**d^&OpP%pLBQLOs>m^2e72dAi@pY;s;n6R4Q-xga&c?
zl#3PI3JZ%AtdR;!o3p(Q^kuqv;8SAgM#EYd&BKjj^9tJ!9i;`+q+xsIpcEbrnm(PM
z-{PDpNs}EvX7_ZxDHI!dgMK&_!%tZ|E>iy?pORGUbCwI9{IS#JMy454kHr*cb8^yH
z2YocTRVbeGi>n*e*~0Gr_{RAV-bfDrjr9a%8L-`V)ysn3vuLOQ^7t`|a3yOAksiYp
z)g17*@zko6FO}maN+ZfeYa=$)ji(0D=4{tuYtkx34MxVAG{)WEV`YV-TaZK9JC!Ad
zD|@Yhsb%vnp%Oh4uAW@<@V4{Eu74LduCg^W%j(hStm{N@J;NPZ5)s(jGiiRipn~EO
zVroWC`{ihewT=xiV#fPTVT)O=*{#j=MXIrR4#I1-A*$UX;imp&|HOE@bS-aiZ2;+D
zw1K#e)1i5eg4g*oe~U)>Nk&)m-n-pAr{1#pdwY`6a)ROoDkEg-I(j4=Dq^CQV#3^~
z#ra2X1^(dm5*bm*3x@o)e^uRh+{G(qeE;LaFaYXmya+7R=X5nJEZo9us*71^@ZQle
zX7F;z(<Nqdd+KC7yjt0$A{%Y;{@GbmZ{JSwebFC<Do2kkoWI41l@)f$tRLEBw(`zh
zL;H+O>D1@hAJ|q@?ND+sU?QrN`%YuRr>qMOe0cg77&b8~Ce;UEF*{|G^?gWYyYL(&
z<-MJw)FWUd>HU=by*nX#rQ_o0(Fu56o^ZXk2UTWkZq7Y%hK8~}zrRs#o_#(?DN#O0
zSr$D<5j|MRACoB4-h4;(-Jx8PWTc>a+#s@}l~-1t%Ip`j_SVRfKTkz%ogz%~=`LO&
z?t>8q;au^_gGH#*X8L`;Q)RZnhpjQzRlpo1CMfKlcOVe~WlY(wRDYqr!J6r0mv_zN
zWo2z2_vafX54RQqri4}1$wc;8MHj2?_V&{7(|==ZoTOYh9Wl2-2WKnxg!)RS(tub)
zg@3hNdO+8$ASIiXIbvA=Cf8u$Ze=etv5#ctVF1m)$U0Vwh6OQA@bynxCsXczYyzqr
z=UAlLzP;pltHilCDno2J-1*$#+k2|GsVlq71%bYXuQT$^`Iw;xI0O2Ql;IZn@9Psb
z1!lPH(Vr1mkIC}vddOt3>(hX~<XDH?5#s8YoKCKsTXBX{AG^w(s;0juQKwqi#R-#r
zZzO3;{k6?uRx}wi<5XL?66+!90Ko&)a<x)J4xSHc*`504#66gh&3=vb05bya@T8F;
z0-uY*v49fFU<`DOyFb<rPNGOYG-)3=)Q3XL$MyBits7Db%idP}kT98$w0ng6z`c9u
z!BFI-bl;gIIvOvV!?ha{omxnnSzJzWCm~H<L@8}5LEStLDMQL)&B^N;(+luGHEv3{
z8z*Y<E#J?qiSqv1S=|4O6EbAfD%)Bi>XEdj$@u>*74j{~Gw$pcJ<CejeD-{Z!DP0<
z09l^~hC%-hc<{4upz2Y_FcBhvPpPNc^tz-mUpOnZUD2upm3r3w7mvuk#>%O>(b3}>
z?%dcPQ*NrT{+HDOTTS_B6unxhVO%t1kZJ`?ynU3qT!!c{5VeTbG8z6TC%)q9DtM5T
zvK)!|b8UM~Lzo`_m(l&h<|c9jBr}cetaF&E`_x3igGN4`^@~e7=LZT!pH;WWIj=K(
zkBZkTQ`yU@SsEp#$5-n`VARy$kU`~!IOPzPy$>7=z>JH~ykY9HyknJ?vdZ3=iOUA+
z*=UcP__@bNQ*GCBKWeQTq%LboV_E6hExDqSy&Xz(Ul;nkR_lji*br^hD(^r-LI<%+
z>#56@$u4s#4(Za~ke&P4mN|Idu@S*vK?1H^+zH`n)L5Z5RXNy@z==s8CoT%{u!H;H
zolGwvugdpg6q^-IQBrny6_Xkx^#hZVottVJ#&++E(WX3E5-a8w-N)wvombb7=50cU
z;%#k1>`{3pyu;y=?7eTJX~-Q`h&m=@ob$DdLx&;_mMxjEmXe@FnreEic=%=Vgk|>u
z9!(Y~&0yOl`+~eN-=D)ScO%27t}?bVI)|<UF9bYV&;#geL(wX*C9$Agzdt1>^BoiJ
zPAdTXGnOp9GR%(E*J6zRyt$kjbE9>8u%vq#JGB>cgM^MR*}JwS-+aL0ipb5l?qzw^
z=$OI6#LCrR$B(xAB9?BUd-ZXYDG66gqOOxi-1->RziE(_tL)ydH#=NVo=CCwR*~PX
z6xb+f5E{jXrnpUIm1g&C<z3y0oDUOJNr?LVbqvjK7g$jK19eBE=<XGNce#)5UA!E)
zeWTXJVD=u%8ewJD)*{<t>A;R|v;A=0VF6>coYmTZ;Lk=|tdsW-1jEm_SBqO@UpL1}
zJbYdz>An|J{Dq9;R}kl^3CJ(w6D##D7LYUgihAs?DQ(}=_`>9HB~iA?4;t}mMRbVv
zfHLKiA-LGS#4IT1vT}5oMuo45m(rYbFzt5z1KxBS6DtE@1$Rey1WzO~jcVM)Q*4hr
zu$e)D&MFnw_I7u%zI4B1zoBL-^~Xy?vRm%gCUw5Q<h0se++suCkqU1%NP`zy^#VR>
zWH+Cj@L3Tp6yzG^Y$H9wv^Kl*bvc0hlWRl7(an|0B)gfucKka`qkSf#v`0pbokfRh
zy2A0zg<|#9ElbYpjNj*~jLNjnavGM#h!dNujUxQ|P2jFa$~|%FA&<^Jh%v1Bhyh03
zC-Ux9ddik(V@@u=FpZ-<-^7cRoq*BF6B>t#d@12{fTCF4Vs}O4z=ZUkY6ByceQp~H
zv;69+1kO0hw`tdeiE?ht*RQ7cm{}XA#&x40%DAW}R}o*+Hh{*b3Eru=ZT;d^v{T}m
zY45PV6VE;`dnH}p%$VxETd>45I`JH~I#T8DL_f0-Jp1wF^^LwTSsyg*K7d1=Lg+w+
zfwJ1XW$O0@L8^Q2{HU1KzwYOH;N>thBVKEvq5PUSg?BGvpN?HGm^H;|a{gusJ9S8T
z6`jH!@duHM6t~Sm81}Yo)*|9E&RK$clw25;NexI06wn?Qiad0mkY@LEL-Xa1SQyL9
z%DcQds1%;q;T15_I_`y(0;CXwkPh|FA(Yc`(_N>ytL;Ssn4vBKHeG~;C?`H``io6A
zHH)=$rK0K=QH|j}vU#vVdSv(B8LnLM=c$rT9@F&J@i&8?*oNJ`Sp*A}tb@~=!>;7X
zAA7+h9)P-m>`R<*|C=wbnfK-gjM}AG`>i7LfAdZW<u}t&2sbjY0&t&4UE7=wm8SjW
zbpFU|NUcw_D&rch9+e()QxLBaXL)CM*lAD7HsrzeEV_@6tn>YY9Nkd62D?jU;H<bw
zeJEStQ;KXni?p|wheLq*iBLRTW-qjcLMe@^Q#RO2X3s5V_jbgA2N%n@5w}Z0s*%AM
zg{HC5^;?0MX+e~A86XimG)I-k<-PVi7#qu)Tpgf2u6wyVeeRcFY5c0Tnh!LF5-S^0
zEBIl8>XlU_7bd6dz&Le}NaIOhq*%FOTomw(9s8TC+5>a`wmhk-<c+Ste7zT0dVmwt
z1{FrO(^cT>wMeQk5~Dr#;y=<d3{7j0p2bTwiWO<8>%3FeKyy;v`Z~Mt#Cmt7uG4X7
z&C&#MBOY%-(u_fm^*C`3Cb1dK*VcMQKxg6~N1hzu%9lpD+s`rix-g^9%P0?N9jhKI
zvmG8pF^C*4=?iyZ<&0cQKf5a5Cv}?g(oH+QWlAbx@43np#)a8+6U1PZk?fj(QhJ1O
zJBGi}gA4(lZAyW(N?ZNEPAw<jnug(2K9IxX4azO+SJGcE`>bYgLxkrA@`DjX=P$*m
z+QFGPU6P8~893Fo7aRpl*(F8Co+mjFOH+?r3vC>2Jr)a+v-n9FC7%6G#`L8E!<&NA
z6cLKiEU{oI#oo-l&Pz8`+Fg3-U$2Rj-aJTIP*w(%y1a@pok_x+pB;KVpu@po34QTq
zcbjB`F%8qgmuJ1S)_CZA!}Ar5{`8*Qvbl8pu6<4;O;Dnx{b>8R^;NAz1mCqb3R)O@
zae!d__x#3qv!2Enhus@B^sbdBrr|mrI{KyLZpANK`vR$j`urz4JUCr(A+LnDXI~ZJ
zB`xg$(g*K9Z|rbK{4g2Vo2X5>{+N?z6rt^@ukOAZC>6L|P3Qg7?kSXQSPN=5WAdq_
zzj4j^aQ0zA)9;<?<Fn!8Lu81o^Ia*oE&JNJiep|6Ns00q&3N+yc!mHl;-G)}Yd?Ln
zVZ7q8TQUWANm=aThUfN9->1NEsxPziojb`$;Twt}156hBJeOc6+g<*FBo%X*N5Goj
z+1_UQ@utKsr?H@B^z3gUH7clZd<G*sF%n(%p7Gv85+no$g~k)xC|bkMKdb)r<8tDn
z`1=n8gaAPa*Pw8MH6#*AxQ0d%tf9c@_zD|-Mkx5g76^_4A>x17@Bv5^0X!IlBnS-*
zg}^RgK)`S$0VNm(g<L!b0ih5CYtZx0$>*wpf9i+8(F7zA6pVlb3I-97K%sB~5-0*p
zAOwm+{FCFH9|VHHz+eIr7!*lB0z*IuNMI-w0SO!o{U^!a9B?R#fCG*IU<4#^6#60w
z7=<Rt0vLsaT`VgY1x64$0|dHw?!5f}5(50!ejqp+g}zWH5CTe|7=(Z$F4`1?KoHaz
zggCb!0UjFlKf(N+e<TD!a1aRxU(5^$i2`3#2SPyz&5eQ*;-TP#!JtrRfPfec0bfi%
z2n~n)Pt<=E6pcU-o;&ZV3yQ%YFo6ldAlSdV1o%()gF)!?Uib_7za;^m-+M7YFc?Ol
z7z~D^FJ=Y|IcL6z2Z3Dh(Ru#C5GeAZTfq=GK}Ucg2;{}OgCR)t#WI1R=ZF`H&)btg
z2owPa0R#t8po;?ueBPb}>R|sGApgu34E{g8{Z}f%Fa(O=UO4DKa~Amb@CCymP=eZ>
z;}Ix1A9@4<BESTb5R8C=2(pTRUL0@$2#i1w)*T4~6A&YzPy!`LIEsJ-i6ry~5{0~&
Zeh30`Zp`>AP734*Fa;l<yt)F#{{g@z0dW8T

delta 7175
zcmZWuby$?|(iTya5F`bWE|=U6K#`VII+Skd6oK^>Nr^>RI;E9vk#3}<1f&r~k(34z
z{MO%hzVn^K?*6ma%)HM%bI(0<J=eR9GbE!kBxPhId=z48R+e^VGS1$d4>-YuF9Z&Q
zio(&HFa$yrfrg<FoJO2@ZW2piz-SsbqY#5FEnUdI;9&FpWD#Z8Xy1RQt;msta?P8)
z>XwK~<?M)|$YhJ(NxErt|N8>riVAIB-5Lkan=BFNh=#Pw{LanIe@@zGPwq)Itex(C
zpLIO_v0;~A2P_<vrmda&&jTI>$0W_btkUn@TCH(gGoI7m;FC$Tz(Y9DD$O6BF}LZp
z>Gf93<%quI(ClrcG4DhYXKF4}ltfz0U1gi$cHR+ViUe)3X_Icg#+CUx(x553XvvZ>
zg}aISM@`3r8y1g`W$@&6$FPU~54hVPAio&FCoEaBCqr>J18-{H7ro%*z~rXlCAIl<
z?J4Uf7YSSarIr9orL^)gz-#klgKy}M#R2Q7P(3TTku+s#Rs7VW&Ce_tzk^!eBkRY<
zn*~}GzB?rV+WEBU*#4lu_WSI79SWE`nNFFFYBWobt@?7+Gvq2LH)ym6-JWJXqWhAG
z@k3iq&b8irhODYN;YG_OYqWHEg!B*X+PK^Sv<N7*yN8$7<MM|?0M^#ybWNR$+}Sav
z=BJ=aikYD%nnnvOqZsnO9cbrPcdB$5p$Jx5w}3>bx9P?*eH<{)8D7dlD4bWfNq|fV
z$HN&ei52ez=^0E#jKs7(NiAZ~6=caBRTo+O`ZhfGW{`#ktIoes$+4pEnA7yeMGr70
z9a@kM5a92Bzh0D_Eo;QxaPD`v$(%>Cq_1Dy$@UK*9q(^BS9u)OEIGddq4LSTysQBE
zX*J9%LG#B#ZT#}8@-4=hi|JSHKi^SOD2+X6J#A)Q-}k{@Wj2q|qe>RG_Ybz8lGR$q
zT!}=1N6+7wnIF7Ek7WJ2?}Jwi^|hA_;O{eh^%+RsA7bzk%x-RR<7vwcbi8x-Ltmir
z?SRM_*6u6psV|NBI!QywA1r`8*0(^2fpa-kbW;*98l&GnqULIJJ-lfH4!T5tWF0}C
za&@teERaIY{W1oWgIAAkw5HB1?q=_Q(8kL4<WgS+zh7b=344hRHJbxtwkv2VbZq&C
z_yZb1$`E9x1Mh-*Czf5zkeOhAr8!C^V@US7x<b?-Q6k{woDWr2XGT1KZwGB}8-HMJ
z2}!2R=na`J3)&TYIn*o)9q{tDzFu@X>&WY(%P%;$FDn-MROj#gURq7~bMh<Kcjt7+
zT~bjxG6VbT*Qzhw>Gqmw`JTtI2k<zTF@dfG0N~7?Y2AU5l;x-U?WNB+%9z6Rapvw5
zCmvP&T`#RtAB%f2;~Vyk56ah{U+%i_Vy-mjF4rTez_%UP!G>uYpBjLn__2F&h~lJF
zPmkyq{~p_vtxx%%K^|{QtqxA|-(a{{%U<k}RMP8ib%C5ABzB$G0ipM4+^br#{H+`n
z0DE{pHYBwBecerAvNBUJN87!%^k0kxZ{cJGi7qy_qxx2FsESt9=gsc8H<1Upz@MF(
zDR^_lO=oAJ1C33;Dp8!XxR~S7mR&Awuw-OjTuS>|nuduU^d`WQZsSwQVG0kLhrK*A
zU!^}`$KRGBt8bw>2Dkr(uKw_u?Y4YsI^bS;BOmX8nn?DSOblYI#^0VMd%1KqKRF<R
zFP5guF#OH@NEpmCrN{z0RW`174zvH+Ki~1aw}%*&X>0vyNRvR@^IqGl&qF`DKeWld
zGbnpuSHdtb1{%ua&CIEIgAccD@HplEhrbjZ*@7p7hG#6NUAEYf?b5nc;WP@_Kmv)S
zM?fHm^I}n-_bSH`Jm<iPO6$I7b293%dsp%qI%}V!@Z)X6bUb5X@{;|%l~s;baOD2E
z$IVQ;&!EePnm%e%zg_rJ-g{u88(S{qxj07-!FLON_mA2)ojK|<(hqhDSpd($xu@^`
zlpODF%(jO_9k6BeJDD)ccq$j2117PCRtZ;MspCcD)E;%5EF{l~Au3;L^61*GF6*kQ
zkCfY~DjBB9YeW`rcxAt|j-f3UZvO*0RTwQ#vF%;j6;X7r_=+<~&!wpk+zXZQ-l~Bt
z#gAfZhg6j-V<%5rIw$-5gn|_q2PJy1HO{b!mIrt0NFSTGmxA@92ldTz0r=ZY?8%SK
z1DfBEg8+3Ym~q|jyVf$Dw*K;D(KO8CR%)kZ+4?0tddBe#`=-{GUgn40H|&KIEN;_L
z<oQ(4JuWx2(2&=RIO*BQO*G2RQIvNl%X?@Wl8XFra$26DRFOYoTDeS<+kUby&i^j6
zn1ny6WQa<(JY&10iuAJ`DPWy#e5vw!W)ZJQb#2H?b@qzy3$<*97p~=OTiFM_heFtG
zyR5r!qP=V{1yjo9)Z~s;tYk|1hZJRX!(ll|Y@P0Ax|R8fx4zzMVB(NP#h%>so;j#m
zoJjnfjBng*o&!#H4?lmI9ViTW8|CcW9|_s^Y(#w!q5WR>DlE}|T?3#=?XbFJxmZ|i
z+;=Q)txnyp)X&2t!<tk6Ce*p#Vr1Epx-KcpU<jkLD-aYE87`;KI5ON*ftOBQ2v(67
z(0UPis6F)zLr+TOCsMC36=%*@5+)j)$c(k1%}u-_{M&8%#+!$ZxYf4c8S=!&ik0M|
zFPw$1AdM{Z4zgRq6|KPUR;&3>u#%4UBQ{E3@8;&Omd!p5f429(Wo~N84$}QnXlytd
z?amj=Xr9AjFrSU8BF!4Fqz`M+gv^Tc^__Mt32izn`|`epbw7&VsSSqbgtj&;U+`Y&
z8oj;#N<foyVK6WC=he%J@d0824Qs&UH8Z;XOSLw`*IeCSV1on!0m)z4Yb-$(sQod9
zb2+I$InyL9y}g|jaRzhZ-KdFF30XVE4b03VgT4FdEwZ}97Rf9l{kX+<;KE5QOZHwt
zcIdRLb+dLAo~6X@N8$><=;Qvrnx=Oaf@@wpU{L)MuZTDwzNCRg)X{aa4fId|MCrgt
z`*f<_O`ZEq)Qbv0uK`sMI{Qp3bJ^=_rZ!VCP3}o!Sjp=$=PkM|ZPjoJ)@y-2?=J9j
zEe(21(lYMDkOqq#wKgxQ)2*l!42H=K`ap@4X<Ew(WU42DtU`rPdj!cU!t`|eUJdXw
zL~%{`Wf<%WOU-^kqnp2crWiGGt(ZRf@n!b=#_@6eMuN>B;AH<t`u;wj)-7%Hc7YPc
zX)_Y{-0OA3tV>n7XM81Vb|^i|mpDb8sE&~vZyF89&kNBNNkzcqi@EVN<AK70sm0{2
zr));8f~9J1eOB0SH|s08HQCEw(-zA&a^sOa)j2Ngpzm+86<8vb`^I41L0LlFajDDr
z1o_T?+`wP|^~RyuGU^(yckHXR+8(!ZgzJK8MuKBZ21ayq2sa7tAd#j}4YL{Drg_zB
z-SCo7hPog%cG)LAa`#58bK)63@C40;kW1*#=j%<c%InEd%yJKDKcH$+or?HKxDn(G
z9hY^qH){qh(Vo;kA{<}#i9wUBFpMNo=|MDLsErPfB}L9D*qtA5vH2+L#H>X<E8|-<
z`PZKkO}LFj#kRkXTDO1?AHxJ4?&;tbqp$w)+Up%gB-jU7>+WDgz^sLe_H_|?Su|LE
z>AFa*219p`g6fURJbRCoSNra)`QB+u%i8vx)@l>WAEn2>A=MejqO<h|;^K82fozkI
zv3OQYSNL^-ZbM4m!YTf4>M&lETZ9ge`cuR>V%$T`IIcQ!8m$w1RXt^8+^wUaX1KIH
zrX%vkmpA}#QLGcF<A}EyQ*L{6IC!)FTyKX+-yhMAy=jVbX|0BBld%kplbDB@JpN9Z
zNIoBk4A+IS?k>{ligVy;D?306m}0uYSh=So89(tG?K3c-xE`ydeJ<2%oCG$vKds?Z
zcn-e1VHR_|B2l!q@Me_ZIg(<v>c^@9sVlZ4v_z8>odjkg!CAk*8q9sC+brTdbKk4#
zNZR5zKJrQh>MCK?P3pDAu~IPhlot$T+q2E*?Pb=!jL9WB@!rBJCwPaj0V{by&+uj>
zecKMK@%&~{sbb0>V_fdUg_=_ZfGF}`UzgV4vT(vOri!VApi2W~5~lQ~K~y2f)zl3+
za&9K%pCnYZkAmZ#O^Oy;zOONxKyzN}_3kR^H%+|skeQLLM#XlM&e_+C%QR@Fq<p+(
zF?8o?k=vZ@6Lzx?*SDXy&;#PvD9)?3{0?kAh;VCBSs~eKp*}S~SUY`NdT`I=mmLj$
zZX;FoxuW+ItvjJ1m)&n&vSx6-&?Quu6v$Jp_n_XpFi0twU(*_fuW`E&PGN&N@BDhM
z&1cEA+e&mNk;8=c&4OIZT7KBuvZN}OH+(@mcg<AW=Rx+bDeBYMO=w;NcVrz}pn0a1
zNmWOtID%i%{XUY86xPVhnPBovDzBR$=f*bkeX5H)?{fz35^m0lVd5Nl1q?&SqL~C|
zn9oI3p_%Je12+diDT<-uY&F;P&mD(TYw6QueiC!AE!9xHZ7>x!lGSFA!@VaSpp&mm
zp)%`b=qI1^TEY~g0hD+o{wpz!Qp&@)oP<7}BPGKd(~z)%!qG*c53jSW6|C+R<|-IA
z<n<j)vpmS|oOm3jHJPndT;)v=w001a$vjBiDoVY2WD#MMt;J~Qhs~$p_T4Q=Vk&*a
zdQvdF$cpZIvDQ<rYuHfW=vE{X?!3U9T_MnX-EJ!xDCsG*_}3FtTs0$$I*!>U2tz;8
zZgli5%(!y6UbPx-V06)#Zekgk+bKp6w(Zty+tgugXkb0<-r5W62ssB^3#;Y%@chRk
zq`p{Rtf5~;-!D_g8ns`iV~rd8p2BupcfNnLE&OpyUpRbutkBYX)cMl8NXCQRxv*it
zSN!_K^r?uE+^f7!d1%#@awYvFVW(yaq1#E|ip2mH{z;Og4(SWLT<@9L;&N}YUC@~r
z&CF=n@-*M$R-bvm`CXOS_Pgc76&ogp^4QGO_or9R2Y9B4ntNI)`&ER`xkUA&Rnn}1
zygRO8q~ZxL6!@oBc8hAOp0VERW1jmh2;4VO2v-chau+IxoaTI1ypf4hr@eN&iE}f}
zYr2V3YSPNKKF+c71J35wYiJJNz6SL$$vYM9i^^IcmRrtPB+Qu0-d<RE0lnzLQFuet
zFkSInWpFBsRHnN>%K?u1d4xUW%P;YM?;O-M8InxR-R+teH<FaPAk3x3ZeJ9`N&!WN
zA^R2Pi@olsq^hKw`kLr_3~~a`aB6iKa?^!t6S9JKKGHm9Yu|7XJ0VSiz@wFSk*^a@
zKnR;ujVGz{`m~^ld(JhSV-;mA`A<A|Znow><CAeCqyKyn6v(KNg%?h3mC64pq>|$L
zca>`AxQ@w%HNZXu-%{r)3W;q4dYSKO^u}y|U;GA3O3)F#u^l<tCTKXV;@-7-r~64D
z;xjjRd9P->#g5-8#9m|R{6p%#hebF#w&|MG9Z7Y;B67WoPovBL*cIj~@i1KQ+Cr8d
zV=>~goM%_&e0Ne1<7Z9njc1N3M^{Khn-&^Q_~Le4hLO?glnVJCeTqF9K)-0tZ4tK8
zbR&KDJ*2HQi+{tSgR0oa*iPoaD^2mn`tw5Rbcqd*1_-7|W1VfvRo_nc^LV{t^5Eo(
z;X>u-X4ulO*p{!l?!lP_P98<^W}jMM>Gbc@tMix(R`>@6LBUrfE$sZ)^eu3ic@A!?
z=<c#6J$YSD=jsP|DjiQC+oGDcoh*H;>dt-<qVK*A>IpK#DpZ3U-<D?(OzD48BTPE}
zh`g9@Y+Pa{1HF7QH13{@n`>X+p~9)heu56Z6v+;-*|kiIuokw}t&_ZsWDXg_hmBv&
zX1-b@s*Ff)Dg4xvH~-x8K?RqRc+osgSSpBaU*l7$pvW@ZfSUqvU;xRCDfa~T1g1KC
zO7TCYA-|w=!$kSgZS%?iwTiiX*4Q{^vbBz+lvU0ru1#DlA`EGj#mDqUbV~^`gYiRs
zUzA2d;&U?T*O##&SLmrH!PQbAZ!opjA?+jIM3<HYy5d;dm(Rgn-amM!cE0Jg{;CKJ
z$VvdGzR*?(z2o@_@bMmvy1I)Hr;5S$a#1X0i<%4>NG!Do!*mOK#3elu>CEfBsif>l
zmA0a!kyk%zo0+=f-rkn@em}F-ehv8(ZQaeWx`tj7_`ow^ppx!|&h_NlJa;b0-L1GG
zpizB}ge3DvyTU~zQ=yg7H#Rcb5YxJ<<i}`Tzgz77(2T?ifEvfUuxQP0X0UWV>LoY-
z1%Vf&cWo>=aQX#0ThI<=(-=V|I)0~ag7hSfJ)R4BSwzRT{fcltWE<8lTx9G&r8*dS
zt2Bihf8KVn$nTZxXoF{sRD4#Fi3z=s><$Aq=2gz)q*|6@t2j!NkXYrw<stZ<*hLim
zTH&HqwTs3Rz=-;t>I`lotM_6<#I@_qsb1<PZDplldY5HJPG#dhSh-3V6IA_^qUwpt
zu4rgesJRKfn9A?->DjJJvFV-b^N)<QF;NM*r1$aRqDNJdercz?+}P&e;68i&n*D8@
z5+Z-Hdrya_&sZ-nR86D2tJ3+ierpvCu9|M$(<<8nU@X8=c2iVfU5euDONCh33d`0X
zpmn#`18fF7j>$c<oe^zV`tKCISdWr8SE)=Vx01GZ&Wkr}W)<hJD;?BT^i*0*5l&EE
z#Ln2)Y;OpBjBFZ9JWxv&sy!Hfr@L|zSl9FJ$u`^ia!`8}==dq9cK&X2O6c4VVXS;Q
zbz(Wd^=P(qbNPXU9NYeyL|U>F>g1OZQdpeN6RExuZMegCq;A5X(rbp}d{mrYd4RZ5
zw8!|$$zq-s6Wlb7*q+l=qyWO?#jROMSjJ9a-XCnAhW<hv9XQDGFZjy-d^r|+@8+<F
zW>!A`^_gWAZx5G}_{68--}J&}!f$%Z^JvEcPk(c!Be~xB>$d8CxYK>gdRg2Yp>2)R
z<^P;yr%N`_ujMNFSiX~#>9#H9>{Q6dT;65-Tg&bpQkCB(T90L}&-LC_EIusz^D9+$
zFyc5ZlJ=8H1YDBQKUvM-um@a3^TqPPs|B_el_g4P_0g$$GnZHNuEOaVK}TBm?yB+n
z#R2ps(T#HL9B!RaIwiFNYlG&=Px_t*mD(HqQfJ4S1iLv2mH*uGV@l`PyS^+@<MyY~
z$6VDY6=g4VCE<X>=F5CW-P6+JjZwmdo@R4ymIKxJ=P)<Grhey^uejN}eE-p1jv<6M
zyT%7*>Uxt#pA(~lm9d=-vihe%<a+A@JvL}D<7RZ7;~&sV&ei$zAe`5hoWM}w?_g+?
z`$B7Qf^+js<L1-TpM%Hwo8re}M#Ax+`ah(HyQ3@k5GJ<s7-(f4ll1+INH7!(#;;i`
zf>2OS@ZS%o6DP$#KcHYF6hUl5;9YDOKo}wi8cKLud*%Q%9D+}>;b(%Mb%7CRC>lRz
z!v~<jsI!kjz!;)jATT(RC=N-;_6#HhjewuEK_Or?kqw6Y2Lt%ef}uzV5eO8GA_9Ry
zphO@r7>Wo4hJ+Cn0z;$!6N7*c21DTxB8H%F7={Q0jwHMYKT|Or1117NKoI{4@(%_A
z4k0Q8fdr65AP5ZPEC>XRAz}f7Mj>Hml7XNJk!NfKeL>I05!(N+LV*9*8yE?OK+Zf2
zM#6|H1|t#Zv$_N$QA9lkBMI_5dmai5{;y>I;U5LX5C<Y)XPE(`(6F=BfzeRJe;xRj
zYeE>&c?4Y&+b~do$cTZ$&e9LYAmBtG7$llFjxbecDu#d|L?VQM;r}-!z`w`+@6??Y
z5(EN)olOt|ffH2>fgmAgnSnsTu(L@+pwKfqBJdA^!Z2sm3V|YsCISLQV$Sv*0!2g4
zwuz83oG1<kLlYGOL!zL-S-N0o=-Gt?fkTM;PWV^qe;3Gq><<F$f6ew^DL~*TFcC8d
z@V^%;@PFGDf`B53dWS$Fi7G)5Z%hc1Fl1*7A?zL^R*~?t3l0E7kVxXbqrfmCU=$2N
nR0#?RCIUgBi1mTOfX~toMM4R}#BVq%kfUJ~e0=iv6e#`&7UBwm

diff --git a/benchmarking/ttft_vs_arrival_rate.pdf b/benchmarking/ttft_vs_arrival_rate.pdf
index 6238a38e1420c222c92aa462202943a218f04aca..041d5e5018d3cb57841aaaf11ae1768d56aa13cf 100644
GIT binary patch
delta 4334
zcmZuzc|4SB8_w9Kkv*EB^x6xVd1oJuV=E$L36b?MX(%*y4lkoPIaz9^NU|5%vP6aB
z$d)W+Nf8G{LL{`&_RUo1d|%^x|C#%}-|M-b`?~Jyd49t@$;bMguY%17x%iKeWusnP
zpYY@_MMXpNhpVckyMd?~eA9;dzY|Jd7VMngoHQUi<n~wEW>we3S|4wT68+e@^t>?f
zMM?vEXvNKbeEWqGy>jhYy)o7bqh?gfK(A#yvUAq^!^b#8gdnuIF`-0s;}|Zt6dzDy
z>|h*zwc+$vXQP3+WogAwXzCM8XSmtL;pEXS*r*gzoJt&)P}jmCox~;5x)aYlmj~H8
zU(N2iILW79and}Z66MsVH>>=-`_RIzC0CzY@ZzsV#co-z-rp?9zt9?Z)lLRs^{hl+
z3J7<1BGaYL`4#7EZr`or2JAuf9MsjcF`M?1F^?~pFc)k|N<I|o4V~)Qp3<LX>d=;d
zhGy@u<h5@(y~7eIS^4Td;*y@if~s_GqDjI~aiY4>x!p(D6B>I;>#LimQ|>YwPZd3@
zw86Rjp>A|S`(Ubl>u6`F{ZQpnN6x7LjMA;wqv2)l#hz3Hgy)~<?@xt=a2xa-j<eNK
zbC%hku%~99A6qJp1i5>?({~z-lib0#rA9JHVzje?gi)Ci70l4fFkW%ze}vKYk)HuJ
zUTiCqXjk@a8v3+^&qyIo7|@mF_gQso2`Ap<bL4NIPATl84#k}7e%z)XUl3p(ArE5+
z32Z}I=o$J=#@vPzlUNgfXlLl|nk`bL@A*rSDSu>+pl;Of3#m2Oj>-Br^?vHZSEX2P
z*!#yeiUG4ACECACrehn12vL(>-x0}V8rHmHik)+<n7C}(wSQ3@c}XwJ_=Ehf?q{!;
z<rsOD^0E8xD~58q<5Uq=?`7Vr+t6c`=IDx?7ensWP>{6Hps?{eO=^P1uAV0gL86aP
ztVo;TpBY{d{MbgsZ<_p%ev5VWKAEI%Q2hjSQ{Nk35Nh6HlOy;DY<BfNlQg5BkfY@g
z86E#DjD1eFN`NlyaUIuQnX2z(^?0Z*uPgN=RvTF)N$r<sa;uQ@Twyqmf;0hBahC6m
za|*t250km^kx%O7ixug<Epab04*sR`d68~vvI{cRROyvH$gT=`v578tI!Eh(CasY>
z;KH-pc1W6a_C@O`IynmM-=t#EcZA~=3uUWbt<CH_G0xPfHim9>Gq<>4YaD#5hM8-c
z52_#A)CqIBr?kegPnPl8giw2>)ngx2;%!wK_QeiGjrp*<FgLTV+X;V6*({l<Kr2eN
zdMx{?oJhetk4g;U@hU8ZNWOMU%5r6<!Gt}|_PQjh{|PR?^mTEOf>GhtQ037gn)ue_
z8=1iIeqS?xt$q?0Y@C<lwBt*V3q;#jDB8fdXAr6CInP%1Z-uYjn0OuRazU|-!m@1c
z*;kF+-S3~%??5-v_LPVkyez1c826FRN)_A;*`_pFtHK9Ohb}%eTmS^qVai(ldpoX~
zjCC-K@HZzuza7aEn`_@$ws)?qf`dJ)RwPq3G$`W|&yJ<Zyb9QNM<!&hkF+e%rqkmS
zx+p=**&v!M?|^sqxcjPH*I`!I5xH0%9Gy_U99jF_htCm+cthL4tZYTW3;7L@G<`m<
zSd#0w7=r;T;i3rD4y%K)6ETC>#oX0=`W{i81gvfH*jz|zRo3c;IPrZSzqxSQdR6}x
zYNwt7LSXVdoK70A=i?j)Z>`@dVAqk`!wrND_6nE82Q<q<O}*BAnQk2)hMoJzb2qGJ
zHq4rgZaf)XF_oCyIKBuC8GFMalKSERlJPZjZ1T&j-JYv;*b3TAWSt803*fdfL0fhr
z`yhNrRp|;W`f)r(=4<6>YL`6YOGyrM;8J~1*2J{(Dk)0&{tPtJvpAJan)+K)s@hjC
zE7$Y%#TlP-NIKB-=}v3MfJC*gZ}vaQ=ehmQgr)NjKt|o=p#mtK3+boJ)lP%3PI5Q~
z{DN<tj~Gt<2kWb1Tc>Pnm}VJ9YU@s`uYcPziC9@V-<m92xnD3)k!d<FbYXRDe7J1@
zC6j!`q%l0=Lv;OBDARi9A(&sb!<&{%g|O1b=)=deZ@w5ATy7brADPKfwom9jFd_+^
zF;7{W^)7sUZS8iPBiq2jQdCb>wYhcglZ57un5{<exvP`>@B4eLr4H)*wY10W7u_vJ
zs@bv3>X%AG`{nL2xW=sTEV@=V*KsW}BgdpYtu?tzqLir=^(&paOV>=KJ*iLbxoC58
zZsA;Ue&H)RQ{l>1R=a#*O{~T-J>9t*(3_()<l|0Lml=a0@XbM&;5QF-EqeLuL)B{D
z>U4Zy`sYO#&EC19$$ojSTm|WXSynIU6m=5zi*DOe0XtS(Gh*VjD)H{Yr--|b4(^pF
zmU`~TRrN0S%H)LKA5#dIgGj+j7Nb!FaTr!r$iG0KrtqUgpcMGnRyakd=HsiA?vTj@
zJa7CazO0rocKx=BWi-Vr{lZv1`oNpCg!+0r=h&Co4k-`bETpBByuC2?OLRfn%?Div
z-h6KPYJWIgy((Wjm2VD~CDj}-UR|lK0iRo2TNs~x`xfQW5mwuLv@a?*JaFk<9;(L>
ztPJ?XSQ?ppw>mqrj8gAfJ(XyKviub?x*znS;xx|v>e8+luXc^cmkDQtb)NMi<PLZT
zsvXBDki`BzoB42agZ+tJ&VqBJ0j7xJ*my;0%%<u*8ER~v_w2-4JIxvH{7w=$CW@pg
zsZ>cQ^gU#R^~NAavuBZgM)Umm*54JZhK}BHZeLhcMakRM?qH)7G7XE^s3(V^_EVQb
z64@d7r6n)wcW;Mg_?|rdEm}@~Ey-M=KG^!Rezvrgu6kKL;?S0+_c18GRVvh$s&AZp
zh%xqb?ES#X(;okn{k;o58{1dMM;0_>Uv9tu#W;8&_te!EF=R4b?v>KOU8#>blP~i8
z2JDJ74ta<ym%3kVb8T`A?-n__5(ypM{Tl!1_gco5i@&rCCX{;!P7r+g5AKU|3X3u4
z|1+lQDa!WD=a&OF(U%hLK7SSwRe#&1=#<T;g~mN2!E(ZZ)syd=nL*~km$D6w?jXd?
zl)G;XiKOpH3Fnjfc<x103p{^n=U-><DQ}9Jf)$I|o_M5u3X~LTJ#%(T#h{*hVhPkU
zw;EDuNu|}gzkNqr%IePfJf3vPOD^Nwv^*o(?#>#r?yEB);ABO!)MGeqR|wU1285DI
zqIMELTl=vcuYkj=g4bVkY0AnZ&dTKkd!%mZ!1s#WlFK@wV#iR7&axMV8jy!q10D}`
znq*TW#I)Fd>Kyk$L<(xYp~PVZew9Q671ZhLAla-obn2^kZBrvlUSu1LK&jRdxwI9A
z#k2TSl_3l3`Z9iS(@Wh`vgD?<;l^<7u&_4TS7Je?z_mR_n@#Vv*w%S<dPJES?@QJe
zKQ6cbSRIvQXq%A8yykm)j$Lx`YPYSHMeoX(WBD|4BGQiZaQw~huCTM#OgoQQvDDqS
z+&>KasvWZyQ3B(wa53~|MBptcPyltN#v94j*aTS)KG1OZh%TfJX;g$^E-UF(WgIPZ
ztJ@m(a;LE2(CGR42t^C{t0W=MZY}4~YgLywY&8$Mc>2tt2GP3%ikn1TGlWV59wns%
zmGa70#9f6^0~Mir6@qUfml_YZv+D;RL|I2oyDV&n-;r1U5Xsjm@8XJ}`FRBfF=!sg
zIT7p14isxp{SnJhL%O<g2j^dGuZA?sP9u_~p()K$1k|uZ2h0XTm;K6up1{axhmB%e
zYFEcYf$r|-%#G6lj>tH>=K&~vdV1d0@=fLV4IfX6ZD&2#RAWtQn6S?8l4j{@*0YAS
zrK?4>5;?We($z(~Kt7fyMw&IRJ;i#@`JF^VEC;G6YZMJ<nTsj2GBD0CB3S(aa{?xc
z4m{=&cocz-pfUUb76WG_j6fR$BpkqMz!^y((8mF^AAt4;KoAF@PgHNhX9z(j06Yc2
z05$-c0T3|Xqd<TF{)lKHG(UiV{qe>v4-jzNHv~E`z{8UU5b)o>1px%YkLW0a#>o?Z
zyoLG$1k(377$A_pM<D>=+pY*HbOb>BvF*3EIb62yrttA$S&q~=K@ygL!?2!GW%zKc
z_f(8GnZzl}eE>fI@ht!eL>#9SZ;r$vz>_0$Sg*_BIPLS|2n1HLt`3YsW_9YSK_pJ|
zKO^uMu4F&waAZ73B%U0W%whI(jsRkKaTqKG<jC?94uivxzcYaTOBI8|{~sWct2@tP
zGKQBb21mj2N{7K?xKsFfEgp;gNjix88Bh2>iz#?sb{GPNw2lb@i(Q{1V%Fy<*mXpR
zSR98xPlrStX+4ktuFsLdb?itW{wH?a(j+2gJ&yd3($K$7!;mOsUbkS#JaTahl5u!m
z&tS-S%DRooL~tEBGKHr>uKpBooe41%JaL_b6pmiJYH@nzojb_k%)2p&0V%v|LC!oW
z5Klo6Lt5{25L|yEK#r|>*K*Y7RR#n(rdfw0{d-oSe-AnqTxUWMi|5{1+~?mhaZr?X
z1+ip2@8~%FQFxCcNTA^UN39@+2i@=(je($20kkbWf(8&d7pINCe-OZ-#=SF5{f_tp
g#P8SRFTWrHgFF~PVGP<41junX38AXG_kc0tKWSu_`v3p{

delta 3929
zcmZuxc_5VA8_qH<Lw2czL1fMM&CEAutc}SU8C#;w(x?$eLs`OS3(5W|+oh4Em20n*
zwS_iYw2IJGbPL(yqTft)7v^{Vde3v7bKd7X=RGeH#}gUNQ<lyHTQPcTHoTp5^riF3
zWcSOxef4`uXR}iwNPSo5KcKxySIA?_y_4dshWZQ%rGxRfM+pMSy!>%(DH3Cy`mq^e
zelyQTKP@t+|M~;9*sjamQ+nQdXNUFpY!Y*R-BMUV<iX~C-R1#LO607iiu(HfC7tJN
z@>CgvhA%p*Y8T8}HDQu&EIE($LP`F0#i|3mLlG^?(u2{o)s#;>gK18w<-MX5`nZ<A
zUSV~F)75G>ldcxY#V7-6TS8Bg^jJ~MS=PK^;rt&HJ^{OQJk9lvI6LI8QmRL9<w;vp
zd>T=fzhZ|f0{Xh<c;#)67LQ(1C_21*Pt##DbelcU`!Vi>zB=r+6qr6^S*s>X)2OVC
z-$qGm*?F=#j<dvGsmid7R+E)+yQY>WX#OSBCi0p{dFheUPvw$Pp#nDd-HSYGH(19E
zR#M5%qJHYL*UKnFsbqlx7ZcxsTa(UB1xXY9stMD3bdaG}eQ8O~Ni^ZzBbmMq8Zu%%
z3b$-51pC6_)K`3T2VoNy|Nd^14syr2Acl)Yt27!9;}eIQR$q7B-%aQsi($|1lTR>x
z9$r2-M90f{H%4`t5K>r~PC^n4zKYcePvxu&g35T5vVK1sA!t63IYr=qFSu3*$PSC?
zeX!0?)GK>ilhgscu5*{ouCBN+3cvP~?qj=S3SU^lftT)R_J!xq70?gM<*tg~pv#Qp
zH<w%Nb5>8$bzxhuc_2!Ph&t{086~r|&ZS8gIiJxd2M}Pr#BuF=jS7eMkz-r;@~IT%
zQogX&u}=}+%|an3c(U`iSKa|$$^a*+dJ5V0OclC+xE1=zod;{J{k*ybd8Oh{2SX1<
z;Fn?&-}bD&KF3?vCMX>d=~3-ozdK&gd@M7jXotXc_vpH|Ez+G4LS(1XnFYc7b1F)j
zhu3u}Hy=Eb<Srjk#VRd7VVohqj<z45Dh3v6^K(u)zO#<squESrU7D<HX{SZ-b@^zS
z%P?c|!p-`-Zl&vm^N!grrcE^zD9KcBg-cR)?4#N|dYy|al-wFwp;wZAi}S$Nin>&V
z^z?#7f1|1j!Sfr<vM)YG=NM+2uvktiD#kLqV+KPCqs#p}k8{Wc9wJ2h^gG*ou!Xen
zbEud!<0(mwMN>`d6WsGld$1CIyVO*0sTBK{14*d*^z(kgQV)v;&zTLw#>HXEC9ULH
zbYzKRlpx(qS$~E%LB?;`ofOqr;!%815nEV&#KdX9!A3tOi8N+%)8?@(aSNS9eqHz?
zy>D+JgF{Z-Nv@_YjMOeYO4%(P_(xjQjKzbO;E}#K(J6Y<z^7UbinE_`ZgiVj<wo5O
z2CK`FZo1uFB(x`2fj=Jk%LhqCNx<&y^)^atd(2;EbsK%gpfY(CHM)#j%a8Q6GAZ~Q
z?-r-u<Oz)qtCkp#mXxu%smhlnD_-_U9*9Ygph*T33*Vu2Xa1B9ige5MsJ<N#hoy$E
zZb?+%Tv|p6c#(H)Do-OJ;Dt^D#c@vmP11S)?zAc<3Ps7k;@RGw!u!~!4?csRCHJh&
zG0Zb*WYI74+1H63R=K*vx6w0ewqv0ae{pp#B}>(maYEP3sXZk8;hMrwPT4C?UJk+Z
zCZu1$pdu$MqN{So-mY?=Y{6|X*c;)PkbH*~#kv^fKq0!GHFWvNI@;DoTkzUG!k~tS
zSoBS4e&IJVbYhyD_K)3wy$W#A4LL{vT_`AI_t^NiDXfAWWz$RSl;Qyqazm<C^j#5w
zacebx38&{lJPpV0{ucjzuTSfJi1e2+wP|G9TgTdWF=8P(FV1d$TisIPLWosgy#n}V
zW@wX6dghsY<uj?PyPk|$CFlu(Z)#@#25n^3H0!m&c_Mtz>y&9qKIKf-M$fd3imU|W
zl=x6G<w;G(`oFp5CfFifx2=LB(E)Z(Lh9=d^c!9W3=f%YIVu_(Z6tD3h>#?@clgw-
z%ZKDk_Qdl(L0<#EPkUIwhDEkz{Jna=Crxgg(J`%VryN}qnKjZ$TvSvkyt2g8RFFot
z`U<xvo1X!WE6wBsSWK))WX#C7#sa*5x`sqbW7+L@ws&iqtx7@`E?3=y;`d#qP_JVC
z6!E;8ufrToRsNVdVO2JS<-f)fyRi3r(yrvL$m8x+CP7hUl#f?pMn7IL&1RXZwiHf0
zpnJ2rUt1;XY}a?*^q`t)D-|qqa;<(3RYo74HWQy}+>mSTT_=RCzn`Nb<{9i8iiUzJ
z)Ym<bOju~^I-dAsX<jOX+<(C<ukXU(o1M;cA~UlK!H*;MxRf8Cc>4TVSqHOUl69-S
zO*AMQp8dEp*f*fZPb5VZ3{+IQtUpb8Vt#MGwM_5k?k~69!(}2A<HABO(#=aI!d#Um
z;DHPz`gmy3jSF%k*0VQmsDC8P2ThM#_J=H$IU9E7#0pOh4hPMp*5CeqYMQ_OjF-~_
z4X_+f7m4{RFHy#ZOm%c*gcUkEUh<b8;z1V}&u$uizwk5&mvh?9ZQ$Ge!56WYKD0EP
zZznpnwK=3e^6lNsEY|h1`i%#dxG(f!f&*NtOWkPgeea4SUbbo2^0kYe&z4$0?JKH*
zC4<g4(cQNO(j@Hp)>D*SHs;0e@m&-Dr6}r?bViW8?<>>rGRVdEY)Y$9;7+^8115ZK
z<AKHQJB7=IJ46&~$*u(h%uTcJqz-%8zq&~E2=qng+j<;jY?3<pTGjS-Vnu0K-&^@_
zaH#b|q1f?fy&a6hU%g7sj;@*iEnJL5f9ud#yQ+T8leMj}GwLz?SmwR7NulKWoV?j>
z4F($I8`qRJe)!JF+f%Lp!}EMS)v+70XLOB;#q^KH#qF{)<Q9g}+qSDzhqC4k@12?~
z&(?)kGIOLlh66fkKBQg~^)8aiGPv9mTME0P-a*Is?9HN_1H&!(>cT4qWbG0bXNL}5
z(4G~3e7|Sj!>^JmQ1mV5z?quQt?P$99*><yjbx4z3{S^IO3J32WYWu}02`l4UbT}-
z?RndTOdi^WDIy;v=37zrGR1?}P0{WOf^nM^wOp<>G)Xw-^h`c=%ME<w)inC~@t{7O
zMZRq)r2P<&S9%&dxaC^pH0b|5%Jbb?*V9U+q8`qF`WAP1FUdIGw)L8S{6?kH!o6Y|
zcc4APQA17Q8MNb5Ge#?g&XV$ldWUvgeQy+ESTeI@1lD}bfAR8>X+IP!q>)~y<L!w)
zg;Jj)MFG#;v@m78!~G!9qUOeXxy|*wH|G#_0U2IAuF^svXA2^)`Wz8b6|Y3AMe`uD
zs^Un#`Y9wyT^uRVFh#a#h$Cw>ZzI_n;*~v`@jS>BJ78*oJV>*qI8p<=M-FMRnIMc@
z1SF6h*l;8Zl&I{+9^zexCt|dMX-ozh*#S}3V6;py+Bh^44^h@*w7fA|K4>il1_bey
zZ=vkfFcX6(VE~LRM#~Qa0ly;JNtzD^0)NeXUEOII2>Wy9kAZNU2(9IRlu2Xb_+NMc
z0|P-n;}aN&@H0AzfwX@{!5E16D`HE-KtGcjR0`?icz6KB$RJ9Z0AK;gSru(yfTsBz
zX1IBx1DP}rG=L$1$U6f7#u3?PaR$Z*gZ>eK5XjNI5+ky;uE20uw$7Cpo~?H!Mqn#n
zhOsAvA3P-4P=S|-MJf%IVFKHmWh@W}5xFwpaN1mou|zDFIy?clk_TW(>?XZD2Y~yf
z1c3jw0Knn@50QXl`@2jafYau(55SSYmG%L60MCBl<$Lk$hxsoCbC&VY|Jh99@)>{t
z0@qC-EXa*PByNlZa9xCe1>u!e0|XqWqL!;B;7QyV5#YL=Hi%tzJI*QE5Q&>d{M%{x
zp9%!DNd$IXFHcNdVHanWh{LU{8Gwk_=1NS2NL-gAl2!=h_)h}4iVz^-Ip35s{uM|f
z09?gIBH~uQI|u-rue@w82!PxL4B$Zk<{}col}SK=#PuW~2ynX1@+55cSGoX#1TI%W
zkn~S|;Qna<SS(kigIFAK<tJw6KwJ&QF?jrcWco8*>0UG@T2Bw7Wk(OD{pd3k3AvwL
aklls+{9T#;KY9=dK_nttNy*sS1pPlXck4g^

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 28257edbd..1f5ae1bb9 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -403,13 +403,13 @@ __global__ void apply_pos_encoding_to_streaming_proj_kernel(
   // Apply the rotary position encoding.
   cuFloatComplex cii = {kv_cache[real_part_idx], kv_cache[complex_part_idx]};
   size_t pos = token_idx;
-  float freq = pos * (1.0 / pow(rope_theta, (float)2 * offset_in_head / head_dim));
+  float freq =
+      pos * (1.0 / pow(rope_theta, (float)2 * offset_in_head / head_dim));
 
   if (llama3_rope) {
     float pi = CUDART_PI_F;
     float wavelen = 2 * pi / freq;
-    float low_freq_wavelen =
-        original_max_position_embeddings / low_freq_factor;
+    float low_freq_wavelen = original_max_position_embeddings / low_freq_factor;
     float high_freq_wavelen =
         original_max_position_embeddings / high_freq_factor;
     if (wavelen < high_freq_wavelen) {
@@ -439,7 +439,7 @@ void apply_pos_encoding_to_streaming_proj(
   // apply rotary embedding if needed
   if (!m->rotary_embedding_meta->apply_rotary_embedding) {
     return;
-  }  
+  }
   int const kv_hidden_size = m->num_kv_heads * m->qk_dim;
   int num_tokens = 0;
   for (int req_idx = 0; req_idx < BatchConfig::max_requests_per_batch();
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 8644a5a3c..cab5f749b 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -149,7 +149,7 @@ void inference_kernel_wrapper(Legion::Context ctx,
 
   // if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
   //     !CanApplyCustomAllReduce(num_elements, dtype)) {
-    // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
+  // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
   ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
   runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllReduce(input.ptr,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 734855fa5..4b6836754 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -488,7 +488,6 @@ size_t RequestManager::get_num_ssms() {
   return ssm_models.size();
 }
 
-
 RequestManager::RequestGuid
     RequestManager::register_new_request(GenerationRequest const &req) {
   // Add a new request
@@ -739,7 +738,6 @@ bool isPrefixAndRemove(std::vector<int> const &prefix, std::vector<int> &vec) {
   return false;
 }
 
-
 void RequestManager::request_complete_clean_up(int batch_index) {
   RequestGuid guid = guid_of_requests[batch_index];
   profiling_requests[guid].finish_time =

From 30efe4d60f97408f545e7c8432ba74f1b9cad129 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 15 Nov 2024 23:24:48 -0800
Subject: [PATCH 630/667] feat: use custom allreduce for performance

---
 include/flexflow/utils/communication_buffer.h |   5 +-
 src/parallel_ops/kernels/allreduce_kernels.cu | 128 ++++++++----------
 src/utils/communication_buffer.cu             |  22 +--
 3 files changed, 66 insertions(+), 89 deletions(-)

diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h
index 5935c4859..3c14284d6 100644
--- a/include/flexflow/utils/communication_buffer.h
+++ b/include/flexflow/utils/communication_buffer.h
@@ -24,7 +24,6 @@
 #include <rccl/rccl.h>
 #endif
 #endif
-#include "legion.h"
 
 // adapted from https://github.com/mlc-ai/relax
 
@@ -59,9 +58,7 @@ class CommunicationBuffer {
   int *barrier_flag;
 };
 
-CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
-                                                    Legion::Runtime *runtime,
-                                                    int num_devices,
+CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index cab5f749b..2574cce2f 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -58,9 +58,7 @@ AllReduceMeta::~AllReduceMeta() {
 namespace Kernels {
 namespace AllReduce {
 
-CommunicationBuffer *get_or_create_comm_buffer(Legion::Context ctx,
-                                               Legion::Runtime *runtime,
-                                               AllReduceMeta *m,
+CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
                                                int num_devices,
                                                int device_id,
                                                ncclComm_t ncclComm,
@@ -71,9 +69,7 @@ CommunicationBuffer *get_or_create_comm_buffer(Legion::Context ctx,
     return iter->second;
   } else {
     CommunicationBuffer *comm_buffer =
-        create_comm_buf_with_local_ptr(ctx,
-                                       runtime,
-                                       num_devices,
+        create_comm_buf_with_local_ptr(num_devices,
                                        device_id,
                                        ncclComm,
                                        m->allgather_src,
@@ -123,8 +119,8 @@ inline bool CanApplyTwoShotAllReduce(int64_t num_elements,
 }
 
 // Customized all-reduce kernel backed by CUDA Peer memory.
-void inference_kernel_wrapper(Legion::Context ctx,
-                              Legion::Runtime *runtime,
+void inference_kernel_wrapper(Context ctx,
+                              Runtime *runtime,
                               AllReduceMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
@@ -138,72 +134,68 @@ void inference_kernel_wrapper(Legion::Context ctx,
   assert(input.domain == output.domain);
   size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1;
   size_t num_elements = bc->num_tokens * hidden_dim_size;
-  // int num_devices = m->handle.num_devices;
-  // int device_id = m->handle.device_id;
+  int num_devices = m->handle.num_devices;
+  int device_id = m->handle.device_id;
   ncclComm_t ncclComm = m->handle.ncclComm;
   DataType dtype = input.data_type;
 
-  // tensorrt_llm::AllReduceStrategyType strategy =
-  //     tensorrt_llm::SelectImplementation(
-  //         num_elements * ((get_bits(dtype) + 7) / 8), num_devices);
+  tensorrt_llm::AllReduceStrategyType strategy =
+      tensorrt_llm::SelectImplementation(
+          num_elements * ((get_bits(dtype) + 7) / 8), num_devices);
+
+  if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
+      !CanApplyCustomAllReduce(num_elements, dtype)) {
+    // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
+    ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
+    runtime->concurrent_task_barrier(ctx);
+    checkNCCL(ncclAllReduce(input.ptr,
+                            output.ptr,
+                            num_elements,
+                            nccl_data_type,
+                            ncclSum,
+                            ncclComm,
+                            stream));
+    runtime->concurrent_task_barrier(ctx);
+    return;
+  }
 
-  // if (strategy == tensorrt_llm::AllReduceStrategyType::RING ||
-  //     !CanApplyCustomAllReduce(num_elements, dtype)) {
-  // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
-  ncclDataType_t nccl_data_type = ff_to_nccl_datatype(dtype);
-  runtime->concurrent_task_barrier(ctx);
-  checkNCCL(ncclAllReduce(input.ptr,
-                          output.ptr,
-                          num_elements,
-                          nccl_data_type,
-                          ncclSum,
-                          ncclComm,
-                          stream));
-  runtime->concurrent_task_barrier(ctx);
-  //   return;
-  // }
-
-  // // Initialize the all-reduce kernel arguments.
-  // tensorrt_llm::AllReduceParams params;
-  // params.ranks_per_node = num_devices;
-  // params.rank = device_id;
-  // params.local_rank = device_id;
-  // CommunicationBuffer *comm_buffer =
-  //     get_or_create_comm_buffer(ctx,
-  //                               runtime,
-  //                               m,
-  //                               num_devices,
-  //                               device_id,
-  //                               ncclComm,
-  //                               const_cast<void *>(input.ptr),
-  //                               stream);
-  // params.barrier_flag = ++(*comm_buffer->barrier_flag);
-  // for (int i = 0; i < num_devices; ++i) {
-  //   params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
-  // }
-  // for (int i = 0; i < num_devices; ++i) {
-  //   params.peer_barrier_ptrs_in[i] =
-  //       reinterpret_cast<uint32_t *>(comm_buffer->barrier_in[i]);
-  // }
-  // for (int i = 0; i < num_devices; ++i) {
-  //   params.peer_barrier_ptrs_out[i] =
-  //       reinterpret_cast<uint32_t *>(comm_buffer->barrier_out[i]);
-  // }
-
-  // if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) {
-  //   // Two-shot all-reduce does not support this case.
-  //   // So we fallback to the one-shot strategy.
-  //   strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
-  // }
-
-  // runtime->concurrent_task_barrier(ctx);
-  // tensorrt_llm::customAllReduce(
-  //     params, output.ptr, num_elements, dtype, strategy, stream);
-  // runtime->concurrent_task_barrier(ctx);
+  // Initialize the all-reduce kernel arguments.
+  tensorrt_llm::AllReduceParams params;
+  params.ranks_per_node = num_devices;
+  params.rank = device_id;
+  params.local_rank = device_id;
+  CommunicationBuffer *comm_buffer =
+      get_or_create_comm_buffer(m,
+                                num_devices,
+                                device_id,
+                                ncclComm,
+                                const_cast<void *>(input.ptr),
+                                stream);
+  params.barrier_flag = ++(*comm_buffer->barrier_flag);
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_comm_buffer_ptrs[i] = comm_buffer->comm_ptrs[i];
+  }
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_barrier_ptrs_in[i] =
+        reinterpret_cast<uint32_t *>(comm_buffer->barrier_in[i]);
+  }
+  for (int i = 0; i < num_devices; ++i) {
+    params.peer_barrier_ptrs_out[i] =
+        reinterpret_cast<uint32_t *>(comm_buffer->barrier_out[i]);
+  }
+
+  if (!CanApplyTwoShotAllReduce(num_elements, dtype, num_devices)) {
+    // Two-shot all-reduce does not support this case.
+    // So we fallback to the one-shot strategy.
+    strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
+  }
+
+  tensorrt_llm::customAllReduce(
+      params, output.ptr, num_elements, dtype, strategy, stream);
 }
 
-void forward_kernel_wrapper(Legion::Context ctx,
-                            Legion::Runtime *runtime,
+void forward_kernel_wrapper(Context ctx,
+                            Runtime *runtime,
                             AllReduceMeta const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output) {
diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu
index 83b0385a3..cd6cc0db4 100644
--- a/src/utils/communication_buffer.cu
+++ b/src/utils/communication_buffer.cu
@@ -23,9 +23,7 @@
 // For the i-th pointer, if i is the worker id of the given device,
 // then the returned i-th ptr_group is the local pointer,
 // or otherwise it is an peer memory pointer from the remote device.
-std::vector<void *> create_peer_ptr_group(Legion::Context ctx,
-                                          Legion::Runtime *runtime,
-                                          int num_devices,
+std::vector<void *> create_peer_ptr_group(int num_devices,
                                           int device_id,
                                           ncclComm_t ncclComm,
                                           void *allgather_src,
@@ -48,14 +46,12 @@ std::vector<void *> create_peer_ptr_group(Legion::Context ctx,
                             cudaMemcpyHostToDevice,
                             stream));
 
-  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllGather(allgather_src,
                           allgather_dst,
                           sizeof(void *),
                           ncclChar,
                           ncclComm,
                           stream));
-  runtime->concurrent_task_barrier(ctx);
 
   std::vector<void *> peer_pointers(num_devices);
   checkCUDA(cudaMemcpyAsync(peer_pointers.data(),
@@ -89,9 +85,7 @@ void free_peer_ptr_group(std::vector<void *> ptr_group,
 // all-gathering peer pointers across devices. The size of allgather_src should
 // be sizeof(void*), and the size of allgather_dst should be sizeof(void*) *
 // num_devices.
-CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
-                                                    Legion::Runtime *runtime,
-                                                    int num_devices,
+CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
@@ -106,27 +100,21 @@ CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
   comm_buf->num_devices = num_devices;
   comm_buf->device_id = device_id;
   comm_buf->local_ptr = local_ptr;
-  comm_buf->comm_ptrs = create_peer_ptr_group(ctx,
-                                              runtime,
-                                              num_devices,
+  comm_buf->comm_ptrs = create_peer_ptr_group(num_devices,
                                               device_id,
                                               ncclComm,
                                               allgather_src,
                                               allgather_dst,
                                               local_ptr,
                                               stream);
-  comm_buf->barrier_in = create_peer_ptr_group(ctx,
-                                               runtime,
-                                               num_devices,
+  comm_buf->barrier_in = create_peer_ptr_group(num_devices,
                                                device_id,
                                                ncclComm,
                                                allgather_src,
                                                allgather_dst,
                                                barrier_in_ptr,
                                                stream);
-  comm_buf->barrier_out = create_peer_ptr_group(ctx,
-                                                runtime,
-                                                num_devices,
+  comm_buf->barrier_out = create_peer_ptr_group(num_devices,
                                                 device_id,
                                                 ncclComm,
                                                 allgather_src,

From 76df177dea6c9f0c6f45537af201664309533b21 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 16 Nov 2024 00:27:12 -0800
Subject: [PATCH 631/667] chore: minor

---
 src/runtime/request_manager.cc | 39 +++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 4b6836754..d2ddf6f34 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -500,19 +500,19 @@ RequestManager::RequestGuid
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
-  // for (int i = 0; i < tokens.size(); i++) {
-  //   std::cout << "[" << i << "]" << tokens.at(i) << "\n";
-  // }
-  // std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
+  for (int i = 0; i < tokens.size(); i++) {
+    std::cout << "[" << i << "]" << tokens.at(i) << "\n";
+  }
+  std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
   request.set_slo_ratio(req.slo_ratio);
 
   if (get_num_ssms() == 0) {
-    // std::cout << "No small speculative model registered, using incremental "
-    //  "decoding."
-    // << std::endl;
+    std::cout << "No small speculative model registered, using incremental "
+                 "decoding."
+              << std::endl;
   } else {
-    // std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
+    std::cout << "Num of SSMs: " << get_num_ssms() << std::endl;
     assert(get_num_ssms() == 1 && "Only one SSM is supported now.");
     init_token_tree(request.guid);
   }
@@ -762,8 +762,6 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   // }
   // std::string output =
   //     this->tokenizer_->Decode(std::vector<int>(bos_it, eos_it));
-  std::string output = this->tokenizer_->Decode(request.tokens);
-
   {
     std::lock_guard<std::mutex> const lock(request_result_mutex);
     request_generation_results[guid].output_tokens = request.tokens;
@@ -786,12 +784,13 @@ void RequestManager::request_complete_clean_up(int batch_index) {
 
   trigger_request_completion_future(guid);
 
-  std::cout << "Request " << guid << " completed" << std::endl;
-  // std::cout << "<bos>" << output;
-  // if (eos_rit != request.tokens.rend()) {
-  //   std::cout << "<eos>";
-  // }
-  // std::cout << std::endl << std::endl;
+  std::string output = this->tokenizer_->Decode(request.tokens);
+  std::cout << "Request " << guid << " completed: " << std::endl;
+  std::cout << "<bos>" << output;
+  if (is_eos_token(request.tokens.back())) {
+    std::cout << "<eos>";
+  }
+  std::cout << std::endl << std::endl;
   {
     RequestProfileInfo profile_info = profiling_requests[guid];
 
@@ -2424,10 +2423,10 @@ std::vector<GenerationResult>
   for (size_t i = 0; i < requests.size(); i++) {
     requests[i].slo_ratio = emission_machine.sample_slo_ratio();
     requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms();
-    // printf("Prompt[%ld] with slo %.3f: %s\n",
-    //        i,
-    //        requests[i].slo_ratio,
-    //        requests[i].prompt.c_str());
+    printf("Prompt[%ld] with slo %.3f: %s\n",
+           i,
+           requests[i].slo_ratio,
+           requests[i].prompt.c_str());
     RequestManager::RequestGuid guid = rm->register_new_request(requests[i]);
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);

From 6c3bebca8e12c1c50e7aaafdc8d8bbc051b49c0d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 16 Nov 2024 02:16:06 -0800
Subject: [PATCH 632/667] chore: minor

---
 src/runtime/request_manager.cc | 39 +++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index d2ddf6f34..66a95b3ed 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1875,7 +1875,6 @@ bool RequestManager::update_ssm_inference_results(
     profiling_requests[guid].ssm_decoding_steps++;
 
     if (current_ssm_step == ssm_tree_depth) {
-      assert(profiling_requests[guid].ssm_decoding_steps % ssm_tree_depth == 0);
       profiling_requests[guid].speculation_start_timestamp =
           profiling.ssm_step_start;
       profiling_requests[guid].speculation_end_timestamp =
@@ -2728,6 +2727,25 @@ void RequestManager::terminate_background_server() {
     for (int num_tokens : profiling.generated_tokens_per_step) {
       total_tokens += num_tokens;
     }
+
+    if (profiling_requests.size() != all_requests.size()) {
+      std::cerr << "profiling_requests.size()=" << profiling_requests.size()
+                << " != all_requests.size()=" << all_requests.size()
+                << std::endl;
+    }
+    assert(profiling_requests.size() == all_requests.size());
+    str += "\nDecoding Steps: ";
+    for (auto const &profiling_info : profiling_requests) {
+      int request_id = profiling_info.first;
+      Request &request = all_requests[request_id];
+      str += "Request " + std::to_string(request_id) + ": ";
+      str += std::to_string(profiling_info.second.llm_decoding_steps);
+      str += "/";
+      str += std::to_string(request.decode_length());
+      float speedup = (float)request.decode_length() /
+                      profiling_info.second.llm_decoding_steps;
+      str += " " + std::to_string(speedup) + "\n";
+    }
     str += "\n total_time_ms(" + std::to_string(total_time / 1000.0) + ")";
     str += "\n total_requests(" + std::to_string(total_requests) + "/" +
            std::to_string(all_requests.size()) + ")";
@@ -2878,25 +2896,6 @@ void RequestManager::terminate_background_server() {
     goodput_str += ")";
     str += goodput_str;
 
-    if (profiling_requests.size() != all_requests.size()) {
-      std::cerr << "profiling_requests.size()=" << profiling_requests.size()
-                << " != all_requests.size()=" << all_requests.size()
-                << std::endl;
-    }
-    assert(profiling_requests.size() == all_requests.size());
-    str += "\nDecoding Steps: ";
-    for (auto const &profiling_info : profiling_requests) {
-      int request_id = profiling_info.first;
-      Request &request = all_requests[request_id];
-      str += "Request " + std::to_string(request_id) + ": ";
-      str += std::to_string(profiling_info.second.llm_decoding_steps);
-      str += "/";
-      str += std::to_string(request.decode_length());
-      float speedup = (float)request.decode_length() /
-                      profiling_info.second.llm_decoding_steps;
-      str += " " + std::to_string(speedup) + "\n";
-    }
-
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     request_queue_cv.notify_all();

From 48b41530a5ff916acc024f402754296c47507dc2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 17 Nov 2024 00:38:39 -0800
Subject: [PATCH 633/667] fix: argtopk memory

---
 include/flexflow/ops/arg_topk.h |  2 +-
 src/ops/arg_topk.cu             | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/flexflow/ops/arg_topk.h b/include/flexflow/ops/arg_topk.h
index 86d0bb239..f46404e9e 100644
--- a/include/flexflow/ops/arg_topk.h
+++ b/include/flexflow/ops/arg_topk.h
@@ -18,7 +18,7 @@ class ArgTopKMeta : public OpMeta {
   bool renormalize;
   Realm::RegionInstance reserveInst;
   void *half_precision_output;
-  int max_input_size;
+  int max_output_size;
   std::unordered_map<cudaStream_t, raft::device_resources *> device_resources;
   ArgTopKMeta(FFHandler handle,
               Op const *op,
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index a88963aaa..7558fdbcc 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -184,13 +184,13 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta *m,
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
-                            m->k,
+                            k,
                             m->sorted,
                             m->renormalize,
                             bc,
                             stream);
     // transfer data from half to float (half_precision_output to output)
-    int size = length * batch_size;
+    int size = k * batch_size;
     half2float_kernel<<<GET_BLOCKS(size),
                         min((int)CUDA_NUM_THREADS, size),
                         0,
@@ -203,7 +203,7 @@ void ArgTopK::forward_kernel_wrapper(ArgTopKMeta *m,
                             indices.get_int32_ptr(),
                             batch_size,
                             length,
-                            m->k,
+                            k,
                             m->sorted,
                             m->renormalize,
                             bc,
@@ -227,11 +227,11 @@ ArgTopKMeta::ArgTopKMeta(FFHandler handler,
                          Op const *op,
                          MemoryAllocator &gpu_mem_allocator)
     : OpMeta(handler, op) {
-  max_input_size = BatchConfig::MAX_NUM_TOKENS * 32000; // TODO: use vocab_size
+  max_output_size = BatchConfig::MAX_NUM_TOKENS * BatchConfig::MAX_K_LOGITS;
   gpu_mem_allocator.create_legion_instance(
-      reserveInst, sizeof(half) * max_input_size, "ArgTopKMeta");
+      reserveInst, sizeof(half) * max_output_size, "ArgTopKMeta");
   half_precision_output = gpu_mem_allocator.allocate_instance_untyped(
-      sizeof(half) * max_input_size);
+      sizeof(half) * max_output_size);
 }
 
 ArgTopKMeta::~ArgTopKMeta() {

From 65f7f52085b6a82359bc4343ba7cb03c64eeeb2b Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 19 Nov 2024 07:54:30 +0000
Subject: [PATCH 634/667] chore: eliminate inconsistence

---
 src/runtime/request_manager.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 66a95b3ed..2f044f7e0 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1041,8 +1041,6 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
   bool request_completed = false;
   int nb_requests_decoded = 0;
   long long int current_time = Realm::Clock::current_time_in_microseconds();
-  profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) *
-                                     1e-3);
 
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
@@ -1096,6 +1094,8 @@ bool RequestManager::update_llm_decode_results(InferenceResult const &result) {
                 << output << std::endl;
     }
   }
+  profiling.llm_step_times.push_back((current_time - profiling.llm_step_start) *
+                                     1e-3);
   profiling.requests_per_step.push_back(nb_requests_decoded);
   profiling.generated_tokens_per_step.push_back(nb_requests_decoded);
   return request_completed;

From 127ca97635705ed44c07650ead25e0b424bd953d Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 19 Nov 2024 08:28:49 +0000
Subject: [PATCH 635/667] fix: add Legion concurrent_task_barrier to eliminate
 dead lock in AllReduce

---
 include/flexflow/utils/communication_buffer.h |  6 ++++-
 src/parallel_ops/kernels/allreduce_kernels.cu | 14 +++++++++---
 src/utils/communication_buffer.cu             | 22 ++++++++++++++-----
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/include/flexflow/utils/communication_buffer.h b/include/flexflow/utils/communication_buffer.h
index 3c14284d6..016860bf6 100644
--- a/include/flexflow/utils/communication_buffer.h
+++ b/include/flexflow/utils/communication_buffer.h
@@ -16,6 +16,7 @@
 #ifndef _COMMUNICATION_BUFFER_H
 #define _COMMUNICATION_BUFFER_H
 
+#include "legion.h"
 #include <vector>
 #ifdef FF_USE_NCCL
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
@@ -58,7 +59,10 @@ class CommunicationBuffer {
   int *barrier_flag;
 };
 
-CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+// All NCCL operations need to be wrapped by Legion concurrent_task_barrier.
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
+                                                    Legion::Runtime *runtime,
+                                                    int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 2574cce2f..2dc1caf19 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -58,7 +58,9 @@ AllReduceMeta::~AllReduceMeta() {
 namespace Kernels {
 namespace AllReduce {
 
-CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
+CommunicationBuffer *get_or_create_comm_buffer(Context ctx,
+                                               Runtime *runtime,
+                                               AllReduceMeta *m,
                                                int num_devices,
                                                int device_id,
                                                ncclComm_t ncclComm,
@@ -69,7 +71,9 @@ CommunicationBuffer *get_or_create_comm_buffer(AllReduceMeta *m,
     return iter->second;
   } else {
     CommunicationBuffer *comm_buffer =
-        create_comm_buf_with_local_ptr(num_devices,
+        create_comm_buf_with_local_ptr(ctx,
+                                       runtime,
+                                       num_devices,
                                        device_id,
                                        ncclComm,
                                        m->allgather_src,
@@ -165,7 +169,9 @@ void inference_kernel_wrapper(Context ctx,
   params.rank = device_id;
   params.local_rank = device_id;
   CommunicationBuffer *comm_buffer =
-      get_or_create_comm_buffer(m,
+      get_or_create_comm_buffer(ctx,
+                                runtime,
+                                m,
                                 num_devices,
                                 device_id,
                                 ncclComm,
@@ -190,8 +196,10 @@ void inference_kernel_wrapper(Context ctx,
     strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
   }
 
+  runtime->concurrent_task_barrier(ctx);
   tensorrt_llm::customAllReduce(
       params, output.ptr, num_elements, dtype, strategy, stream);
+  runtime->concurrent_task_barrier(ctx);
 }
 
 void forward_kernel_wrapper(Context ctx,
diff --git a/src/utils/communication_buffer.cu b/src/utils/communication_buffer.cu
index cd6cc0db4..83b0385a3 100644
--- a/src/utils/communication_buffer.cu
+++ b/src/utils/communication_buffer.cu
@@ -23,7 +23,9 @@
 // For the i-th pointer, if i is the worker id of the given device,
 // then the returned i-th ptr_group is the local pointer,
 // or otherwise it is an peer memory pointer from the remote device.
-std::vector<void *> create_peer_ptr_group(int num_devices,
+std::vector<void *> create_peer_ptr_group(Legion::Context ctx,
+                                          Legion::Runtime *runtime,
+                                          int num_devices,
                                           int device_id,
                                           ncclComm_t ncclComm,
                                           void *allgather_src,
@@ -46,12 +48,14 @@ std::vector<void *> create_peer_ptr_group(int num_devices,
                             cudaMemcpyHostToDevice,
                             stream));
 
+  runtime->concurrent_task_barrier(ctx);
   checkNCCL(ncclAllGather(allgather_src,
                           allgather_dst,
                           sizeof(void *),
                           ncclChar,
                           ncclComm,
                           stream));
+  runtime->concurrent_task_barrier(ctx);
 
   std::vector<void *> peer_pointers(num_devices);
   checkCUDA(cudaMemcpyAsync(peer_pointers.data(),
@@ -85,7 +89,9 @@ void free_peer_ptr_group(std::vector<void *> ptr_group,
 // all-gathering peer pointers across devices. The size of allgather_src should
 // be sizeof(void*), and the size of allgather_dst should be sizeof(void*) *
 // num_devices.
-CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
+CommunicationBuffer *create_comm_buf_with_local_ptr(Legion::Context ctx,
+                                                    Legion::Runtime *runtime,
+                                                    int num_devices,
                                                     int device_id,
                                                     ncclComm_t ncclComm,
                                                     void *allgather_src,
@@ -100,21 +106,27 @@ CommunicationBuffer *create_comm_buf_with_local_ptr(int num_devices,
   comm_buf->num_devices = num_devices;
   comm_buf->device_id = device_id;
   comm_buf->local_ptr = local_ptr;
-  comm_buf->comm_ptrs = create_peer_ptr_group(num_devices,
+  comm_buf->comm_ptrs = create_peer_ptr_group(ctx,
+                                              runtime,
+                                              num_devices,
                                               device_id,
                                               ncclComm,
                                               allgather_src,
                                               allgather_dst,
                                               local_ptr,
                                               stream);
-  comm_buf->barrier_in = create_peer_ptr_group(num_devices,
+  comm_buf->barrier_in = create_peer_ptr_group(ctx,
+                                               runtime,
+                                               num_devices,
                                                device_id,
                                                ncclComm,
                                                allgather_src,
                                                allgather_dst,
                                                barrier_in_ptr,
                                                stream);
-  comm_buf->barrier_out = create_peer_ptr_group(num_devices,
+  comm_buf->barrier_out = create_peer_ptr_group(ctx,
+                                                runtime,
+                                                num_devices,
                                                 device_id,
                                                 ncclComm,
                                                 allgather_src,

From a74775b47560d460d6c4f3b641b176aeb651758c Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 19 Nov 2024 09:29:57 +0000
Subject: [PATCH 636/667] feat: add SSM_TP

---
 inference/spec_infer/spec_infer.cc | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 6be63c7fb..99c7e2432 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -63,6 +63,7 @@ void parse_input_args(char **argv,
                       ModelNames &model_names,
                       bool &use_full_precision,
                       bool &verbose,
+                      int &ssm_tp_degree,
                       int &max_requests_per_batch,
                       int &max_tokens_per_batch,
                       int &max_tokens_per_ssm_batch,
@@ -104,6 +105,10 @@ void parse_input_args(char **argv,
       model_names.ssm_model_names.push_back(ssm_model_name);
       continue;
     }
+    if (!strcmp(argv[i], "-ssm-tp-degree")) {
+      ssm_tp_degree = std::stoi(argv[++i]);
+      continue;
+    }
     // cache folder
     if (!strcmp(argv[i], "-cache-folder")) {
       paths.cache_folder_path = std::string(argv[++i]);
@@ -391,6 +396,7 @@ void FlexFlow::top_level_task(Task const *task,
   ModelMeta model_metadata;
   bool use_full_precision = false;
   bool verbose = false;
+  int ssm_tp_degree = 1;
   int max_requests_per_batch = 8;
   int max_tokens_per_batch = 128;
   int max_tokens_per_ssm_batch = -1;
@@ -426,6 +432,7 @@ void FlexFlow::top_level_task(Task const *task,
                    model_metadata.model_names,
                    use_full_precision,
                    verbose,
+                   ssm_tp_degree,
                    max_requests_per_batch,
                    max_tokens_per_batch,
                    max_tokens_per_ssm_batch,
@@ -461,6 +468,8 @@ void FlexFlow::top_level_task(Task const *task,
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
          ffconfig.numNodes * ffconfig.workersPerNode);
+  assert(ssm_tp_degree >= 1 &&
+         ssm_tp_degree <= ffconfig.numNodes * ffconfig.workersPerNode);
 
   // Sanity check for SpecInfer old version
   if (spec_infer_old_version) {
@@ -537,11 +546,12 @@ void FlexFlow::top_level_task(Task const *task,
   std::vector<int> ssm_model_ids;
   std::vector<FFModel> ssm_models;
   FFConfig bm_config = ffconfig;
-  bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree =
-      bm_config.pipeline_parallelism_degree = 1;
-  //   bm_config.data_parallelism_degree = 1;
-  //   bm_config.tensor_parallelism_degree = 4;
-  //   bm_config.pipeline_parallelism_degree = 1;
+  std::cout << "SSM TP Degree: " << ssm_tp_degree << std::endl;
+  // bm_config.data_parallelism_degree = bm_config.tensor_parallelism_degree =
+  //     bm_config.pipeline_parallelism_degree = 1;
+  bm_config.data_parallelism_degree = 1;
+  bm_config.tensor_parallelism_degree = ssm_tp_degree;
+  bm_config.pipeline_parallelism_degree = 1;
   for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) {
     FFModel beam_model(bm_config);
     ssm_models.push_back(beam_model);

From 54acb6d9968e72c03bc524d04e6ec43a58362659 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Tue, 19 Nov 2024 11:14:17 +0000
Subject: [PATCH 637/667] chore: minor

---
 src/runtime/request_manager.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 2f044f7e0..94aa0a261 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -500,10 +500,10 @@ RequestManager::RequestGuid
     request.tokens.push_back(bos_token_id);
   }
   std::vector<int32_t> tokens = this->tokenizer_->Encode(req.prompt);
-  for (int i = 0; i < tokens.size(); i++) {
-    std::cout << "[" << i << "]" << tokens.at(i) << "\n";
-  }
-  std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
+  // for (int i = 0; i < tokens.size(); i++) {
+  //   std::cout << "[" << i << "]" << tokens.at(i) << "\n";
+  // }
+  // std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
   request.set_slo_ratio(req.slo_ratio);
 

From d845cb2680dda947f249f2661e1081646cb8c4e4 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 21 Nov 2024 00:43:11 -0800
Subject: [PATCH 638/667] feat: add flashinfer ResidualRMSNorm

---
 src/ops/kernels/residual_rms_norm_kernels.cu | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 65125bae1..088564eca 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 
+#include "flashinfer/norm.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/residual_rms_norm_kernels.h"
 #include "flexflow/ops/residual_rms_norm.h"
@@ -143,7 +144,7 @@ template <typename T>
 void forward_kernel(ResidualRMSNormMeta const *m,
                     T const *input1_ptr,
                     T const *input2_ptr,
-                    T const *weight_ptr,
+                    T const *weight_const_ptr,
                     T *residual_output_ptr,
                     T *output_ptr,
                     int batch_size,
@@ -168,8 +169,23 @@ void forward_kernel(ResidualRMSNormMeta const *m,
                                                residual_output_ptr,
                                                static_cast<T *>(m->rms_ptr),
                                                static_cast<T *>(m->norm_ptr),
-                                               weight_ptr,
+                                               weight_const_ptr,
                                                output_ptr);
+
+  //   checkCUDA(cudaMemcpyAsync(output_ptr,
+  //                           input1_ptr,
+  //                           batch_size * m->in_dim * sizeof(T),
+  //                           cudaMemcpyDeviceToDevice,
+  //                           stream));
+  // checkCUDA(cudaMemcpyAsync(residual_output_ptr,
+  //                           input2_ptr,
+  //                           batch_size * m->in_dim * sizeof(T),
+  //                           cudaMemcpyDeviceToDevice,
+  //                           stream));
+  // T* weight_ptr = const_cast<T*>(weight_const_ptr);
+  // // inplace residual_rms_norm
+  // flashinfer::norm::FusedAddRMSNorm<T>(
+  //     output_ptr, residual_output_ptr, weight_ptr, batch_size, m->in_dim, m->eps, stream);
 }
 
 void forward_kernel_wrapper(ResidualRMSNormMeta const *m,

From 1c2875f9164e91d350593ad69b874c3806e8d50b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 23 Nov 2024 01:03:59 -0800
Subject: [PATCH 639/667] feat: improve ResidualRMSNorm

---
 src/ops/kernels/residual_rms_norm_kernels.cu | 201 ++++++++++---------
 1 file changed, 104 insertions(+), 97 deletions(-)

diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 088564eca..0d77bfb7f 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -13,7 +13,11 @@
  * limitations under the License.
  */
 
-#include "flashinfer/norm.cuh"
+#include <numeric>
+#include "flashinfer/utils.cuh"
+#include "flashinfer/math.cuh"
+#include "flashinfer/utils.cuh"
+#include "flashinfer/vec_dtypes.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/residual_rms_norm_kernels.h"
 #include "flexflow/ops/residual_rms_norm.h"
@@ -57,94 +61,120 @@ ResidualRMSNormMeta::~ResidualRMSNormMeta(void) {
   }
 }
 
-namespace Kernels {
-namespace ResidualRMSNorm {
-
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_DOWN(T value,
-                                            unsigned int delta,
-                                            int width = warpSize,
-                                            unsigned int mask = 0xffffffff) {
-#ifndef __HIP_PLATFORM_HCC__
-  return __shfl_down_sync(mask, value, delta, width);
-#else
-  return __shfl_down(value, delta, width);
-#endif
-}
+// Adopted from flashinfer (https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/norm.cuh)
+// Main modification is for non-inplace computation
+template <uint32_t VEC_SIZE, typename T>
+__global__ void FusedAddRMSNormKernel(T const * __restrict__ input, T const * __restrict__ residual, T const * __restrict__ weight,
+                                      T* __restrict__ output, T* __restrict__ residual_output,
+                                      const uint32_t d, float eps) {
+  const uint32_t bx = blockIdx.x;
+  const uint32_t tx = threadIdx.x, ty = threadIdx.y;
+  constexpr uint32_t warp_size = 32;
+  const uint32_t num_warps = blockDim.y;
+  const uint32_t thread_id = tx + ty * warp_size;
+  const uint32_t num_threads = num_warps * warp_size;
+  const uint32_t rounds = flashinfer::ceil_div(d, VEC_SIZE * num_threads);
+  extern __shared__ float smem[];
+
+  float sum_sq = 0.f;
+
+  for (uint32_t i = 0; i < rounds; i++) {
+    flashinfer::vec_t<T, VEC_SIZE> input_vec;
+    input_vec.fill(0);
+    flashinfer::vec_t<T, VEC_SIZE> residual_vec;
+    residual_vec.fill(0);
+    flashinfer::vec_t<T, VEC_SIZE> residual_output_vec;
+    residual_output_vec.fill(0);
+    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; j++) {
+      float x = float(input_vec[j]);
+      x += float(residual_vec[j]);
+      sum_sq += x * x;
+      residual_output_vec[j] = (T)x;
+    }
+    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+      residual_output_vec.store(residual_output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+    }
+  }
 
-template <typename T>
-__inline__ __device__ T WarpReduceSum(T val) {
+  // first, warp reduce sum
 #pragma unroll
-  for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
-    val += WARP_SHFL_DOWN(val, offset);
+  for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) {
+    sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset);
   }
-  return val;
-}
 
-template <typename T>
-__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) {
-  int const lid = threadIdx.x % C10_WARP_SIZE;
-  int const wid = threadIdx.x / C10_WARP_SIZE;
-  val = WarpReduceSum(val);
+  smem[ty] = sum_sq;
   __syncthreads();
-  if (lid == 0) {
-    shared[wid] = val;
+  // then, cross warp reduce sum using only the first warp
+  if (ty == 0) {
+    sum_sq = (tx < num_warps) ? smem[tx] : 0.f;
+#pragma unroll
+    for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) {
+      sum_sq += flashinfer::math::shfl_xor_sync(sum_sq, offset);
+    }
+    smem[0] = sum_sq;
   }
   __syncthreads();
-  val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE))
-            ? shared[lid]
-            : T(0);
-  if (wid == 0) {
-    val = WarpReduceSum(val);
+
+  float rms_rcp = flashinfer::math::rsqrt(smem[0] / float(d) + eps);
+
+  for (uint32_t i = 0; i < rounds; i++) {
+    flashinfer::vec_t<T, VEC_SIZE> input_vec;
+    flashinfer::vec_t<T, VEC_SIZE> weight_vec;
+    flashinfer::vec_t<T, VEC_SIZE> residual_output_vec;
+    flashinfer::vec_t<T, VEC_SIZE> output_vec;
+    input_vec.fill(0);
+    weight_vec.fill(0);
+    residual_output_vec.fill(0);
+    output_vec.fill(0);
+    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      weight_vec.load(weight + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      residual_output_vec.load(residual_output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; j++) {
+      output_vec[j] = float(residual_output_vec[j]) * rms_rcp * float(weight_vec[j]);
+    }
+    if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+      output_vec.store(output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+    }
   }
-  return val;
 }
 
 template <typename T>
-__global__ void ResidualRMSNormFusedForwardKernel(int64_t N,
-                                                  float eps,
-                                                  T const *X1,
-                                                  T const *X2,
-                                                  T *X_out,
-                                                  T *rms,
-                                                  T *Y,
-                                                  T const *weights,
-                                                  T *output) {
-  __shared__ float v_shared[C10_WARP_SIZE];
-  int64_t const i = blockIdx.x;
-  float sum = 0.0f;
-  for (int64_t j = threadIdx.x; j < N;
-       j += min(blockDim.x, kCUDABlockReduceNumThreads)) {
-    int64_t const index = i * N + j;
-    X_out[index] = X1[index] + X2[index];
-    sum +=
-        (static_cast<float>(X_out[index]) * static_cast<float>(X_out[index]));
-  }
-  sum = BlockReduceSum<float>(
-      sum,
-      v_shared,
-      min(blockDim.x,
-          kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2
-
-  if (threadIdx.x == 0) {
-    rms[i] = static_cast<T>(rsqrt((sum / static_cast<float>(N)) + eps));
-  }
-
-  __syncthreads();
-
-  using T_ACC = T;
-  for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) {
-    const int64_t index = i * N + j;
-    Y[index] = static_cast<T_ACC>(X_out[index]) * static_cast<T_ACC>(rms[i]);
-    output[index] = Y[index] * weights[index % N];
-  }
+cudaError_t FusedAddRMSNorm(T const * input, T const * residual, T const * weight, 
+                            T * output, T * residual_output,
+                            uint32_t batch_size, uint32_t d,
+                            float eps = 1e-5, cudaStream_t stream = 0) {
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+
+  const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
+  const uint32_t num_warps = flashinfer::ceil_div(block_size, 32);
+  dim3 nblks(batch_size);
+  dim3 nthrs(32, num_warps);
+  const uint32_t smem_size = num_warps * sizeof(float);
+  void* args[] = {&input, &residual, &weight, &output, &residual_output, &d, &eps};
+
+  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
+    auto kernel = FusedAddRMSNormKernel<VEC_SIZE, T>;
+    FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+  });
+
+  return cudaSuccess;
 }
 
+namespace Kernels {
+namespace ResidualRMSNorm {
 template <typename T>
 void forward_kernel(ResidualRMSNormMeta const *m,
                     T const *input1_ptr,
                     T const *input2_ptr,
-                    T const *weight_const_ptr,
+                    T const *weight_ptr,
                     T *residual_output_ptr,
                     T *output_ptr,
                     int batch_size,
@@ -161,31 +191,8 @@ void forward_kernel(ResidualRMSNormMeta const *m,
   int num_threads =
       std::max(kernel1_parallelism.second, kernel2_parallelism.second);
 
-  ResidualRMSNormFusedForwardKernel<T>
-      <<<num_blocks, num_threads, 0, stream>>>(m->in_dim,
-                                               m->eps,
-                                               input1_ptr,
-                                               input2_ptr,
-                                               residual_output_ptr,
-                                               static_cast<T *>(m->rms_ptr),
-                                               static_cast<T *>(m->norm_ptr),
-                                               weight_const_ptr,
-                                               output_ptr);
-
-  //   checkCUDA(cudaMemcpyAsync(output_ptr,
-  //                           input1_ptr,
-  //                           batch_size * m->in_dim * sizeof(T),
-  //                           cudaMemcpyDeviceToDevice,
-  //                           stream));
-  // checkCUDA(cudaMemcpyAsync(residual_output_ptr,
-  //                           input2_ptr,
-  //                           batch_size * m->in_dim * sizeof(T),
-  //                           cudaMemcpyDeviceToDevice,
-  //                           stream));
-  // T* weight_ptr = const_cast<T*>(weight_const_ptr);
-  // // inplace residual_rms_norm
-  // flashinfer::norm::FusedAddRMSNorm<T>(
-  //     output_ptr, residual_output_ptr, weight_ptr, batch_size, m->in_dim, m->eps, stream);
+  checkCUDA(FusedAddRMSNorm<T>(
+      input1_ptr, input2_ptr, weight_ptr, output_ptr, residual_output_ptr, batch_size, m->in_dim, m->eps, stream));
 }
 
 void forward_kernel_wrapper(ResidualRMSNormMeta const *m,

From 1f6dab4b34cdfe83eddcba9ed90468dd38845bf2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 23 Nov 2024 01:04:15 -0800
Subject: [PATCH 640/667] fix: AllReduce minor

---
 src/parallel_ops/kernels/allreduce_kernels.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 2dc1caf19..0e5c15008 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -142,6 +142,9 @@ void inference_kernel_wrapper(Context ctx,
   int device_id = m->handle.device_id;
   ncclComm_t ncclComm = m->handle.ncclComm;
   DataType dtype = input.data_type;
+  if (num_elements == 0) {
+    return;
+  }
 
   tensorrt_llm::AllReduceStrategyType strategy =
       tensorrt_llm::SelectImplementation(

From 8fb39176dd60b144b5f487bcb309e77e1c759b29 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 23 Nov 2024 01:05:54 -0800
Subject: [PATCH 641/667] style: format

---
 src/ops/kernels/residual_rms_norm_kernels.cu | 73 ++++++++++++++------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 0d77bfb7f..77c3e83ad 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -13,10 +13,10 @@
  * limitations under the License.
  */
 
-#include <numeric>
 #include "flashinfer/utils.cuh"
+#include <numeric>
+
 #include "flashinfer/math.cuh"
-#include "flashinfer/utils.cuh"
 #include "flashinfer/vec_dtypes.cuh"
 #include "flexflow/ffconst_utils.h"
 #include "flexflow/ops/kernels/residual_rms_norm_kernels.h"
@@ -61,12 +61,17 @@ ResidualRMSNormMeta::~ResidualRMSNormMeta(void) {
   }
 }
 
-// Adopted from flashinfer (https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/norm.cuh)
+// Adopted from flashinfer
+// (https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/norm.cuh)
 // Main modification is for non-inplace computation
 template <uint32_t VEC_SIZE, typename T>
-__global__ void FusedAddRMSNormKernel(T const * __restrict__ input, T const * __restrict__ residual, T const * __restrict__ weight,
-                                      T* __restrict__ output, T* __restrict__ residual_output,
-                                      const uint32_t d, float eps) {
+__global__ void FusedAddRMSNormKernel(T const *__restrict__ input,
+                                      T const *__restrict__ residual,
+                                      T const *__restrict__ weight,
+                                      T *__restrict__ output,
+                                      T *__restrict__ residual_output,
+                                      const uint32_t d,
+                                      float eps) {
   const uint32_t bx = blockIdx.x;
   const uint32_t tx = threadIdx.x, ty = threadIdx.y;
   constexpr uint32_t warp_size = 32;
@@ -86,8 +91,10 @@ __global__ void FusedAddRMSNormKernel(T const * __restrict__ input, T const * __
     flashinfer::vec_t<T, VEC_SIZE> residual_output_vec;
     residual_output_vec.fill(0);
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
-      residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE +
+                     thread_id * VEC_SIZE);
+      residual_vec.load(residual + bx * d + i * num_threads * VEC_SIZE +
+                        thread_id * VEC_SIZE);
     }
 #pragma unroll
     for (uint32_t j = 0; j < VEC_SIZE; j++) {
@@ -97,7 +104,9 @@ __global__ void FusedAddRMSNormKernel(T const * __restrict__ input, T const * __
       residual_output_vec[j] = (T)x;
     }
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      residual_output_vec.store(residual_output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      residual_output_vec.store(residual_output + bx * d +
+                                i * num_threads * VEC_SIZE +
+                                thread_id * VEC_SIZE);
     }
   }
 
@@ -132,25 +141,36 @@ __global__ void FusedAddRMSNormKernel(T const * __restrict__ input, T const * __
     residual_output_vec.fill(0);
     output_vec.fill(0);
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
-      weight_vec.load(weight + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
-      residual_output_vec.load(residual_output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE +
+                     thread_id * VEC_SIZE);
+      weight_vec.load(weight + i * num_threads * VEC_SIZE +
+                      thread_id * VEC_SIZE);
+      residual_output_vec.load(residual_output + bx * d +
+                               i * num_threads * VEC_SIZE +
+                               thread_id * VEC_SIZE);
     }
 #pragma unroll
     for (uint32_t j = 0; j < VEC_SIZE; j++) {
-      output_vec[j] = float(residual_output_vec[j]) * rms_rcp * float(weight_vec[j]);
+      output_vec[j] =
+          float(residual_output_vec[j]) * rms_rcp * float(weight_vec[j]);
     }
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      output_vec.store(output + bx * d + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      output_vec.store(output + bx * d + i * num_threads * VEC_SIZE +
+                       thread_id * VEC_SIZE);
     }
   }
 }
 
 template <typename T>
-cudaError_t FusedAddRMSNorm(T const * input, T const * residual, T const * weight, 
-                            T * output, T * residual_output,
-                            uint32_t batch_size, uint32_t d,
-                            float eps = 1e-5, cudaStream_t stream = 0) {
+cudaError_t FusedAddRMSNorm(T const *input,
+                            T const *residual,
+                            T const *weight,
+                            T *output,
+                            T *residual_output,
+                            uint32_t batch_size,
+                            uint32_t d,
+                            float eps = 1e-5,
+                            cudaStream_t stream = 0) {
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
   const uint32_t block_size = std::min<uint32_t>(1024, d / vec_size);
@@ -158,11 +178,13 @@ cudaError_t FusedAddRMSNorm(T const * input, T const * residual, T const * weigh
   dim3 nblks(batch_size);
   dim3 nthrs(32, num_warps);
   const uint32_t smem_size = num_warps * sizeof(float);
-  void* args[] = {&input, &residual, &weight, &output, &residual_output, &d, &eps};
+  void *args[] = {
+      &input, &residual, &weight, &output, &residual_output, &d, &eps};
 
   DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
     auto kernel = FusedAddRMSNormKernel<VEC_SIZE, T>;
-    FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+    FLASHINFER_CUDA_CALL(cudaLaunchKernel(
+        (void *)kernel, nblks, nthrs, args, smem_size, stream));
   });
 
   return cudaSuccess;
@@ -191,8 +213,15 @@ void forward_kernel(ResidualRMSNormMeta const *m,
   int num_threads =
       std::max(kernel1_parallelism.second, kernel2_parallelism.second);
 
-  checkCUDA(FusedAddRMSNorm<T>(
-      input1_ptr, input2_ptr, weight_ptr, output_ptr, residual_output_ptr, batch_size, m->in_dim, m->eps, stream));
+  checkCUDA(FusedAddRMSNorm<T>(input1_ptr,
+                               input2_ptr,
+                               weight_ptr,
+                               output_ptr,
+                               residual_output_ptr,
+                               batch_size,
+                               m->in_dim,
+                               m->eps,
+                               stream));
 }
 
 void forward_kernel_wrapper(ResidualRMSNormMeta const *m,

From 075d7b289fb2f1dffb433b5aa2d8876a2f074fbb Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 23 Nov 2024 02:09:39 -0800
Subject: [PATCH 642/667] chore: remove unused

---
 src/ops/kernels/residual_rms_norm_kernels.cu | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu
index 77c3e83ad..7530c179e 100644
--- a/src/ops/kernels/residual_rms_norm_kernels.cu
+++ b/src/ops/kernels/residual_rms_norm_kernels.cu
@@ -85,10 +85,10 @@ __global__ void FusedAddRMSNormKernel(T const *__restrict__ input,
 
   for (uint32_t i = 0; i < rounds; i++) {
     flashinfer::vec_t<T, VEC_SIZE> input_vec;
-    input_vec.fill(0);
     flashinfer::vec_t<T, VEC_SIZE> residual_vec;
-    residual_vec.fill(0);
     flashinfer::vec_t<T, VEC_SIZE> residual_output_vec;
+    input_vec.fill(0);
+    residual_vec.fill(0);
     residual_output_vec.fill(0);
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
       input_vec.load(input + bx * d + i * num_threads * VEC_SIZE +
@@ -132,17 +132,13 @@ __global__ void FusedAddRMSNormKernel(T const *__restrict__ input,
   float rms_rcp = flashinfer::math::rsqrt(smem[0] / float(d) + eps);
 
   for (uint32_t i = 0; i < rounds; i++) {
-    flashinfer::vec_t<T, VEC_SIZE> input_vec;
     flashinfer::vec_t<T, VEC_SIZE> weight_vec;
     flashinfer::vec_t<T, VEC_SIZE> residual_output_vec;
     flashinfer::vec_t<T, VEC_SIZE> output_vec;
-    input_vec.fill(0);
     weight_vec.fill(0);
     residual_output_vec.fill(0);
     output_vec.fill(0);
     if ((i * num_threads + thread_id) * VEC_SIZE < d) {
-      input_vec.load(input + bx * d + i * num_threads * VEC_SIZE +
-                     thread_id * VEC_SIZE);
       weight_vec.load(weight + i * num_threads * VEC_SIZE +
                       thread_id * VEC_SIZE);
       residual_output_vec.load(residual_output + bx * d +

From 4ce72567fd2ea5ed9b965fb88d65a68c4d9f0f2d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sat, 23 Nov 2024 05:53:26 -0800
Subject: [PATCH 643/667] chore: remove the concurrent_task_barrier wrapping
 customAllReduce

---
 src/parallel_ops/kernels/allreduce_kernels.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu
index 0e5c15008..879be72b8 100644
--- a/src/parallel_ops/kernels/allreduce_kernels.cu
+++ b/src/parallel_ops/kernels/allreduce_kernels.cu
@@ -199,10 +199,10 @@ void inference_kernel_wrapper(Context ctx,
     strategy = tensorrt_llm::AllReduceStrategyType::ONESHOT;
   }
 
-  runtime->concurrent_task_barrier(ctx);
+  // runtime->concurrent_task_barrier(ctx);
   tensorrt_llm::customAllReduce(
       params, output.ptr, num_elements, dtype, strategy, stream);
-  runtime->concurrent_task_barrier(ctx);
+  // runtime->concurrent_task_barrier(ctx);
 }
 
 void forward_kernel_wrapper(Context ctx,

From 7a820c1abd02f4e4ecce18b0d84ca1f7a9b1de2b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 28 Nov 2024 01:07:04 -0800
Subject: [PATCH 644/667] feat: add device_prop to ff_handle

---
 include/flexflow/config.h | 5 +++--
 src/runtime/model.cu      | 9 +++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 795f615d1..5cc6d5cef 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -95,10 +95,11 @@ struct FFHandler {
   size_t offload_reserve_space_size;
   DataType quantization_type;
   bool allowTensorOpMathConversion;
-#ifdef FF_USE_NCCL
-  ncclComm_t ncclComm;
   int num_devices;
   int device_id;
+  cudaDeviceProp* device_prop;
+#ifdef FF_USE_NCCL
+  ncclComm_t ncclComm;
 #endif
 };
 
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 67d02c857..45fdf8610 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -213,10 +213,15 @@ FFHandler
   }
 
   // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize));
-#ifdef FF_USE_NCCL
-  handle.ncclComm = NULL;
   handle.num_devices = 0;
   handle.device_id = 0;
+  // We may not use all devices, so physical_device may not be successive
+  int physical_device;
+  checkCUDA(cudaGetDevice(&physical_device));
+  handle.device_prop = new cudaDeviceProp;
+  checkCUDA(cudaGetDeviceProperties(handle.device_prop, physical_device));
+#ifdef FF_USE_NCCL
+  handle.ncclComm = NULL;
 #endif
   return handle;
 }

From c911faa99c825034c5dc863a39b00316fdd102d2 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Thu, 28 Nov 2024 04:14:35 -0800
Subject: [PATCH 645/667] feat: add pytorch gemm_cublas

---
 include/flexflow/config.h         |   2 +-
 src/ops/kernels/linear_kernels.cu | 120 +++++++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 5cc6d5cef..90b1e2393 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -72,6 +72,7 @@ struct FFHandler {
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   cudnnHandle_t dnn;
   cublasHandle_t blas;
+  cudaDeviceProp* device_prop;
 #else
   miopenHandle_t dnn;
   hipblasHandle_t blas;
@@ -97,7 +98,6 @@ struct FFHandler {
   bool allowTensorOpMathConversion;
   int num_devices;
   int device_id;
-  cudaDeviceProp* device_prop;
 #ifdef FF_USE_NCCL
   ncclComm_t ncclComm;
 #endif
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index c495e42eb..8f757f311 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -264,6 +264,14 @@ __global__ void AddBiasWithReLU(DT *output_ptr,
   }
 }
 
+template <typename Dtype>
+inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
+      Dtype *c, int64_t ldc) {
+  static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
+}
+
 template <typename DT>
 void forward_kernel(LinearMeta const *m,
                     void const *input_ptr,
@@ -323,6 +331,9 @@ void forward_kernel(LinearMeta const *m,
                                    : ff_to_cuda_datatype(m->weight_type[0]);
   cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
   assert(input_type == weight_type && weight_type == output_type);
+  DT const *input_p = static_cast<DT const *>(input_ptr),
+          *weight_p = static_cast<DT const *>(m->offload ? m->weight_ptr : weight_ptr);
+  DT *output_p = static_cast<DT *>(output_ptr);
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
 #else
@@ -334,25 +345,21 @@ void forward_kernel(LinearMeta const *m,
     compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
   }
 #endif
-  checkCUDA(cublasGemmEx(m->handle.blas,
-                         CUBLAS_OP_T,
-                         CUBLAS_OP_N,
-                         out_dim,
-                         batch_size,
-                         in_dim,
-                         &alpha,
-                         m->offload ? m->weight_ptr : weight_ptr,
-                         weight_type,
-                         in_dim,
-                         input_ptr,
-                         input_type,
-                         in_dim,
-                         &beta,
-                         output_ptr,
-                         output_type,
-                         out_dim,
-                         compute_type,
-                         CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  gemm_internal_cublas(m->handle.blas,
+                       m->handle.device_prop,
+                       CUBLAS_OP_T,
+                       CUBLAS_OP_N,
+                       out_dim,
+                       batch_size,
+                       in_dim,
+                       alpha,
+                       weight_p,
+                       in_dim,
+                       input_p,
+                       in_dim,
+                       beta,
+                       output_p,
+                       out_dim);
   // use_bias = True
   if (bias_ptr != NULL) {
     // fuse bias and relu
@@ -547,6 +554,81 @@ __global__ void build_one_ptr(DT *one_ptr, int batch_size) {
   }
 }
 
+template <>
+void gemm_internal_cublas<double>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
+      const double *a, int64_t lda, const double *b, int64_t ldb, double beta,
+      double *c, int64_t ldc) {
+  checkCUDA(cublasDgemm(
+      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
+}
+
+template <>
+void gemm_internal_cublas<float>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
+      const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
+      float *c, int64_t ldc) {
+  checkCUDA(cublasSgemm(
+      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
+}
+
+template <>
+void gemm_internal_cublas<half>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
+      const half *a, int64_t lda, const half *b, int64_t ldb, half beta,
+      half *c, int64_t ldc) {
+  if (prop->major >= 5) {
+    // Disallow fp16 reductions that could lead to unexpected overflow issues.
+    // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
+    // if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+    //   cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    // }
+    // checkCUDA(cublasSetMathMode(handle, cublas_flags));
+    checkCUDA(cublasGemmEx(
+        handle,
+        transa,
+        transb,
+        m,
+        n,
+        k,
+        &alpha,
+        a,
+        CUDA_R_16F,
+        lda,
+        b,
+        CUDA_R_16F,
+        ldb,
+        &beta,
+        c,
+        CUDA_R_16F,
+        ldc,
+        CUBLAS_COMPUTE_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
+  } else {
+    float falpha = alpha;
+    float fbeta = beta;
+    checkCUDA(cublasSgemmEx(
+        handle,
+        transa,
+        transb,
+        m,
+        n,
+        k,
+        &falpha,
+        a,
+        CUDA_R_16F,
+        lda,
+        b,
+        CUDA_R_16F,
+        ldb,
+        &fbeta,
+        c,
+        CUDA_R_16F,
+        ldc));
+  }
+}
+
 } // namespace Internal
 } // namespace Linear
 } // namespace Kernels

From d09124ce6492cd170107ba49d5d96338545d78f7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 29 Nov 2024 09:36:44 -0800
Subject: [PATCH 646/667] feat: add pytorch GEMM

---
 include/flexflow/config.h                     |   5 +-
 include/flexflow/ops/kernels/gemm_impl.h      |  87 +++++
 include/flexflow/utils/cuda_helper.h          |   1 +
 src/ops/fused.cu                              |   6 +
 src/ops/kernels/gemm_impl.cu                  | 347 ++++++++++++++++++
 .../inc_multihead_self_attention_kernels.cu   |  66 ++--
 src/ops/kernels/linear_kernels.cu             | 112 +-----
 src/runtime/model.cu                          |  29 +-
 8 files changed, 507 insertions(+), 146 deletions(-)
 create mode 100644 include/flexflow/ops/kernels/gemm_impl.h
 create mode 100644 src/ops/kernels/gemm_impl.cu

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 90b1e2393..45afffc0e 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -18,11 +18,13 @@
 #include "ffconst.h"
 #include "flexflow/attention_config.h"
 #include "flexflow/batch_config.h"
+#include "flexflow/ops/kernels/gemm_impl.h"
 #include "legion.h"
 #include <cstddef>
 #include <cstring>
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
 #include <cublas_v2.h>
+#include <cublasLt.h>
 #include <cudnn.h>
 #elif defined(FF_USE_HIP_ROCM)
 #include <hipblas/hipblas.h>
@@ -72,7 +74,8 @@ struct FFHandler {
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   cudnnHandle_t dnn;
   cublasHandle_t blas;
-  cudaDeviceProp* device_prop;
+  cublasLtHandle_t blasLt;
+  Internal::GemmEngine *gemm_engine;
 #else
   miopenHandle_t dnn;
   hipblasHandle_t blas;
diff --git a/include/flexflow/ops/kernels/gemm_impl.h b/include/flexflow/ops/kernels/gemm_impl.h
new file mode 100644
index 000000000..4d60da91e
--- /dev/null
+++ b/include/flexflow/ops/kernels/gemm_impl.h
@@ -0,0 +1,87 @@
+#ifndef GEMM_IMPL_H
+#define GEMM_IMPL_H
+
+#include <cublas_v2.h>
+#include <cublasLt.h>
+
+namespace Internal {
+
+/* TODO: Consider appropriate case to use Lt */
+// #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040))
+//     // Strangely, if mat2 has only 1 row or column, we get
+//     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
+//     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
+//     // is to use lt interface only when self is bias.
+//     // for cuda 11.4, cublasLtMatmul is activated
+//     // the last two conditions is to skip 16b transA and non-trans-B having
+//     // leading dim >> rows when they are sliced from a large tensor
+//     // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
+//     if (!disable_addmm_cuda_lt) {
+//       useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
+//           result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
+//           self.is_contiguous() && result.is_contiguous() &&
+//           (scalar_type == at::ScalarType::Double ||
+//            scalar_type == at::ScalarType::Float ||
+//            scalar_type == at::ScalarType::Half ||
+//            scalar_type == at::ScalarType::BFloat16) &&
+// #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010)
+//           mat2_sizes[0] > 1 && mat2_sizes[1] > 1;
+// #else
+//           mat2_sizes[0] > 1 && mat2_sizes[1] > 1 &&
+//           mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 &&
+//           mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 &&
+//           // avoid leading dim >> rows bugs
+//           ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) ||
+//            (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) ||
+//            (scalar_type != at::ScalarType::Half &&
+//             scalar_type != at::ScalarType::BFloat16)) &&
+//           ((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) ||
+//            (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) ||
+//            (scalar_type != at::ScalarType::Half &&
+//             scalar_type != at::ScalarType::BFloat16));
+// #endif
+//     }
+// #endif
+
+#define USE_CUBLASLT
+
+#ifdef USE_CUBLASLT
+template <typename Dtype>
+inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop, void* workspace, size_t workspace_size,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
+      Dtype *c, int64_t ldc, cudaStream_t stream);
+#else
+template <typename Dtype>
+inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
+      Dtype *c, int64_t ldc, cudaStream_t stream);
+#endif
+
+// Wrapper for gemm
+// Adopted from pytorch: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/CUDABlas.cpp
+class GemmEngine {
+public:
+    // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
+    // defaultlt setting workspace size to 1M.
+    GemmEngine(cublasHandle_t blas_, cublasLtHandle_t blasLt_, cudaDeviceProp* device_prop_ = nullptr, size_t workspace_size_ = 1024 * 1024);
+    void assign_workspace(void* workspace_, size_t workspace_size_);
+
+    template <typename Dtype>
+    void gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+          const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta, Dtype *c, int64_t ldc, cudaStream_t stream);
+
+public:
+  cublasHandle_t blas;
+  cublasLtHandle_t blasLt;
+  cudaDeviceProp* device_prop;
+  size_t workspace_size; // in bytes
+  void* workspace;
+};
+
+
+
+} // namespace Internal
+
+#endif // GEMM_IMPL_H
diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h
index f8bf67b3e..ab1a53850 100644
--- a/include/flexflow/utils/cuda_helper.h
+++ b/include/flexflow/utils/cuda_helper.h
@@ -4,6 +4,7 @@
 #include "flexflow/ffconst.h"
 #include "legion.h"
 #include <cublas_v2.h>
+#include <cublasLt.h>
 #include <cudnn.h>
 #ifdef FF_USE_NCCL
 #include <nccl.h>
diff --git a/src/ops/fused.cu b/src/ops/fused.cu
index f9c85c123..78983d579 100644
--- a/src/ops/fused.cu
+++ b/src/ops/fused.cu
@@ -133,6 +133,8 @@ __host__ void FusedOp::forward_task(Task const *task,
   for (int op = start + 1; op < fused->numOperators; op++) {
     if (metas->meta[op] != NULL) {
       assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas);
+      assert(metas->meta[start]->handle.blasLt ==
+             metas->meta[op]->handle.blasLt);
       assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn);
     }
   }
@@ -594,6 +596,8 @@ __host__ void
   for (int op = start + 1; op < fused->numOperators; op++) {
     if (metas->meta[op] != NULL) {
       assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas);
+      assert(metas->meta[start]->handle.blasLt ==
+             metas->meta[op]->handle.blasLt);
       assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn);
     }
   }
@@ -1305,6 +1309,8 @@ __host__ void FusedOp::backward_task(Task const *task,
   for (int op = start + 1; op < fused->numOperators; op++) {
     if (metas->meta[op] != NULL) {
       assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas);
+      assert(metas->meta[start]->handle.blasLt ==
+             metas->meta[op]->handle.blasLt);
       assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn);
     }
   }
diff --git a/src/ops/kernels/gemm_impl.cu b/src/ops/kernels/gemm_impl.cu
new file mode 100644
index 000000000..c1199294d
--- /dev/null
+++ b/src/ops/kernels/gemm_impl.cu
@@ -0,0 +1,347 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/ops/kernels/gemm_impl.h"
+#include "flexflow/utils/cuda_helper.h"
+#include <memory>
+
+namespace Internal {
+
+GemmEngine::GemmEngine(cublasHandle_t blas_, cublasLtHandle_t blasLt_, cudaDeviceProp* device_prop_, size_t workspace_size_) {
+  blas = blas_;
+  blasLt = blasLt_;
+  if (device_prop_ == nullptr) {
+    device_prop = new cudaDeviceProp;
+  } else {
+    device_prop = device_prop_;
+  }
+  workspace_size = workspace_size_;
+  workspace = nullptr;
+}
+
+void GemmEngine::assign_workspace(void* workspace_, size_t workspace_size_) {
+  assert(workspace_size_ >= workspace_size);
+  workspace = workspace_;
+}
+
+template <typename Dtype>
+void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta, Dtype *c, int64_t ldc, cudaStream_t stream) {
+  static_assert(false && sizeof(Dtype), "gemm_internal: not implemented");
+}
+
+#ifdef USE_CUBLASLT
+/* Implementations for gemm_internal_cublaslt */
+template <typename T, cublasStatus_t (*destructor)(T*)>
+struct CuBlasLtDeleter {
+  void operator()(T* x) {
+    if (x != nullptr) {
+      checkCUDA(destructor(x));
+    }
+  }
+};
+
+template <typename T, cublasStatus_t (*destructor)(T*)>
+class CuBlasLtDescriptor {
+ public:
+  T* descriptor() const {
+    return descriptor_.get();
+  }
+  T* descriptor() {
+    return descriptor_.get();
+  }
+
+ protected:
+  std::unique_ptr<T, CuBlasLtDeleter<T, destructor>> descriptor_;
+};
+
+class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
+                                     cublasLtMatmulDescOpaque_t,
+                                     &cublasLtMatmulDescDestroy> {
+ public:
+  CuBlasLtMatmulDescriptor(
+      cublasComputeType_t compute_type,
+      cudaDataType_t scale_type) {
+    cublasLtMatmulDesc_t raw_descriptor = nullptr;
+    checkCUDA(
+        cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
+    descriptor_.reset(raw_descriptor);
+  }
+  template <typename T>
+  inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
+    // NOLINTNEXTLINE(bugprone-sizeof-expression)
+    checkCUDA(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  }
+};
+
+class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
+                                 cublasLtMatrixLayoutOpaque_t,
+                                 &cublasLtMatrixLayoutDestroy> {
+ public:
+  CuBlasLtMatrixLayout(
+      cudaDataType_t type,
+      uint64_t rows,
+      uint64_t cols,
+      int64_t ld,
+      bool t = false) {
+    cublasLtMatrixLayout_t raw_descriptor = nullptr;
+    checkCUDA(
+        cublasLtMatrixLayoutCreate(&raw_descriptor, type, t ? cols : rows, t ? rows : cols, ld));
+    descriptor_.reset(raw_descriptor);
+  }
+  template <typename T>
+  inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
+    checkCUDA(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  }
+};
+
+class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
+                                     cublasLtMatmulPreferenceOpaque_t,
+                                     &cublasLtMatmulPreferenceDestroy> {
+ public:
+  CuBlasLtMatmulPreference() {
+    cublasLtMatmulPreference_t raw_descriptor = nullptr;
+    checkCUDA(cublasLtMatmulPreferenceCreate(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+  template <typename T>
+  inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
+    checkCUDA(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  }
+};
+
+inline uint32_t _getAlignment(uintptr_t address) {
+  // alignment are in bytes
+  uint32_t alignment = 256;
+  for (; ; alignment /= 2) {
+    if (!(address % alignment)) {
+      return alignment;
+    }
+  }
+}
+
+template <typename Dtype>
+inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop, void* workspace, size_t workspace_size,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
+      Dtype *c, int64_t ldc, cudaStream_t stream) {
+  assert(workspace != nullptr && "workspace must be provided.");
+  cudaDataType_t abcType = CUDA_R_32F;
+  cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
+  cudaDataType_t scaleType = CUDA_R_32F;
+  if constexpr (std::is_same_v<Dtype, double>) {
+    abcType = CUDA_R_64F;
+    computeType = CUBLAS_COMPUTE_64F;
+    scaleType = CUDA_R_64F;
+  } else if constexpr (std::is_same_v<Dtype, float>) {
+    computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
+  } else if constexpr (std::is_same_v<Dtype, half>) {
+    abcType = CUDA_R_16F;
+    computeType = CUBLAS_COMPUTE_16F;
+  } else {
+    static_assert(false && sizeof(Dtype), "bgemm_internal_cublaslt: not implemented");
+  }
+
+  CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
+  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa);
+  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb);
+  CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, transa == CUBLAS_OP_T);
+  CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, transb == CUBLAS_OP_T);
+  CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc);
+
+  CuBlasLtMatmulPreference preference;
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspace_size);
+
+  uint32_t a_alignment = _getAlignment(reinterpret_cast<uintptr_t>(a));
+  uint32_t b_alignment = _getAlignment(reinterpret_cast<uintptr_t>(b));
+  uint32_t c_alignment = _getAlignment(reinterpret_cast<uintptr_t>(c));
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, a_alignment);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, b_alignment);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, c_alignment);
+
+  cublasLtMatmulHeuristicResult_t heuristicResult = {};
+  int returnedResult = 0;
+  checkCUDA(cublasLtMatmulAlgoGetHeuristic(
+      handle,
+      computeDesc.descriptor(),
+      Adesc.descriptor(),
+      Bdesc.descriptor(),
+      Cdesc.descriptor(),
+      Cdesc.descriptor(),
+      preference.descriptor(),
+      1,
+      &heuristicResult,
+      &returnedResult));
+  if (returnedResult == 0) {
+    assert(false && "cuBLASLt failed to find a valid algorithm.");
+  }
+
+  checkCUDA(cublasLtMatmul(
+      handle,
+      computeDesc.descriptor(),
+      &alpha,
+      a,
+      Adesc.descriptor(),
+      b,
+      Bdesc.descriptor(),
+      &beta,
+      c,
+      Cdesc.descriptor(),
+      c,
+      Cdesc.descriptor(),
+      &heuristicResult.algo,
+      workspace,
+      workspace_size,
+      stream));
+}
+#else
+/* Implementations for gemm_internal_cublas */
+template <typename Dtype>
+inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
+      Dtype *c, int64_t ldc, cudaStream_t stream) {
+  static_assert(false && sizeof(Dtype), "gemm_internal_cublas: not implemented");
+}
+
+template <>
+void gemm_internal_cublas<double>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
+      const double *a, int64_t lda, const double *b, int64_t ldb, double beta,
+      double *c, int64_t ldc, cudaStream_t stream) {
+  checkCUDA(cublasDgemm(
+      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
+}
+
+template <>
+void gemm_internal_cublas<float>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
+      const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
+      float *c, int64_t ldc, cudaStream_t stream) {
+  checkCUDA(cublasSgemm(
+      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
+  // checkCUDA(cublasGemmEx(
+  //     handle,
+  //     transa,
+  //     transb,
+  //     m,
+  //     n,
+  //     k,
+  //     &alpha,
+  //     a,
+  //     CUDA_R_32F,
+  //     lda,
+  //     b,
+  //     CUDA_R_32F,
+  //     ldb,
+  //     &beta,
+  //     c,
+  //     CUDA_R_32F,
+  //     ldc,
+  //     CUBLAS_COMPUTE_32F_FAST_16F,
+  //     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+}
+
+template <>
+void gemm_internal_cublas<half>(cublasHandle_t handle, cudaDeviceProp* prop,
+      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
+      const half *a, int64_t lda, const half *b, int64_t ldb, half beta,
+      half *c, int64_t ldc, cudaStream_t stream) {
+  if (prop->major >= 5) {
+    // Disallow fp16 reductions that could lead to unexpected overflow issues.
+    // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
+    // if (!at::globalContext().allowFP16ReductionCuBLAS()) {
+    //   cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    // }
+    // checkCUDA(cublasSetMathMode(handle, cublas_flags));
+    checkCUDA(cublasGemmEx(
+        handle,
+        transa,
+        transb,
+        m,
+        n,
+        k,
+        &alpha,
+        a,
+        CUDA_R_16F,
+        lda,
+        b,
+        CUDA_R_16F,
+        ldb,
+        &beta,
+        c,
+        CUDA_R_16F,
+        ldc,
+        CUBLAS_COMPUTE_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
+  } else {
+    float falpha = alpha;
+    float fbeta = beta;
+    checkCUDA(cublasSgemmEx(
+        handle,
+        transa,
+        transb,
+        m,
+        n,
+        k,
+        &falpha,
+        a,
+        CUDA_R_16F,
+        lda,
+        b,
+        CUDA_R_16F,
+        ldb,
+        &fbeta,
+        c,
+        CUDA_R_16F,
+        ldc));
+  }
+}
+#endif
+
+template <>
+void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
+      const double *a, int64_t lda, const double *b, int64_t ldb, double beta, double *c, int64_t ldc, cudaStream_t stream) {
+#ifdef USE_CUBLASLT
+  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
+   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#else
+  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#endif
+}
+
+template <>
+void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
+      const float *a, int64_t lda, const float *b, int64_t ldb, float beta, float *c, int64_t ldc, cudaStream_t stream) {
+#ifdef USE_CUBLASLT
+  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
+   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#else
+  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#endif
+}
+
+template <>
+void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
+      const half *a, int64_t lda, const half *b, int64_t ldb, half beta, half *c, int64_t ldc, cudaStream_t stream) {
+#ifdef USE_CUBLASLT
+  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
+   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#else
+  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+#endif
+}
+} // namespace Internal
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 1f5ae1bb9..e30aed528 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -187,25 +187,20 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
     // matrix B's layout: [hidden_size (hidden_dim), num_new_tokens]
     // matrix C: devQKVProjArray
     // matrix B's layout: [qk_dim, num_heads, 3, num_new_tokens]
-    checkCUDA(cublasGemmEx(m->handle.blas,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           m_,
-                           n,
-                           k,
-                           &alpha,
-                           weight_ptr,
-                           cublas_data_type,
-                           lda,
-                           input_ptr,
-                           cublas_data_type,
-                           ldb,
-                           &beta,
-                           output_ptr,
-                           cublas_data_type,
-                           ldc,
-                           compute_type,
-                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
+                                          CUBLAS_OP_N,
+                                          m_,
+                                          n,
+                                          k,
+                                          alpha,
+                                          weight_ptr,
+                                          lda,
+                                          input_ptr,
+                                          ldb,
+                                          beta,
+                                          output_ptr,
+                                          ldc,
+                                          stream);
   }
 
   //   checkCUDA(cudaEventRecord(t_end, stream));
@@ -812,25 +807,20 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
     // matrix B's layout: [o_dim, num_new_tokens]
     DT *C = static_cast<DT *>(output_ptr);
 
-    checkCUDA(cublasGemmEx(m->handle.blas,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           m_,
-                           n,
-                           k,
-                           &alpha,
-                           A,
-                           cublas_data_type,
-                           lda,
-                           B,
-                           cublas_data_type,
-                           ldb,
-                           &beta,
-                           C,
-                           cublas_data_type,
-                           ldc,
-                           compute_type,
-                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
+                                          CUBLAS_OP_N,
+                                          m_,
+                                          n,
+                                          k,
+                                          alpha,
+                                          A,
+                                          lda,
+                                          B,
+                                          ldb,
+                                          beta,
+                                          C,
+                                          ldc,
+                                          stream);
   }
   // Add final output bias
   if (*m->final_bias && shard_id == 0) {
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 8f757f311..644725ffd 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -264,14 +264,6 @@ __global__ void AddBiasWithReLU(DT *output_ptr,
   }
 }
 
-template <typename Dtype>
-inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
-      Dtype *c, int64_t ldc) {
-  static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
-}
-
 template <typename DT>
 void forward_kernel(LinearMeta const *m,
                     void const *input_ptr,
@@ -345,21 +337,20 @@ void forward_kernel(LinearMeta const *m,
     compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
   }
 #endif
-  gemm_internal_cublas(m->handle.blas,
-                       m->handle.device_prop,
-                       CUBLAS_OP_T,
-                       CUBLAS_OP_N,
-                       out_dim,
-                       batch_size,
-                       in_dim,
-                       alpha,
-                       weight_p,
-                       in_dim,
-                       input_p,
-                       in_dim,
-                       beta,
-                       output_p,
-                       out_dim);
+  m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
+                                        CUBLAS_OP_N,
+                                        out_dim,
+                                        batch_size,
+                                        in_dim,
+                                        alpha,
+                                        weight_p,
+                                        in_dim,
+                                        input_p,
+                                        in_dim,
+                                        beta,
+                                        output_p,
+                                        out_dim,
+                                        stream);
   // use_bias = True
   if (bias_ptr != NULL) {
     // fuse bias and relu
@@ -554,81 +545,6 @@ __global__ void build_one_ptr(DT *one_ptr, int batch_size) {
   }
 }
 
-template <>
-void gemm_internal_cublas<double>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
-      const double *a, int64_t lda, const double *b, int64_t ldb, double beta,
-      double *c, int64_t ldc) {
-  checkCUDA(cublasDgemm(
-      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
-}
-
-template <>
-void gemm_internal_cublas<float>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
-      const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-      float *c, int64_t ldc) {
-  checkCUDA(cublasSgemm(
-      handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
-}
-
-template <>
-void gemm_internal_cublas<half>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
-      const half *a, int64_t lda, const half *b, int64_t ldb, half beta,
-      half *c, int64_t ldc) {
-  if (prop->major >= 5) {
-    // Disallow fp16 reductions that could lead to unexpected overflow issues.
-    // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
-    // if (!at::globalContext().allowFP16ReductionCuBLAS()) {
-    //   cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
-    // }
-    // checkCUDA(cublasSetMathMode(handle, cublas_flags));
-    checkCUDA(cublasGemmEx(
-        handle,
-        transa,
-        transb,
-        m,
-        n,
-        k,
-        &alpha,
-        a,
-        CUDA_R_16F,
-        lda,
-        b,
-        CUDA_R_16F,
-        ldb,
-        &beta,
-        c,
-        CUDA_R_16F,
-        ldc,
-        CUBLAS_COMPUTE_16F,
-        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
-  } else {
-    float falpha = alpha;
-    float fbeta = beta;
-    checkCUDA(cublasSgemmEx(
-        handle,
-        transa,
-        transb,
-        m,
-        n,
-        k,
-        &falpha,
-        a,
-        CUDA_R_16F,
-        lda,
-        b,
-        CUDA_R_16F,
-        ldb,
-        &fbeta,
-        c,
-        CUDA_R_16F,
-        ldc));
-  }
-}
-
 } // namespace Internal
 } // namespace Linear
 } // namespace Kernels
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 45fdf8610..00034582a 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -100,10 +100,18 @@ FFHandler
   assert(handle.tree_verify_attention_metadata != nullptr &&
          "Attention metadata must be allocated");
   checkCUDA(cublasCreate(&handle.blas));
+  checkCUDA(cublasLtCreate(&handle.blasLt));
   if (handle.allowTensorOpMathConversion) {
     checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH));
   }
   checkCUDNN(cudnnCreate(&handle.dnn));
+  handle.num_devices = 0;
+  handle.device_id = 0;
+  handle.gemm_engine = new Internal::GemmEngine(handle.blas, handle.blasLt);
+  // We may not use all devices, physical_device may not be successive, so we explicitly get the physical device id
+  int physical_device;
+  checkCUDA(cudaGetDevice(&physical_device));
+  checkCUDA(cudaGetDeviceProperties(handle.gemm_engine->device_prop, physical_device));
   // #ifdef FF_USE_NCCL
   //   checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId,
   //   info->myRank)); fprintf(stderr, "handle.nccl(%p)\n", handle.nccl);
@@ -164,7 +172,8 @@ FFHandler
   if (handle.batch_config_metadata_size +
           handle.incr_attention_metadata->mem_size() +
           handle.tree_search_attention_metadata->mem_size() +
-          handle.tree_verify_attention_metadata->mem_size() >
+          handle.tree_verify_attention_metadata->mem_size() +
+          handle.gemm_engine->workspace_size >
       0) {
     // allocate memory for offload reserve space
     Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine())
@@ -177,7 +186,8 @@ FFHandler
             handle.batch_config_metadata_size +
             handle.incr_attention_metadata->mem_size() +
             handle.tree_search_attention_metadata->mem_size() +
-            handle.tree_verify_attention_metadata->mem_size() - 1));
+            handle.tree_verify_attention_metadata->mem_size() +
+            handle.gemm_engine->workspace_size - 1));
     std::vector<size_t> field_sizes;
     field_sizes.push_back(sizeof(char));
     Realm::RegionInstance workspaceInst;
@@ -205,21 +215,22 @@ FFHandler
                             handle.incr_attention_metadata->mem_size() +
                             handle.tree_search_attention_metadata->mem_size()),
         handle.tree_verify_attention_metadata->mem_size());
+    handle.gemm_engine->assign_workspace(
+        static_cast<void *>(static_cast<char *>(handle.batch_config_metadata) +
+                            handle.batch_config_metadata_size +
+                            handle.incr_attention_metadata->mem_size() +
+                            handle.tree_search_attention_metadata->mem_size() +
+                            handle.tree_verify_attention_metadata->mem_size()),
+        handle.gemm_engine->workspace_size);
   } else {
     handle.batch_config_metadata = nullptr;
     handle.incr_attention_metadata->assign_address(nullptr, 0);
     handle.tree_search_attention_metadata->assign_address(nullptr, 0);
     handle.tree_verify_attention_metadata->assign_address(nullptr, 0);
+    handle.gemm_engine->assign_workspace(nullptr, 0);
   }
 
   // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize));
-  handle.num_devices = 0;
-  handle.device_id = 0;
-  // We may not use all devices, so physical_device may not be successive
-  int physical_device;
-  checkCUDA(cudaGetDevice(&physical_device));
-  handle.device_prop = new cudaDeviceProp;
-  checkCUDA(cudaGetDeviceProperties(handle.device_prop, physical_device));
 #ifdef FF_USE_NCCL
   handle.ncclComm = NULL;
 #endif

From 115a3ff5a3245171281303379f0eb96deb0413c7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 29 Nov 2024 09:40:32 -0800
Subject: [PATCH 647/667] chore: remove unused

---
 .../inc_multihead_self_attention_kernels.cu   | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index e30aed528..5e19160e5 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -149,18 +149,6 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
   checkCUDA(cublasSetStream(m->handle.blas, stream));
   checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
-#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-  cudaDataType_t compute_type = cublas_data_type;
-#else
-  // For best performance, set the default cublas compute type to
-  // CUBLAS_COMPUTE_16F for half precision and to
-  // CUBLAS_COMPUTE_32F_FAST_16F for full precision
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-  if (m->output_type[0] == DT_FLOAT) {
-    compute_type = CUBLAS_COMPUTE_32F_FAST_16F;
-  }
-#endif
-
   //   int device;
   //   checkCUDA(cudaGetDevice(&device));
   //   cudaEvent_t t_start, t_end;
@@ -779,13 +767,6 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
   cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]);
   cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
   assert(data_type_size(m->output_type[0]) == sizeof(DT));
-#if CUDA_VERSION >= 11000
-  // TODO: currently set the default to CUBLAS_COMPUTE_16F for best
-  // performance
-  cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
-#else
-  cudaDataType_t compute_type = cublas_data_type;
-#endif
   // Project to output, save result directly on output tensor
   {
     DT alpha = 1.0f, beta = 0.0f;

From 1a5803e539ed8c7fe1d98cac8a3a01931ca9d671 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Dec 2024 02:41:03 -0800
Subject: [PATCH 648/667] feat: add absolute slo constraint

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 27 +++++++++++++++++++--------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index f86b234a0..7dbcc8af3 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -349,6 +349,7 @@ class RequestManager {
   bool get_spec_infer_old_version();
   bool get_greedy_schedule();
   bool get_equal_schedule();
+  inline double get_slo_constraint(Request &request);
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 94aa0a261..45b5ab627 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -358,9 +358,18 @@ bool RequestManager::get_equal_schedule() {
   return equal_schedule;
 }
 
+inline double RequestManager::get_slo_constraint(Request &request) {
+  if (request.get_slo_ratio() < 0) {
+    // we use negative number to specify the absolute slo constraint (ms)
+    return -request.get_slo_ratio();
+  } else {
+    // relative slo constraint upon the baseline latency
+    return request.get_slo_ratio() * baseline_latency_ms;
+  }
+}
+
 double RequestManager::get_request_expected_latency(Request &request) {
-  return request.get_slo_ratio() * baseline_latency_ms *
-         request.decode_length();
+  return get_slo_constraint(request) * request.decode_length();
 }
 
 Request &RequestManager::get_request_with_guid(RequestGuid guid) {
@@ -506,6 +515,9 @@ RequestManager::RequestGuid
   // std::cout << "[slo ratio] " << req.slo_ratio << std::endl;
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
   request.set_slo_ratio(req.slo_ratio);
+  printf("Registered as request[%ld] with slo %.3f ms\n",
+           request.guid,
+           get_slo_constraint(request));
 
   if (get_num_ssms() == 0) {
     std::cout << "No small speculative model registered, using incremental "
@@ -1793,7 +1805,7 @@ bool RequestManager::update_llm_verify_results(
     bool current_attained =
         request.decode_latency_ms <=
         get_request_expected_latency(request) +
-            get_baseline_latency() * request.get_slo_ratio() * 6;
+            get_slo_constraint(request) * 6;
 
     // Initialize the token tree for the request
     init_token_tree(guid);
@@ -2422,9 +2434,8 @@ std::vector<GenerationResult>
   for (size_t i = 0; i < requests.size(); i++) {
     requests[i].slo_ratio = emission_machine.sample_slo_ratio();
     requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms();
-    printf("Prompt[%ld] with slo %.3f: %s\n",
+    printf("Prompt[%ld]: %s\n",
            i,
-           requests[i].slo_ratio,
            requests[i].prompt.c_str());
     RequestManager::RequestGuid guid = rm->register_new_request(requests[i]);
     if (guid != RequestManager::INVALID_GUID) {
@@ -3140,7 +3151,7 @@ void RequestManager::prune_token_tree() {
     RequestGuid guid = guid_of_requests[request_index];
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
-    if (request.get_slo_ratio() > 999) {
+    if (request.get_slo_ratio() > 999) { // infinity
       continue;
     }
     double spare_latency =
@@ -3219,10 +3230,10 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
   Request &request = all_requests[guid];
   double num_tokens_to_decode_per_step =
       (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
-      (baseline_latency_ms * request.get_slo_ratio());
+      get_slo_constraint(request);
   double expected_num_tokens_decoded =
       request.decode_latency_ms /
-      (baseline_latency_ms * request.get_slo_ratio());
+      get_slo_constraint(request);
 
   double num_tokens_to_decode =
       max(1.0,

From 7e296652d93134503359edb7231b06b0f991d1b0 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Dec 2024 02:48:51 -0800
Subject: [PATCH 649/667] style: format

---
 include/flexflow/config.h                     |   2 +-
 include/flexflow/ops/kernels/gemm_impl.h      |  90 +++-
 include/flexflow/utils/cuda_helper.h          |   2 +-
 src/ops/kernels/gemm_impl.cu                  | 508 +++++++++++++-----
 .../inc_multihead_self_attention_kernels.cu   |  52 +-
 src/ops/kernels/linear_kernels.cu             |  29 +-
 src/runtime/model.cu                          |   6 +-
 src/runtime/request_manager.cc                |  14 +-
 8 files changed, 478 insertions(+), 225 deletions(-)

diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index 45afffc0e..1aa80112b 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -23,8 +23,8 @@
 #include <cstddef>
 #include <cstring>
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-#include <cublas_v2.h>
 #include <cublasLt.h>
+#include <cublas_v2.h>
 #include <cudnn.h>
 #elif defined(FF_USE_HIP_ROCM)
 #include <hipblas/hipblas.h>
diff --git a/include/flexflow/ops/kernels/gemm_impl.h b/include/flexflow/ops/kernels/gemm_impl.h
index 4d60da91e..f0e08a67d 100644
--- a/include/flexflow/ops/kernels/gemm_impl.h
+++ b/include/flexflow/ops/kernels/gemm_impl.h
@@ -1,8 +1,8 @@
 #ifndef GEMM_IMPL_H
 #define GEMM_IMPL_H
 
-#include <cublas_v2.h>
 #include <cublasLt.h>
+#include <cublas_v2.h>
 
 namespace Internal {
 
@@ -10,13 +10,15 @@ namespace Internal {
 // #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040))
 //     // Strangely, if mat2 has only 1 row or column, we get
 //     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
-//     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
+//     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] ==
+//     mat2_sizes[1]
 //     // is to use lt interface only when self is bias.
 //     // for cuda 11.4, cublasLtMatmul is activated
 //     // the last two conditions is to skip 16b transA and non-trans-B having
 //     // leading dim >> rows when they are sliced from a large tensor
-//     // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
-//     if (!disable_addmm_cuda_lt) {
+//     // see
+//     fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul if
+//     (!disable_addmm_cuda_lt) {
 //       useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
 //           result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
 //           self.is_contiguous() && result.is_contiguous() &&
@@ -47,41 +49,81 @@ namespace Internal {
 
 #ifdef USE_CUBLASLT
 template <typename Dtype>
-inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop, void* workspace, size_t workspace_size,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
-      Dtype *c, int64_t ldc, cudaStream_t stream);
+inline void gemm_internal_cublaslt(cublasLtHandle_t handle,
+                                   cudaDeviceProp *prop,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   cublasOperation_t transa,
+                                   cublasOperation_t transb,
+                                   int64_t m,
+                                   int64_t n,
+                                   int64_t k,
+                                   Dtype alpha,
+                                   Dtype const *a,
+                                   int64_t lda,
+                                   Dtype const *b,
+                                   int64_t ldb,
+                                   Dtype beta,
+                                   Dtype *c,
+                                   int64_t ldc,
+                                   cudaStream_t stream);
 #else
 template <typename Dtype>
-inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
-      Dtype *c, int64_t ldc, cudaStream_t stream);
+inline void gemm_internal_cublas(cublasHandle_t handle,
+                                 cudaDeviceProp *prop,
+                                 cublasOperation_t transa,
+                                 cublasOperation_t transb,
+                                 int64_t m,
+                                 int64_t n,
+                                 int64_t k,
+                                 Dtype alpha,
+                                 Dtype const *a,
+                                 int64_t lda,
+                                 Dtype const *b,
+                                 int64_t ldb,
+                                 Dtype beta,
+                                 Dtype *c,
+                                 int64_t ldc,
+                                 cudaStream_t stream);
 #endif
 
 // Wrapper for gemm
-// Adopted from pytorch: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/CUDABlas.cpp
+// Adopted from pytorch:
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/cuda/CUDABlas.cpp
 class GemmEngine {
 public:
-    // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
-    // defaultlt setting workspace size to 1M.
-    GemmEngine(cublasHandle_t blas_, cublasLtHandle_t blasLt_, cudaDeviceProp* device_prop_ = nullptr, size_t workspace_size_ = 1024 * 1024);
-    void assign_workspace(void* workspace_, size_t workspace_size_);
+  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
+  // defaultlt setting workspace size to 1M.
+  GemmEngine(cublasHandle_t blas_,
+             cublasLtHandle_t blasLt_,
+             cudaDeviceProp *device_prop_ = nullptr,
+             size_t workspace_size_ = 1024 * 1024);
+  void assign_workspace(void *workspace_, size_t workspace_size_);
 
-    template <typename Dtype>
-    void gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-          const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta, Dtype *c, int64_t ldc, cudaStream_t stream);
+  template <typename Dtype>
+  void gemm_internal(cublasOperation_t transa,
+                     cublasOperation_t transb,
+                     int64_t m,
+                     int64_t n,
+                     int64_t k,
+                     Dtype alpha,
+                     Dtype const *a,
+                     int64_t lda,
+                     Dtype const *b,
+                     int64_t ldb,
+                     Dtype beta,
+                     Dtype *c,
+                     int64_t ldc,
+                     cudaStream_t stream);
 
 public:
   cublasHandle_t blas;
   cublasLtHandle_t blasLt;
-  cudaDeviceProp* device_prop;
+  cudaDeviceProp *device_prop;
   size_t workspace_size; // in bytes
-  void* workspace;
+  void *workspace;
 };
 
-
-
 } // namespace Internal
 
 #endif // GEMM_IMPL_H
diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h
index ab1a53850..f5ea76c5b 100644
--- a/include/flexflow/utils/cuda_helper.h
+++ b/include/flexflow/utils/cuda_helper.h
@@ -3,8 +3,8 @@
 #include "flexflow/accessor.h"
 #include "flexflow/ffconst.h"
 #include "legion.h"
-#include <cublas_v2.h>
 #include <cublasLt.h>
+#include <cublas_v2.h>
 #include <cudnn.h>
 #ifdef FF_USE_NCCL
 #include <nccl.h>
diff --git a/src/ops/kernels/gemm_impl.cu b/src/ops/kernels/gemm_impl.cu
index c1199294d..939eaeb3b 100644
--- a/src/ops/kernels/gemm_impl.cu
+++ b/src/ops/kernels/gemm_impl.cu
@@ -19,7 +19,10 @@
 
 namespace Internal {
 
-GemmEngine::GemmEngine(cublasHandle_t blas_, cublasLtHandle_t blasLt_, cudaDeviceProp* device_prop_, size_t workspace_size_) {
+GemmEngine::GemmEngine(cublasHandle_t blas_,
+                       cublasLtHandle_t blasLt_,
+                       cudaDeviceProp *device_prop_,
+                       size_t workspace_size_) {
   blas = blas_;
   blasLt = blasLt_;
   if (device_prop_ == nullptr) {
@@ -31,49 +34,60 @@ GemmEngine::GemmEngine(cublasHandle_t blas_, cublasLtHandle_t blasLt_, cudaDevic
   workspace = nullptr;
 }
 
-void GemmEngine::assign_workspace(void* workspace_, size_t workspace_size_) {
+void GemmEngine::assign_workspace(void *workspace_, size_t workspace_size_) {
   assert(workspace_size_ >= workspace_size);
   workspace = workspace_;
 }
 
 template <typename Dtype>
-void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta, Dtype *c, int64_t ldc, cudaStream_t stream) {
+void GemmEngine::gemm_internal(cublasOperation_t transa,
+                               cublasOperation_t transb,
+                               int64_t m,
+                               int64_t n,
+                               int64_t k,
+                               Dtype alpha,
+                               Dtype const *a,
+                               int64_t lda,
+                               Dtype const *b,
+                               int64_t ldb,
+                               Dtype beta,
+                               Dtype *c,
+                               int64_t ldc,
+                               cudaStream_t stream) {
   static_assert(false && sizeof(Dtype), "gemm_internal: not implemented");
 }
 
 #ifdef USE_CUBLASLT
 /* Implementations for gemm_internal_cublaslt */
-template <typename T, cublasStatus_t (*destructor)(T*)>
+template <typename T, cublasStatus_t (*destructor)(T *)>
 struct CuBlasLtDeleter {
-  void operator()(T* x) {
+  void operator()(T *x) {
     if (x != nullptr) {
       checkCUDA(destructor(x));
     }
   }
 };
 
-template <typename T, cublasStatus_t (*destructor)(T*)>
+template <typename T, cublasStatus_t (*destructor)(T *)>
 class CuBlasLtDescriptor {
- public:
-  T* descriptor() const {
+public:
+  T *descriptor() const {
     return descriptor_.get();
   }
-  T* descriptor() {
+  T *descriptor() {
     return descriptor_.get();
   }
 
- protected:
+protected:
   std::unique_ptr<T, CuBlasLtDeleter<T, destructor>> descriptor_;
 };
 
-class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
-                                     cublasLtMatmulDescOpaque_t,
-                                     &cublasLtMatmulDescDestroy> {
- public:
-  CuBlasLtMatmulDescriptor(
-      cublasComputeType_t compute_type,
-      cudaDataType_t scale_type) {
+class CuBlasLtMatmulDescriptor
+    : public CuBlasLtDescriptor<cublasLtMatmulDescOpaque_t,
+                                &cublasLtMatmulDescDestroy> {
+public:
+  CuBlasLtMatmulDescriptor(cublasComputeType_t compute_type,
+                           cudaDataType_t scale_type) {
     cublasLtMatmulDesc_t raw_descriptor = nullptr;
     checkCUDA(
         cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
@@ -82,50 +96,54 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
   template <typename T>
   inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
     // NOLINTNEXTLINE(bugprone-sizeof-expression)
-    checkCUDA(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
+    checkCUDA(::cublasLtMatmulDescSetAttribute(
+        descriptor(), attr, &value, sizeof(T)));
   }
 };
 
-class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
-                                 cublasLtMatrixLayoutOpaque_t,
-                                 &cublasLtMatrixLayoutDestroy> {
- public:
-  CuBlasLtMatrixLayout(
-      cudaDataType_t type,
-      uint64_t rows,
-      uint64_t cols,
-      int64_t ld,
-      bool t = false) {
+class CuBlasLtMatrixLayout
+    : public CuBlasLtDescriptor<cublasLtMatrixLayoutOpaque_t,
+                                &cublasLtMatrixLayoutDestroy> {
+public:
+  CuBlasLtMatrixLayout(cudaDataType_t type,
+                       uint64_t rows,
+                       uint64_t cols,
+                       int64_t ld,
+                       bool t = false) {
     cublasLtMatrixLayout_t raw_descriptor = nullptr;
-    checkCUDA(
-        cublasLtMatrixLayoutCreate(&raw_descriptor, type, t ? cols : rows, t ? rows : cols, ld));
+    checkCUDA(cublasLtMatrixLayoutCreate(
+        &raw_descriptor, type, t ? cols : rows, t ? rows : cols, ld));
     descriptor_.reset(raw_descriptor);
   }
   template <typename T>
-  inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) {
-    checkCUDA(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr,
+                           const T value) {
+    checkCUDA(::cublasLtMatrixLayoutSetAttribute(
+        descriptor(), attr, &value, sizeof(T)));
   }
 };
 
-class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
-                                     cublasLtMatmulPreferenceOpaque_t,
-                                     &cublasLtMatmulPreferenceDestroy> {
- public:
+class CuBlasLtMatmulPreference
+    : public CuBlasLtDescriptor<cublasLtMatmulPreferenceOpaque_t,
+                                &cublasLtMatmulPreferenceDestroy> {
+public:
   CuBlasLtMatmulPreference() {
     cublasLtMatmulPreference_t raw_descriptor = nullptr;
     checkCUDA(cublasLtMatmulPreferenceCreate(&raw_descriptor));
     descriptor_.reset(raw_descriptor);
   }
   template <typename T>
-  inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) {
-    checkCUDA(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr,
+                           const T value) {
+    checkCUDA(::cublasLtMatmulPreferenceSetAttribute(
+        descriptor(), attr, &value, sizeof(T)));
   }
 };
 
 inline uint32_t _getAlignment(uintptr_t address) {
   // alignment are in bytes
   uint32_t alignment = 256;
-  for (; ; alignment /= 2) {
+  for (;; alignment /= 2) {
     if (!(address % alignment)) {
       return alignment;
     }
@@ -133,10 +151,24 @@ inline uint32_t _getAlignment(uintptr_t address) {
 }
 
 template <typename Dtype>
-inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop, void* workspace, size_t workspace_size,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
-      Dtype *c, int64_t ldc, cudaStream_t stream) {
+inline void gemm_internal_cublaslt(cublasLtHandle_t handle,
+                                   cudaDeviceProp *prop,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   cublasOperation_t transa,
+                                   cublasOperation_t transb,
+                                   int64_t m,
+                                   int64_t n,
+                                   int64_t k,
+                                   Dtype alpha,
+                                   Dtype const *a,
+                                   int64_t lda,
+                                   Dtype const *b,
+                                   int64_t ldb,
+                                   Dtype beta,
+                                   Dtype *c,
+                                   int64_t ldc,
+                                   cudaStream_t stream) {
   assert(workspace != nullptr && "workspace must be provided.");
   cudaDataType_t abcType = CUDA_R_32F;
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
@@ -151,7 +183,8 @@ inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop
     abcType = CUDA_R_16F;
     computeType = CUBLAS_COMPUTE_16F;
   } else {
-    static_assert(false && sizeof(Dtype), "bgemm_internal_cublaslt: not implemented");
+    static_assert(false && sizeof(Dtype),
+                  "bgemm_internal_cublaslt: not implemented");
   }
 
   CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
@@ -162,74 +195,113 @@ inline void gemm_internal_cublaslt(cublasLtHandle_t handle, cudaDeviceProp* prop
   CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc);
 
   CuBlasLtMatmulPreference preference;
-  preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspace_size);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                          workspace_size);
 
   uint32_t a_alignment = _getAlignment(reinterpret_cast<uintptr_t>(a));
   uint32_t b_alignment = _getAlignment(reinterpret_cast<uintptr_t>(b));
   uint32_t c_alignment = _getAlignment(reinterpret_cast<uintptr_t>(c));
-  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, a_alignment);
-  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, b_alignment);
-  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, c_alignment);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES,
+                          a_alignment);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES,
+                          b_alignment);
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES,
+                          c_alignment);
 
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
   int returnedResult = 0;
-  checkCUDA(cublasLtMatmulAlgoGetHeuristic(
-      handle,
-      computeDesc.descriptor(),
-      Adesc.descriptor(),
-      Bdesc.descriptor(),
-      Cdesc.descriptor(),
-      Cdesc.descriptor(),
-      preference.descriptor(),
-      1,
-      &heuristicResult,
-      &returnedResult));
+  checkCUDA(cublasLtMatmulAlgoGetHeuristic(handle,
+                                           computeDesc.descriptor(),
+                                           Adesc.descriptor(),
+                                           Bdesc.descriptor(),
+                                           Cdesc.descriptor(),
+                                           Cdesc.descriptor(),
+                                           preference.descriptor(),
+                                           1,
+                                           &heuristicResult,
+                                           &returnedResult));
   if (returnedResult == 0) {
     assert(false && "cuBLASLt failed to find a valid algorithm.");
   }
 
-  checkCUDA(cublasLtMatmul(
-      handle,
-      computeDesc.descriptor(),
-      &alpha,
-      a,
-      Adesc.descriptor(),
-      b,
-      Bdesc.descriptor(),
-      &beta,
-      c,
-      Cdesc.descriptor(),
-      c,
-      Cdesc.descriptor(),
-      &heuristicResult.algo,
-      workspace,
-      workspace_size,
-      stream));
+  checkCUDA(cublasLtMatmul(handle,
+                           computeDesc.descriptor(),
+                           &alpha,
+                           a,
+                           Adesc.descriptor(),
+                           b,
+                           Bdesc.descriptor(),
+                           &beta,
+                           c,
+                           Cdesc.descriptor(),
+                           c,
+                           Cdesc.descriptor(),
+                           &heuristicResult.algo,
+                           workspace,
+                           workspace_size,
+                           stream));
 }
 #else
 /* Implementations for gemm_internal_cublas */
 template <typename Dtype>
-inline void gemm_internal_cublas(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, Dtype alpha,
-      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, Dtype beta,
-      Dtype *c, int64_t ldc, cudaStream_t stream) {
-  static_assert(false && sizeof(Dtype), "gemm_internal_cublas: not implemented");
+inline void gemm_internal_cublas(cublasHandle_t handle,
+                                 cudaDeviceProp *prop,
+                                 cublasOperation_t transa,
+                                 cublasOperation_t transb,
+                                 int64_t m,
+                                 int64_t n,
+                                 int64_t k,
+                                 Dtype alpha,
+                                 Dtype const *a,
+                                 int64_t lda,
+                                 Dtype const *b,
+                                 int64_t ldb,
+                                 Dtype beta,
+                                 Dtype *c,
+                                 int64_t ldc,
+                                 cudaStream_t stream) {
+  static_assert(false && sizeof(Dtype),
+                "gemm_internal_cublas: not implemented");
 }
 
 template <>
-void gemm_internal_cublas<double>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
-      const double *a, int64_t lda, const double *b, int64_t ldb, double beta,
-      double *c, int64_t ldc, cudaStream_t stream) {
+void gemm_internal_cublas<double>(cublasHandle_t handle,
+                                  cudaDeviceProp *prop,
+                                  cublasOperation_t transa,
+                                  cublasOperation_t transb,
+                                  int64_t m,
+                                  int64_t n,
+                                  int64_t k,
+                                  double alpha,
+                                  double const *a,
+                                  int64_t lda,
+                                  double const *b,
+                                  int64_t ldb,
+                                  double beta,
+                                  double *c,
+                                  int64_t ldc,
+                                  cudaStream_t stream) {
   checkCUDA(cublasDgemm(
       handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
 }
 
 template <>
-void gemm_internal_cublas<float>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
-      const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-      float *c, int64_t ldc, cudaStream_t stream) {
+void gemm_internal_cublas<float>(cublasHandle_t handle,
+                                 cudaDeviceProp *prop,
+                                 cublasOperation_t transa,
+                                 cublasOperation_t transb,
+                                 int64_t m,
+                                 int64_t n,
+                                 int64_t k,
+                                 float alpha,
+                                 float const *a,
+                                 int64_t lda,
+                                 float const *b,
+                                 int64_t ldb,
+                                 float beta,
+                                 float *c,
+                                 int64_t ldc,
+                                 cudaStream_t stream) {
   checkCUDA(cublasSgemm(
       handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
   // checkCUDA(cublasGemmEx(
@@ -255,93 +327,233 @@ void gemm_internal_cublas<float>(cublasHandle_t handle, cudaDeviceProp* prop,
 }
 
 template <>
-void gemm_internal_cublas<half>(cublasHandle_t handle, cudaDeviceProp* prop,
-      cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
-      const half *a, int64_t lda, const half *b, int64_t ldb, half beta,
-      half *c, int64_t ldc, cudaStream_t stream) {
+void gemm_internal_cublas<half>(cublasHandle_t handle,
+                                cudaDeviceProp *prop,
+                                cublasOperation_t transa,
+                                cublasOperation_t transb,
+                                int64_t m,
+                                int64_t n,
+                                int64_t k,
+                                half alpha,
+                                half const *a,
+                                int64_t lda,
+                                half const *b,
+                                int64_t ldb,
+                                half beta,
+                                half *c,
+                                int64_t ldc,
+                                cudaStream_t stream) {
   if (prop->major >= 5) {
     // Disallow fp16 reductions that could lead to unexpected overflow issues.
     // cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
     // if (!at::globalContext().allowFP16ReductionCuBLAS()) {
-    //   cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    //   cublas_flags = static_cast<cublasMath_t>(cublas_flags |
+    //   CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
     // }
     // checkCUDA(cublasSetMathMode(handle, cublas_flags));
-    checkCUDA(cublasGemmEx(
-        handle,
-        transa,
-        transb,
-        m,
-        n,
-        k,
-        &alpha,
-        a,
-        CUDA_R_16F,
-        lda,
-        b,
-        CUDA_R_16F,
-        ldb,
-        &beta,
-        c,
-        CUDA_R_16F,
-        ldc,
-        CUBLAS_COMPUTE_16F,
-        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+    checkCUDA(cublasGemmEx(handle,
+                           transa,
+                           transb,
+                           m,
+                           n,
+                           k,
+                           &alpha,
+                           a,
+                           CUDA_R_16F,
+                           lda,
+                           b,
+                           CUDA_R_16F,
+                           ldb,
+                           &beta,
+                           c,
+                           CUDA_R_16F,
+                           ldc,
+                           CUBLAS_COMPUTE_16F,
+                           CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     // checkCUDA(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
   } else {
     float falpha = alpha;
     float fbeta = beta;
-    checkCUDA(cublasSgemmEx(
-        handle,
-        transa,
-        transb,
-        m,
-        n,
-        k,
-        &falpha,
-        a,
-        CUDA_R_16F,
-        lda,
-        b,
-        CUDA_R_16F,
-        ldb,
-        &fbeta,
-        c,
-        CUDA_R_16F,
-        ldc));
+    checkCUDA(cublasSgemmEx(handle,
+                            transa,
+                            transb,
+                            m,
+                            n,
+                            k,
+                            &falpha,
+                            a,
+                            CUDA_R_16F,
+                            lda,
+                            b,
+                            CUDA_R_16F,
+                            ldb,
+                            &fbeta,
+                            c,
+                            CUDA_R_16F,
+                            ldc));
   }
 }
 #endif
 
 template <>
-void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, double alpha,
-      const double *a, int64_t lda, const double *b, int64_t ldb, double beta, double *c, int64_t ldc, cudaStream_t stream) {
+void GemmEngine::gemm_internal(cublasOperation_t transa,
+                               cublasOperation_t transb,
+                               int64_t m,
+                               int64_t n,
+                               int64_t k,
+                               double alpha,
+                               double const *a,
+                               int64_t lda,
+                               double const *b,
+                               int64_t ldb,
+                               double beta,
+                               double *c,
+                               int64_t ldc,
+                               cudaStream_t stream) {
 #ifdef USE_CUBLASLT
-  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
-   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublaslt(blasLt,
+                         device_prop,
+                         workspace,
+                         workspace_size,
+                         transa,
+                         transb,
+                         m,
+                         n,
+                         k,
+                         alpha,
+                         a,
+                         lda,
+                         b,
+                         ldb,
+                         beta,
+                         c,
+                         ldc,
+                         stream);
 #else
-  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublas(blas,
+                       device_prop,
+                       transa,
+                       transb,
+                       m,
+                       n,
+                       k,
+                       alpha,
+                       a,
+                       lda,
+                       b,
+                       ldb,
+                       beta,
+                       c,
+                       ldc,
+                       stream);
 #endif
 }
 
 template <>
-void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, float alpha,
-      const float *a, int64_t lda, const float *b, int64_t ldb, float beta, float *c, int64_t ldc, cudaStream_t stream) {
+void GemmEngine::gemm_internal(cublasOperation_t transa,
+                               cublasOperation_t transb,
+                               int64_t m,
+                               int64_t n,
+                               int64_t k,
+                               float alpha,
+                               float const *a,
+                               int64_t lda,
+                               float const *b,
+                               int64_t ldb,
+                               float beta,
+                               float *c,
+                               int64_t ldc,
+                               cudaStream_t stream) {
 #ifdef USE_CUBLASLT
-  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
-   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublaslt(blasLt,
+                         device_prop,
+                         workspace,
+                         workspace_size,
+                         transa,
+                         transb,
+                         m,
+                         n,
+                         k,
+                         alpha,
+                         a,
+                         lda,
+                         b,
+                         ldb,
+                         beta,
+                         c,
+                         ldc,
+                         stream);
 #else
-  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublas(blas,
+                       device_prop,
+                       transa,
+                       transb,
+                       m,
+                       n,
+                       k,
+                       alpha,
+                       a,
+                       lda,
+                       b,
+                       ldb,
+                       beta,
+                       c,
+                       ldc,
+                       stream);
 #endif
 }
 
 template <>
-void GemmEngine::gemm_internal(cublasOperation_t transa, cublasOperation_t transb, int64_t m, int64_t n, int64_t k, half alpha,
-      const half *a, int64_t lda, const half *b, int64_t ldb, half beta, half *c, int64_t ldc, cudaStream_t stream) {
+void GemmEngine::gemm_internal(cublasOperation_t transa,
+                               cublasOperation_t transb,
+                               int64_t m,
+                               int64_t n,
+                               int64_t k,
+                               half alpha,
+                               half const *a,
+                               int64_t lda,
+                               half const *b,
+                               int64_t ldb,
+                               half beta,
+                               half *c,
+                               int64_t ldc,
+                               cudaStream_t stream) {
 #ifdef USE_CUBLASLT
-  gemm_internal_cublaslt(blasLt, device_prop, workspace, workspace_size,
-   transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublaslt(blasLt,
+                         device_prop,
+                         workspace,
+                         workspace_size,
+                         transa,
+                         transb,
+                         m,
+                         n,
+                         k,
+                         alpha,
+                         a,
+                         lda,
+                         b,
+                         ldb,
+                         beta,
+                         c,
+                         ldc,
+                         stream);
 #else
-  gemm_internal_cublas(blas, device_prop, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, stream);
+  gemm_internal_cublas(blas,
+                       device_prop,
+                       transa,
+                       transb,
+                       m,
+                       n,
+                       k,
+                       alpha,
+                       a,
+                       lda,
+                       b,
+                       ldb,
+                       beta,
+                       c,
+                       ldc,
+                       stream);
 #endif
 }
 } // namespace Internal
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 5e19160e5..2ed554e43 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -176,19 +176,19 @@ void compute_qkv(IncMultiHeadSelfAttentionMeta const *m,
     // matrix C: devQKVProjArray
     // matrix B's layout: [qk_dim, num_heads, 3, num_new_tokens]
     m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
-                                          CUBLAS_OP_N,
-                                          m_,
-                                          n,
-                                          k,
-                                          alpha,
-                                          weight_ptr,
-                                          lda,
-                                          input_ptr,
-                                          ldb,
-                                          beta,
-                                          output_ptr,
-                                          ldc,
-                                          stream);
+                                         CUBLAS_OP_N,
+                                         m_,
+                                         n,
+                                         k,
+                                         alpha,
+                                         weight_ptr,
+                                         lda,
+                                         input_ptr,
+                                         ldb,
+                                         beta,
+                                         output_ptr,
+                                         ldc,
+                                         stream);
   }
 
   //   checkCUDA(cudaEventRecord(t_end, stream));
@@ -789,19 +789,19 @@ void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m,
     DT *C = static_cast<DT *>(output_ptr);
 
     m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
-                                          CUBLAS_OP_N,
-                                          m_,
-                                          n,
-                                          k,
-                                          alpha,
-                                          A,
-                                          lda,
-                                          B,
-                                          ldb,
-                                          beta,
-                                          C,
-                                          ldc,
-                                          stream);
+                                         CUBLAS_OP_N,
+                                         m_,
+                                         n,
+                                         k,
+                                         alpha,
+                                         A,
+                                         lda,
+                                         B,
+                                         ldb,
+                                         beta,
+                                         C,
+                                         ldc,
+                                         stream);
   }
   // Add final output bias
   if (*m->final_bias && shard_id == 0) {
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 644725ffd..2c049be68 100644
--- a/src/ops/kernels/linear_kernels.cu
+++ b/src/ops/kernels/linear_kernels.cu
@@ -324,7 +324,8 @@ void forward_kernel(LinearMeta const *m,
   cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]);
   assert(input_type == weight_type && weight_type == output_type);
   DT const *input_p = static_cast<DT const *>(input_ptr),
-          *weight_p = static_cast<DT const *>(m->offload ? m->weight_ptr : weight_ptr);
+           *weight_p =
+               static_cast<DT const *>(m->offload ? m->weight_ptr : weight_ptr);
   DT *output_p = static_cast<DT *>(output_ptr);
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
   cudaDataType_t compute_type = cublas_data_type;
@@ -338,19 +339,19 @@ void forward_kernel(LinearMeta const *m,
   }
 #endif
   m->handle.gemm_engine->gemm_internal(CUBLAS_OP_T,
-                                        CUBLAS_OP_N,
-                                        out_dim,
-                                        batch_size,
-                                        in_dim,
-                                        alpha,
-                                        weight_p,
-                                        in_dim,
-                                        input_p,
-                                        in_dim,
-                                        beta,
-                                        output_p,
-                                        out_dim,
-                                        stream);
+                                       CUBLAS_OP_N,
+                                       out_dim,
+                                       batch_size,
+                                       in_dim,
+                                       alpha,
+                                       weight_p,
+                                       in_dim,
+                                       input_p,
+                                       in_dim,
+                                       beta,
+                                       output_p,
+                                       out_dim,
+                                       stream);
   // use_bias = True
   if (bias_ptr != NULL) {
     // fuse bias and relu
diff --git a/src/runtime/model.cu b/src/runtime/model.cu
index 00034582a..962d2c345 100644
--- a/src/runtime/model.cu
+++ b/src/runtime/model.cu
@@ -108,10 +108,12 @@ FFHandler
   handle.num_devices = 0;
   handle.device_id = 0;
   handle.gemm_engine = new Internal::GemmEngine(handle.blas, handle.blasLt);
-  // We may not use all devices, physical_device may not be successive, so we explicitly get the physical device id
+  // We may not use all devices, physical_device may not be successive, so we
+  // explicitly get the physical device id
   int physical_device;
   checkCUDA(cudaGetDevice(&physical_device));
-  checkCUDA(cudaGetDeviceProperties(handle.gemm_engine->device_prop, physical_device));
+  checkCUDA(cudaGetDeviceProperties(handle.gemm_engine->device_prop,
+                                    physical_device));
   // #ifdef FF_USE_NCCL
   //   checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId,
   //   info->myRank)); fprintf(stderr, "handle.nccl(%p)\n", handle.nccl);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 45b5ab627..a4909e78c 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -516,8 +516,8 @@ RequestManager::RequestGuid
   request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end());
   request.set_slo_ratio(req.slo_ratio);
   printf("Registered as request[%ld] with slo %.3f ms\n",
-           request.guid,
-           get_slo_constraint(request));
+         request.guid,
+         get_slo_constraint(request));
 
   if (get_num_ssms() == 0) {
     std::cout << "No small speculative model registered, using incremental "
@@ -1804,8 +1804,7 @@ bool RequestManager::update_llm_verify_results(
         request.decode_latency_ms <= get_request_expected_latency(request);
     bool current_attained =
         request.decode_latency_ms <=
-        get_request_expected_latency(request) +
-            get_slo_constraint(request) * 6;
+        get_request_expected_latency(request) + get_slo_constraint(request) * 6;
 
     // Initialize the token tree for the request
     init_token_tree(guid);
@@ -2434,9 +2433,7 @@ std::vector<GenerationResult>
   for (size_t i = 0; i < requests.size(); i++) {
     requests[i].slo_ratio = emission_machine.sample_slo_ratio();
     requests[i].emission_time_ms = emission_machine.get_elapsed_time_ms();
-    printf("Prompt[%ld]: %s\n",
-           i,
-           requests[i].prompt.c_str());
+    printf("Prompt[%ld]: %s\n", i, requests[i].prompt.c_str());
     RequestManager::RequestGuid guid = rm->register_new_request(requests[i]);
     if (guid != RequestManager::INVALID_GUID) {
       guids.push_back(guid);
@@ -3232,8 +3229,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
       (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
       get_slo_constraint(request);
   double expected_num_tokens_decoded =
-      request.decode_latency_ms /
-      get_slo_constraint(request);
+      request.decode_latency_ms / get_slo_constraint(request);
 
   double num_tokens_to_decode =
       max(1.0,

From afaa88fe23263d4bd4b9ea6235f362e93fa45903 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Dec 2024 08:58:56 -0800
Subject: [PATCH 650/667] feat: add seperate server baseline

---
 inference/incr_decoding/incr_decoding.cc | 31 +++++++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 959535e0d..7f04beae0 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -57,6 +57,8 @@ void parse_input_args(char **argv,
                       double &baseline_latency_ms,
                       double &ssm_spec_latency_ms,
                       double &llm_verify_latency_ms,
+                      double &slo_filter,
+                      int &replica,
                       double &request_per_second,
                       std::string &emission_file_path,
                       bool &add_special_tokens) {
@@ -158,6 +160,14 @@ void parse_input_args(char **argv,
       llm_verify_latency_ms = std::stod(argv[++i]);
       continue;
     }
+    if (!strcmp(argv[i], "--eval-slo-filter")) {
+      slo_filter = std::stod(argv[++i]);
+      continue;
+    }
+    if (!strcmp(argv[i], "--eval-replica")) {
+      replica = std::stoi(argv[++i]);
+      continue;
+    }
     if (!strcmp(argv[i], "--request-per-second")) {
       request_per_second = std::stod(argv[++i]);
       continue;
@@ -212,6 +222,8 @@ void FlexFlow::top_level_task(Task const *task,
   double baseline_latency_ms = 50;
   double ssm_spec_latency_ms = 20;
   double llm_verify_latency_ms = 50;
+  double slo_filter = 0.0;
+  int replica = 1;
   double request_per_second = 1.0;
   bool add_special_tokens = true;
   std::string emission_file_path;
@@ -240,6 +252,8 @@ void FlexFlow::top_level_task(Task const *task,
                    baseline_latency_ms,
                    ssm_spec_latency_ms,
                    llm_verify_latency_ms,
+                   slo_filter,
+                   replica,
                    request_per_second,
                    emission_file_path,
                    add_special_tokens);
@@ -249,6 +263,9 @@ void FlexFlow::top_level_task(Task const *task,
   if (max_tokens_per_prefilling_batch == -1) {
     max_tokens_per_prefilling_batch = max_tokens_per_batch;
   }
+  if (slo_filter == 0.0) {
+    replica = 1;
+  }
 
   assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree *
              ffconfig.pipeline_parallelism_degree ==
@@ -430,10 +447,16 @@ void FlexFlow::top_level_task(Task const *task,
       std::vector<double> timestamps, ratios;
       for (auto const &json_obj : trace_json) {
         EmissionTrace trace(json_obj);
-        requests.push_back(
-            GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
-        timestamps.push_back(trace.emission_time_ms);
-        ratios.push_back(trace.slo_ratio);
+        if (slo_filter != 0.0 &&
+            std::fabs(trace.slo_ratio - slo_filter) > 1e-6) {
+          continue;
+        }
+        for (size_t i = 0; i < replica; ++i) {
+          requests.push_back(
+              GenerationRequest(trace.prompt, -1.0, 0, add_special_tokens));
+          timestamps.push_back(trace.emission_time_ms);
+          ratios.push_back(trace.slo_ratio);
+        }
       }
       TraceEmissionMachine emission_machine(timestamps, ratios);
       results = model.generate(requests, emission_machine);

From 841bee1f4c8466d1fdc6451f0f31c76bd887e8fb Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Wed, 4 Dec 2024 21:34:07 -0800
Subject: [PATCH 651/667] fix: update tree depth

---
 src/runtime/request_manager.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a4909e78c..b974f270f 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -878,9 +878,9 @@ void RequestManager::request_load_onto_batch(int batch_index) {
 }
 
 void RequestManager::update_token_tree_depth() {
-  ssm_tree_depth = min(
-      int(std::ceil(get_max_tokens_per_batch() / get_num_active_requests())),
-      get_max_tree_depth());
+  ssm_tree_depth = min(int(std::ceil((double)get_max_tokens_per_batch() /
+                                     get_num_active_requests())),
+                       get_max_tree_depth());
 }
 
 void RequestManager::update_inference_results(InferenceResult const &result) {

From b0a5918cfb0ed3305118fe368d840ed5cae5d58c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 6 Dec 2024 14:13:56 -0800
Subject: [PATCH 652/667] feat: add a switch for fcfs baseline

---
 include/flexflow/request_manager.h       |  4 ++++
 inference/incr_decoding/incr_decoding.cc | 12 ++++++++++--
 src/runtime/request_manager.cc           |  8 ++++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 7dbcc8af3..0222ce2f0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -130,6 +130,7 @@ struct Request {
     RUNNING = 102,   // running inference
     COMPLETED = 103, // finished and verified
     FINISHING = 104, // finishing request, but not yet verified
+    PREEMPTED = 105, // preempted request
   };
   BatchConfig::RequestGuid guid;
   int batch_index = -1;
@@ -346,9 +347,11 @@ class RequestManager {
   void set_spec_infer_old_version(bool spec_infer_old_version);
   void set_greedy_schedule(bool greedy_schedule);
   void set_equal_schedule(bool equal_schedule);
+  void set_fcfs_slo(bool fcfs_slo);
   bool get_spec_infer_old_version();
   bool get_greedy_schedule();
   bool get_equal_schedule();
+  bool get_fcfs_slo();
   inline double get_slo_constraint(Request &request);
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
@@ -465,6 +468,7 @@ class RequestManager {
   bool spec_infer_old_version = false;
   bool greedy_schedule = false;
   bool equal_schedule = false;
+  bool fcfs_slo = false;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 7f04beae0..4aa1fc1d4 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -61,7 +61,8 @@ void parse_input_args(char **argv,
                       int &replica,
                       double &request_per_second,
                       std::string &emission_file_path,
-                      bool &add_special_tokens) {
+                      bool &add_special_tokens,
+                      bool &fcfs_slo) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -180,6 +181,10 @@ void parse_input_args(char **argv,
       add_special_tokens = false;
       continue;
     }
+    if (!strcmp(argv[i], "--fcfs-slo")) {
+      fcfs_slo = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -226,6 +231,7 @@ void FlexFlow::top_level_task(Task const *task,
   int replica = 1;
   double request_per_second = 1.0;
   bool add_special_tokens = true;
+  bool fcfs_slo = false;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -256,7 +262,8 @@ void FlexFlow::top_level_task(Task const *task,
                    replica,
                    request_per_second,
                    emission_file_path,
-                   add_special_tokens);
+                   add_special_tokens,
+                   fcfs_slo);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -351,6 +358,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_max_tree_width(16);
   rm->set_verbose(verbose);
   rm->set_streaming_cache(streaming_cache);
+  rm->set_fcfs_slo(fcfs_slo);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b974f270f..e1fd1e4f1 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -346,6 +346,10 @@ void RequestManager::set_equal_schedule(bool equal_schedule_) {
   equal_schedule = equal_schedule_;
 }
 
+void RequestManager::set_fcfs_slo(bool fcfs_slo_) {
+  fcfs_slo = fcfs_slo_;
+}
+
 bool RequestManager::get_spec_infer_old_version() {
   return spec_infer_old_version;
 }
@@ -358,6 +362,10 @@ bool RequestManager::get_equal_schedule() {
   return equal_schedule;
 }
 
+bool RequestManager::get_fcfs_slo() {
+  return fcfs_slo;
+}
+
 inline double RequestManager::get_slo_constraint(Request &request) {
   if (request.get_slo_ratio() < 0) {
     // we use negative number to specify the absolute slo constraint (ms)

From 4c1b2ce3207950b1cd4cd20fdae7a7b12b9a0ef6 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 6 Dec 2024 14:57:38 -0800
Subject: [PATCH 653/667] feat: added data structures in request manager to
 handle preempted requests

---
 include/flexflow/request_manager.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 0222ce2f0..ab7774f07 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -501,7 +501,9 @@ class RequestManager {
   int num_running_requests = 0;
   // Available requests in the batch config
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
+  bool request_preempted[BatchConfig::MAX_NUM_REQUESTS];
   int num_available_requests = 0;
+  int num_preempted_requests = 0;
   int ssm_completed = true;
   int ssm_tree_depth = 0;
 
@@ -529,6 +531,7 @@ class RequestManager {
   bool update_llm_decode_results(InferenceResult const &result);
   BatchConfig prepare_llm_prefilling_batch();
   BatchConfig prepare_decoding_batch();
+  BatchConfig prepare_decoding_batch_fcfs_slo();
   /* ---------- Incremental Decoding Helper Functions ---------- */
 
   /* ---------- Spec Decoding Helper Functions ---------- */

From 9fb88850ecd91fa7499de53104fb0ab0ff45346c Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 8 Dec 2024 00:01:47 -0800
Subject: [PATCH 654/667] fix: use num tokens to decode to replace spare
 latency

---
 src/runtime/request_manager.cc | 149 ++++++++++++++++++++++++++++++---
 1 file changed, 138 insertions(+), 11 deletions(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index e1fd1e4f1..7a4a6199d 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1168,7 +1168,11 @@ BatchConfig RequestManager::prepare_next_batch() {
       }
       break;
     case DECODING:
-      return prepare_decoding_batch();
+      if get_fcfs_slo () {
+        return prepare_decoding_batch_fcfs_slo();
+      } else {
+        return prepare_decoding_batch();
+      }
     case SSM_SPEC:
       if (current_ssm_step == 0) {
         return prepare_first_spec_batch_config();
@@ -1399,6 +1403,121 @@ BatchConfig RequestManager::prepare_decoding_batch() {
   profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
   return bc;
 }
+
+BatchConfig RequestManager::prepare_decoding_batch_fcfs_slo() {
+  // This function is called when the request_manager_status is DECODING. It
+  // fills the last token of each request in the current batch to the
+  // BatchConfig for the LLM to decode.
+  if (verbose) {
+    std::cout << "\n############### prepare_decoding_batch_fcfs_slo "
+                 "##############\n";
+  }
+
+  BatchConfig bc;
+  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  bc.prompt_phase = false;
+
+  // Check if there are any requests whose SLO is in the fastest category
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.num_available_requests = num_available_requests;
+  bool has_fastest_slo = false;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       request_index++) {
+
+    if (!request_available[request_index]) {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
+
+    if (request.get_slo_ratio() <= 1.0) {
+      has_fastest_slo = true;
+      break;
+    }
+  }
+
+  // If there are requests with the fastest SLO, we limit the number of requests
+  // to be decoded in this batch to 8
+  if (has_fastest_slo) {
+    int num_fastest_slo_requests = 0;
+    for (int request_index = 0; request_index < get_max_requests_per_batch();
+         request_index++) {
+      if (!request_available[request_index]) {
+        continue;
+      }
+      Request &request = all_requests[guid_of_requests[request_index]];
+      assert(request.status == Request::RUNNING);
+
+      if (request.get_slo_ratio() <= 1.0) {
+        num_fastest_slo_requests++;
+      }
+    }
+
+    if (num_fastest_slo_requests > 8) {
+      int num_remaining_requests = 0;
+      std::vector<std::pair<long long, int>> start_time_and_request_index;
+      for (int request_index = 0; request_index < get_max_requests_per_batch();
+           request_index++) {
+        if (!request_available[request_index]) {
+          continue;
+        }
+        Request &request = all_requests[guid_of_requests[request_index]];
+        assert(request.status == Request::RUNNING);
+
+        if (request.get_slo_ratio() > 1.0) {
+          request_available[request_index] = false;
+          num_available_requests--;
+          num_remaining_requests++;
+        }
+      }
+      bc.num_available_requests -= num_remaining_requests;
+    }
+  }
+
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       request_index++) {
+    if (!request_available[request_index]) {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
+
+    // Per Request Info
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        request.llm_cache_size;
+    bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens;
+    bc.requestsInfo[request_index].num_tokens_in_batch = 1;
+
+    // Copy the streaming cache info
+    bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+
+    request.first_token_offset_in_batch = bc.num_tokens;
+    request.num_tokens_in_batch = 1;
+
+    // Per Token Info
+    bc.tokensInfo[bc.num_tokens].request_index = request_index;
+    bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
+
+    bc.num_tokens++;
+
+    if (profiling_requests[request.guid].llm_decoding_steps == 0) {
+      profiling_requests[request.guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
+    }
+  }
+
+  if (verbose) {
+    std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:"
+              << std::endl;
+    bc.print();
+  }
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
+  return bc;
+}
 /* ----- Speculative Inference Specific functions ----- */
 
 /***** Request Init Phase *****/
@@ -3146,8 +3265,8 @@ void RequestManager::prune_token_tree() {
   int budget = get_max_tokens_per_batch() - num_available_requests;
   assert(budget >= 0);
 
-  std::vector<std::pair<double, int>> spare_latency_2_request_index;
-  spare_latency_2_request_index.reserve(get_max_requests_per_batch());
+  std::vector<std::pair<double, int>> num_tokens_to_decode_2_request_index;
+  num_tokens_to_decode_2_request_index.reserve(get_max_requests_per_batch());
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -3159,22 +3278,30 @@ void RequestManager::prune_token_tree() {
     if (request.get_slo_ratio() > 999) { // infinity
       continue;
     }
-    double spare_latency =
-        get_request_expected_latency(request) - request.decode_latency_ms;
-    spare_latency_2_request_index.push_back(
-        std::make_pair(spare_latency, request_index));
+    double num_tokens_to_decode_per_step =
+        (ssm_spec_latency_ms + llm_verify_latency_ms) * correction_factor /
+        get_slo_constraint(request);
+    double expected_num_tokens_decoded =
+        request.decode_latency_ms / get_slo_constraint(request);
+    double num_tokens_to_decode =
+        max(1.0,
+            num_tokens_to_decode_per_step + expected_num_tokens_decoded -
+                request.decode_length());
+    num_tokens_to_decode_2_request_index.push_back(
+        std::make_pair(num_tokens_to_decode, request_index));
   }
 
   // Sort the requests by spare latency in ascending order
-  std::sort(spare_latency_2_request_index.begin(),
-            spare_latency_2_request_index.end(),
+  std::sort(num_tokens_to_decode_2_request_index.begin(),
+            num_tokens_to_decode_2_request_index.end(),
             std::less<std::pair<double, int>>());
 
   for (auto const &spare_latency_request_index_pair :
-       spare_latency_2_request_index) {
+       num_tokens_to_decode_2_request_index) {
     int request_index = spare_latency_request_index_pair.second;
     RequestGuid guid = guid_of_requests[request_index];
-    add_tokens_toward_slo(guid, budget, spare_latency_2_request_index.size());
+    add_tokens_toward_slo(
+        guid, budget, num_tokens_to_decode_2_request_index.size());
   }
 
   assert(budget >= 0);

From aa2d36d2f066aee686a2b4cd083e7452b7330bd5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 8 Dec 2024 14:50:16 -0800
Subject: [PATCH 655/667] feat: support the policy fcfs and smallest time to
 attain

---
 include/flexflow/request_manager.h       |   7 +-
 inference/incr_decoding/incr_decoding.cc |  12 +-
 src/runtime/request_manager.cc           | 181 ++++++++++++++++++-----
 3 files changed, 158 insertions(+), 42 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index ab7774f07..49d51bfb0 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -130,7 +130,6 @@ struct Request {
     RUNNING = 102,   // running inference
     COMPLETED = 103, // finished and verified
     FINISHING = 104, // finishing request, but not yet verified
-    PREEMPTED = 105, // preempted request
   };
   BatchConfig::RequestGuid guid;
   int batch_index = -1;
@@ -348,10 +347,12 @@ class RequestManager {
   void set_greedy_schedule(bool greedy_schedule);
   void set_equal_schedule(bool equal_schedule);
   void set_fcfs_slo(bool fcfs_slo);
+  void set_stta(bool stta);
   bool get_spec_infer_old_version();
   bool get_greedy_schedule();
   bool get_equal_schedule();
   bool get_fcfs_slo();
+  bool get_stta();
   inline double get_slo_constraint(Request &request);
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
@@ -469,6 +470,7 @@ class RequestManager {
   bool greedy_schedule = false;
   bool equal_schedule = false;
   bool fcfs_slo = false;
+  bool stta = false; // The smallest time to attain policy
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
@@ -501,9 +503,7 @@ class RequestManager {
   int num_running_requests = 0;
   // Available requests in the batch config
   bool request_available[BatchConfig::MAX_NUM_REQUESTS];
-  bool request_preempted[BatchConfig::MAX_NUM_REQUESTS];
   int num_available_requests = 0;
-  int num_preempted_requests = 0;
   int ssm_completed = true;
   int ssm_tree_depth = 0;
 
@@ -532,6 +532,7 @@ class RequestManager {
   BatchConfig prepare_llm_prefilling_batch();
   BatchConfig prepare_decoding_batch();
   BatchConfig prepare_decoding_batch_fcfs_slo();
+  BatchConfig prepare_decoding_batch_stta();
   /* ---------- Incremental Decoding Helper Functions ---------- */
 
   /* ---------- Spec Decoding Helper Functions ---------- */
diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 4aa1fc1d4..7092664ea 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -62,7 +62,8 @@ void parse_input_args(char **argv,
                       double &request_per_second,
                       std::string &emission_file_path,
                       bool &add_special_tokens,
-                      bool &fcfs_slo) {
+                      bool &fcfs_slo,
+                      bool &stta) {
   for (int i = 1; i < argc; i++) {
     // llm model type
     if (!strcmp(argv[i], "-llm-model")) {
@@ -185,6 +186,10 @@ void parse_input_args(char **argv,
       fcfs_slo = true;
       continue;
     }
+    if (!strcmp(argv[i], "--stta")) {
+      stta = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -232,6 +237,7 @@ void FlexFlow::top_level_task(Task const *task,
   double request_per_second = 1.0;
   bool add_special_tokens = true;
   bool fcfs_slo = false;
+  bool stta = false;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -263,7 +269,8 @@ void FlexFlow::top_level_task(Task const *task,
                    request_per_second,
                    emission_file_path,
                    add_special_tokens,
-                   fcfs_slo);
+                   fcfs_slo,
+                   stta);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -359,6 +366,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_verbose(verbose);
   rm->set_streaming_cache(streaming_cache);
   rm->set_fcfs_slo(fcfs_slo);
+  rm->set_stta(stta);
   rm->register_tokenizer(
       model_type, bos_token_id, eos_token_ids, tokenizer_filepath);
   rm->register_output_filepath(file_paths.output_file_path);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7a4a6199d..70eb3ff79 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -350,6 +350,10 @@ void RequestManager::set_fcfs_slo(bool fcfs_slo_) {
   fcfs_slo = fcfs_slo_;
 }
 
+void RequestManager::set_stta(bool stta_) {
+  stta = stta_;
+}
+
 bool RequestManager::get_spec_infer_old_version() {
   return spec_infer_old_version;
 }
@@ -366,6 +370,10 @@ bool RequestManager::get_fcfs_slo() {
   return fcfs_slo;
 }
 
+bool RequestManager::get_stta() {
+  return stta;
+}
+
 inline double RequestManager::get_slo_constraint(Request &request) {
   if (request.get_slo_ratio() < 0) {
     // we use negative number to specify the absolute slo constraint (ms)
@@ -1168,8 +1176,10 @@ BatchConfig RequestManager::prepare_next_batch() {
       }
       break;
     case DECODING:
-      if get_fcfs_slo () {
+      if (get_fcfs_slo()) {
         return prepare_decoding_batch_fcfs_slo();
+      } else if (get_stta()) {
+        return prepare_decoding_batch_stta();
       } else {
         return prepare_decoding_batch();
       }
@@ -1418,64 +1428,161 @@ BatchConfig RequestManager::prepare_decoding_batch_fcfs_slo() {
   bc.prompt_phase = false;
 
   // Check if there are any requests whose SLO is in the fastest category
+  std::fill(request_available,
+            request_available + get_max_requests_per_batch(),
+            false);
+  num_available_requests = 0;
+  std::vector<Request> fcfs_request_queue;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       request_index++) {
+    if (guid_of_requests[request_index] == INVALID_GUID) {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
+    fcfs_request_queue.push_back(request);
+  }
+
+  // Sort the requests in the FCFS queue based on the decoding time in
+  // descending order
+  std::sort(fcfs_request_queue.begin(),
+            fcfs_request_queue.end(),
+            [](Request const &a, Request const &b) {
+              return a.decode_latency_ms < b.decode_latency_ms;
+            });
+
+  // Include the requests one by one until:
+  // 1. If the batch includes a request whose SLO is in the fastest category,
+  // limit the number of requests in the batch to 8.
+  // 2. If the batch does not include a request whose SLO is in the fastest
+  // category, keep adding requests until a request whose SLO is in the fastest
+  // category is met (do not include it).
+  bool has_fastest_slo = false;
+  for (Request &request : fcfs_request_queue) {
+    if (has_fastest_slo and num_available_requests >= 8) {
+      break;
+    }
+    if (request.get_slo_ratio() <= 1.0) {
+      has_fastest_slo = true;
+      if (num_available_requests >= 8) {
+        break;
+      }
+    }
+    request_load_onto_batch(request.batch_index);
+  }
+
   std::copy(std::begin(request_available),
             std::end(request_available),
             std::begin(bc.request_available));
   bc.num_available_requests = num_available_requests;
-  bool has_fastest_slo = false;
+
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        request_index++) {
-
     if (!request_available[request_index]) {
       continue;
     }
     Request &request = all_requests[guid_of_requests[request_index]];
     assert(request.status == Request::RUNNING);
 
-    if (request.get_slo_ratio() <= 1.0) {
-      has_fastest_slo = true;
-      break;
+    // Per Request Info
+    bc.requestsInfo[request_index].first_token_index_in_request =
+        request.llm_cache_size;
+    bc.requestsInfo[request_index].first_token_offset_in_batch = bc.num_tokens;
+    bc.requestsInfo[request_index].num_tokens_in_batch = 1;
+
+    // Copy the streaming cache info
+    bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+
+    request.first_token_offset_in_batch = bc.num_tokens;
+    request.num_tokens_in_batch = 1;
+
+    // Per Token Info
+    bc.tokensInfo[bc.num_tokens].request_index = request_index;
+    bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
+    bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
+
+    bc.num_tokens++;
+
+    if (profiling_requests[request.guid].llm_decoding_steps == 0) {
+      profiling_requests[request.guid].start_decoding_time =
+          Realm::Clock::current_time_in_microseconds();
     }
   }
 
-  // If there are requests with the fastest SLO, we limit the number of requests
-  // to be decoded in this batch to 8
-  if (has_fastest_slo) {
-    int num_fastest_slo_requests = 0;
-    for (int request_index = 0; request_index < get_max_requests_per_batch();
-         request_index++) {
-      if (!request_available[request_index]) {
-        continue;
-      }
-      Request &request = all_requests[guid_of_requests[request_index]];
-      assert(request.status == Request::RUNNING);
+  if (verbose) {
+    std::cout << "prepare_decoding_batch_fcfs_slo NEW batchconfig:"
+              << std::endl;
+    bc.print();
+  }
+  profiling.llm_step_start = Realm::Clock::current_time_in_microseconds();
+  return bc;
+}
 
-      if (request.get_slo_ratio() <= 1.0) {
-        num_fastest_slo_requests++;
-      }
-    }
+BatchConfig RequestManager::prepare_decoding_batch_stta() {
+  // This function is called when the request_manager_status is DECODING. It
+  // fills the last token of each request in the current batch to the
+  // BatchConfig for the LLM to decode.
+  if (verbose) {
+    std::cout << "\n############### prepare_decoding_batch_stta "
+                 "##############\n";
+  }
 
-    if (num_fastest_slo_requests > 8) {
-      int num_remaining_requests = 0;
-      std::vector<std::pair<long long, int>> start_time_and_request_index;
-      for (int request_index = 0; request_index < get_max_requests_per_batch();
-           request_index++) {
-        if (!request_available[request_index]) {
-          continue;
-        }
-        Request &request = all_requests[guid_of_requests[request_index]];
-        assert(request.status == Request::RUNNING);
+  BatchConfig bc;
+  bc.inference_mode = InferenceMode::INC_DECODING_MODE;
+  bc.prompt_phase = false;
 
-        if (request.get_slo_ratio() > 1.0) {
-          request_available[request_index] = false;
-          num_available_requests--;
-          num_remaining_requests++;
-        }
+  // Check if there are any requests whose SLO is in the fastest category
+  std::fill(request_available,
+            request_available + get_max_requests_per_batch(),
+            false);
+  num_available_requests = 0;
+  std::vector<std::pair<double, int>> tta_2_batch_index;
+  for (int request_index = 0; request_index < get_max_requests_per_batch();
+       request_index++) {
+    if (guid_of_requests[request_index] == INVALID_GUID) {
+      continue;
+    }
+    Request &request = all_requests[guid_of_requests[request_index]];
+    assert(request.status == Request::RUNNING);
+    tta_2_batch_index.push_back(std::make_pair(
+        get_request_expected_latency(request) - request.decode_latency_ms,
+        request_index));
+  }
+
+  // Sort the requests in the queue based on the time to attain SLO in ascending
+  // order
+  std::sort(tta_2_batch_index.begin(),
+            tta_2_batch_index.end(),
+            [](std::pair<double, int> const &a,
+               std::pair<double, int> const &b) { return a.first < b.first; });
+
+  // Include the requests one by one until:
+  // 1. If the batch includes a request whose SLO is in the fastest category,
+  // limit the number of requests in the batch to 8.
+  // 2. If the batch does not include a request whose SLO is in the fastest
+  // category, keep adding requests until a request whose SLO is in the fastest
+  // category is met (do not include it).
+  bool has_fastest_slo = false;
+  for (auto const &[tta, request_index] : tta_2_batch_index) {
+    Request &request = all_requests[guid_of_requests[request_index]];
+    if (has_fastest_slo and num_available_requests >= 8) {
+      break;
+    }
+    if (request.get_slo_ratio() <= 1.0) {
+      has_fastest_slo = true;
+      if (num_available_requests >= 8) {
+        break;
       }
-      bc.num_available_requests -= num_remaining_requests;
     }
+    request_load_onto_batch(request_index);
   }
 
+  std::copy(std::begin(request_available),
+            std::end(request_available),
+            std::begin(bc.request_available));
+  bc.num_available_requests = num_available_requests;
+
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        request_index++) {
     if (!request_available[request_index]) {

From 04cf20684cfbae2c83c2ba95756006714a24fbbf Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 8 Dec 2024 19:29:44 -0800
Subject: [PATCH 656/667] chore: scheduling policy minor enhancement

---
 src/runtime/request_manager.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index a4909e78c..a6e738b18 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -3166,6 +3166,9 @@ void RequestManager::prune_token_tree() {
        spare_latency_2_request_index) {
     int request_index = spare_latency_request_index_pair.second;
     RequestGuid guid = guid_of_requests[request_index];
+    if (all_requests[guid].get_slo_ratio() < 0) {
+      continue;
+    }
     add_tokens_toward_slo(guid, budget, spare_latency_2_request_index.size());
   }
 
@@ -3235,6 +3238,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
       max(1.0,
           num_tokens_to_decode_per_step + expected_num_tokens_decoded -
               request.decode_length());
+  num_tokens_to_decode = min(num_tokens_to_decode, (double)ssm_tree_depth + 1);
 
   // The root is already included
   // In function add_root_to_spec_token_tree
@@ -3242,7 +3246,7 @@ void RequestManager::add_tokens_toward_slo(RequestGuid guid,
 
   // The max token that can be added to the token tree when fulfilling the SLO
   int max_token_toward_slo =
-      int(get_max_tokens_per_batch() / num_available_requests);
+      int(get_max_tokens_per_batch() * 1.2 / num_available_requests);
 
   while (budget > 0 and max_token_toward_slo > 0 and
          current_added < num_tokens_to_decode) {

From 76decb311366bf10b56bb0036d87582c0f972a90 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Dec 2024 01:54:13 -0800
Subject: [PATCH 657/667] chore: minor

---
 inference/incr_decoding/incr_decoding.cc | 4 ++--
 src/runtime/request_manager.cc           | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index 7092664ea..f1b00617a 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -182,11 +182,11 @@ void parse_input_args(char **argv,
       add_special_tokens = false;
       continue;
     }
-    if (!strcmp(argv[i], "--fcfs-slo")) {
+    if (!strcmp(argv[i], "--fcfs-serving")) {
       fcfs_slo = true;
       continue;
     }
-    if (!strcmp(argv[i], "--stta")) {
+    if (!strcmp(argv[i], "--stta-serving")) {
       stta = true;
       continue;
     }
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 0c80b6e99..6a14c452a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -3394,6 +3394,7 @@ void RequestManager::prune_token_tree() {
         max(1.0,
             num_tokens_to_decode_per_step + expected_num_tokens_decoded -
                 request.decode_length());
+    num_tokens_to_decode = min(num_tokens_to_decode, (double)ssm_tree_depth + 1);
     num_tokens_to_decode_2_request_index.push_back(
         std::make_pair(num_tokens_to_decode, request_index));
   }

From b9208383799c7095b4f07194d8dc5b807a8fa15b Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Dec 2024 07:27:05 -0800
Subject: [PATCH 658/667] feat: add overhead breakdown

---
 include/flexflow/request_manager.h |  9 ++++
 inference/spec_infer/spec_infer.cc | 12 ++++-
 src/runtime/request_manager.cc     | 71 +++++++++++++++++++++++++++++-
 3 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 49d51bfb0..afbf61641 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -354,6 +354,8 @@ class RequestManager {
   bool get_fcfs_slo();
   bool get_stta();
   inline double get_slo_constraint(Request &request);
+  void set_eval_overhead_breakdown(bool eval_overhead_breakdown);
+  bool get_eval_overhead_breakdown();
   double get_request_expected_latency(Request &request);
   Request &get_request_with_guid(RequestGuid guid);
   int register_ssm_model(FFModel *model);
@@ -471,6 +473,13 @@ class RequestManager {
   bool equal_schedule = false;
   bool fcfs_slo = false;
   bool stta = false; // The smallest time to attain policy
+  bool eval_overhead_breakdown = false; // for evaluation purpose
+  double eval_ssm_prefill_latency_us = 0.0;
+  double eval_llm_prefill_latency_us = 0.0;
+  double eval_ssm_spec_latency_us = 0.0;
+  double eval_llm_verify_latency_us = 0.0;
+  double eval_process_latency_us = 0.0;
+  double eval_schedule_latency_us = 0.0;
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 99c7e2432..2d1572ecf 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -86,7 +86,8 @@ void parse_input_args(char **argv,
                       bool &greedy_schedule,
                       bool &equal_schedule,
                       std::string &emission_file_path,
-                      bool &add_special_tokens) {
+                      bool &add_special_tokens,
+                      bool &eval_overhead_breakdown) {
   for (int i = 1; i < argc; i++) {
     // llm model name
     if (!strcmp(argv[i], "-llm-model")) {
@@ -231,6 +232,10 @@ void parse_input_args(char **argv,
       add_special_tokens = false;
       continue;
     }
+    if (!strcmp(argv[i], "--eval-overhead-breakdown")) {
+      eval_overhead_breakdown = true;
+      continue;
+    }
   }
   if (paths.cache_folder_path.empty()) {
     char const *ff_cache_path = std::getenv("FF_CACHE_PATH");
@@ -421,6 +426,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool greedy_schedule = false;
   bool equal_schedule = false;
   bool add_special_tokens = true;
+  bool eval_overhead_breakdown = false;
   std::string emission_file_path;
 
   InputArgs const &command_args = HighLevelRuntime::get_input_args();
@@ -455,7 +461,8 @@ void FlexFlow::top_level_task(Task const *task,
                    greedy_schedule,
                    equal_schedule,
                    emission_file_path,
-                   add_special_tokens);
+                   add_special_tokens,
+                   eval_overhead_breakdown);
   if (max_tokens_per_ssm_batch == -1) {
     max_tokens_per_ssm_batch = max_tokens_per_batch;
   }
@@ -507,6 +514,7 @@ void FlexFlow::top_level_task(Task const *task,
   rm->set_greedy_schedule(greedy_schedule);
   rm->set_equal_schedule(equal_schedule);
   rm->register_output_filepath(file_paths.output_file_path);
+  rm->set_eval_overhead_breakdown(eval_overhead_breakdown);
 
   // Create LLM model
   FFModel tree_model(ffconfig, ffconfig.cpu_offload);
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6a14c452a..6fd0ac71a 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -374,6 +374,14 @@ bool RequestManager::get_stta() {
   return stta;
 }
 
+void RequestManager::set_eval_overhead_breakdown(bool eval_overhead_breakdown_) {
+  eval_overhead_breakdown = eval_overhead_breakdown_;
+}
+
+bool RequestManager::get_eval_overhead_breakdown() {
+  return eval_overhead_breakdown;
+}
+
 inline double RequestManager::get_slo_constraint(Request &request) {
   if (request.get_slo_ratio() < 0) {
     // we use negative number to specify the absolute slo constraint (ms)
@@ -682,8 +690,34 @@ BatchConfig RequestManager::get_next_batch_config_task(
 
 BatchConfig
     RequestManager::get_next_batch_config(InferenceResult const &result) {
+  static double process_this_start_us = 0.0, process_last_end_us = 0.0;
+  if (get_eval_overhead_breakdown()) {
+    process_this_start_us = Realm::Clock::current_time_in_microseconds();
+    if (process_last_end_us != 0) {
+      if (request_manager_status == PREFILLING) {
+        if (prefill_model == SSM) {
+          eval_ssm_prefill_latency_us +=
+              process_this_start_us - process_last_end_us;
+        } else {
+          eval_llm_prefill_latency_us +=
+              process_this_start_us - process_last_end_us;
+        }
+      } else if (request_manager_status == SSM_SPEC) {
+        eval_ssm_spec_latency_us +=
+            process_this_start_us - process_last_end_us;
+      } else if (request_manager_status == LLM_VERIFY) {
+        eval_llm_verify_latency_us +=
+            process_this_start_us - process_last_end_us;
+      }
+    }
+  }
   update_inference_results(result);
-  return prepare_next_batch();
+  BatchConfig bc = prepare_next_batch();
+  if (get_eval_overhead_breakdown()) {
+    process_last_end_us = Realm::Clock::current_time_in_microseconds();
+    eval_process_latency_us += process_last_end_us - process_this_start_us; 
+  }
+  return bc;
 }
 
 // Return value: true if load a pending request to the batch
@@ -2088,7 +2122,15 @@ bool RequestManager::update_ssm_inference_results(
   // BatchConfig and hence the last InferenceResult is equal to
   // the order of the request in the last BatchConfig
   if (!spec_infer_old_version) {
+    static double schedule_start = 0.0;
+    if (get_eval_overhead_breakdown()) {
+      schedule_start = Realm::Clock::current_time_in_microseconds();
+    }
     add_tokens_to_spec_token_tree(ssm_inference_result);
+    if (get_eval_overhead_breakdown()) {
+      eval_schedule_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                  schedule_start;
+    }
   } else {
     add_tokens_to_spec_token_tree_old_version(ssm_inference_result);
   }
@@ -2131,7 +2173,15 @@ bool RequestManager::update_ssm_inference_results(
   if (current_ssm_step == ssm_tree_depth) {
     // Prune the token tree at the last step
     if (!spec_infer_old_version) {
+      static double schedule_start = 0.0;
+      if (get_eval_overhead_breakdown()) {
+        schedule_start = Realm::Clock::current_time_in_microseconds();
+      }
       prune_token_tree();
+      if (get_eval_overhead_breakdown()) {
+        eval_schedule_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                    schedule_start;
+      }
     }
     // Update profiling statistics before returning
     profiling.ssm_step_times.push_back(
@@ -3138,6 +3188,25 @@ void RequestManager::terminate_background_server() {
     goodput_str += ")";
     str += goodput_str;
 
+    if (get_eval_overhead_breakdown()) {
+      eval_process_latency_us -= eval_schedule_latency_us;
+      std::string eval_overhead_breakdown_str = "\n eval_overhead_breakdown( ";
+      eval_overhead_breakdown_str += "\n  ssm_prefill_us: " +
+                                     std::to_string(eval_ssm_prefill_latency_us);
+      eval_overhead_breakdown_str += "\n  ssm_spec_us: " +
+                                     std::to_string(eval_ssm_spec_latency_us);
+      eval_overhead_breakdown_str += "\n  llm_prefill_us: " +
+                                     std::to_string(eval_llm_prefill_latency_us);
+      eval_overhead_breakdown_str += "\n  llm_verify_us: " +
+                                     std::to_string(eval_llm_verify_latency_us);
+      eval_overhead_breakdown_str += "\n  process_us: " +
+                                     std::to_string(eval_process_latency_us);
+      eval_overhead_breakdown_str += "\n  scheduling_us: " +
+                                     std::to_string(eval_schedule_latency_us);
+      eval_overhead_breakdown_str += ")";
+      str += eval_overhead_breakdown_str;
+    }
+
     write_to_output_file("", str);
     background_server_status = TERMINATED;
     request_queue_cv.notify_all();

From 17cbc9c73c8a9f68d7d140feaf8ceb6ede9e3c74 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Dec 2024 18:42:19 -0800
Subject: [PATCH 659/667] fix: overhead breakdown

---
 include/flexflow/request_manager.h |  1 +
 src/runtime/request_manager.cc     | 36 ++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index afbf61641..fc79401cf 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -480,6 +480,7 @@ class RequestManager {
   double eval_llm_verify_latency_us = 0.0;
   double eval_process_latency_us = 0.0;
   double eval_schedule_latency_us = 0.0;
+  double eval_other_latency_us = 0.0; // load pending request, request complete
 
   std::unique_ptr<Tokenizer> tokenizer_;
   bool verbose;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 6fd0ac71a..5a4e1c6c4 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -715,14 +715,24 @@ BatchConfig
   BatchConfig bc = prepare_next_batch();
   if (get_eval_overhead_breakdown()) {
     process_last_end_us = Realm::Clock::current_time_in_microseconds();
-    eval_process_latency_us += process_last_end_us - process_this_start_us; 
+    double process_time_us = process_last_end_us - process_this_start_us;
+    // printf("Process time: %.3f us\n", process_time_us);
+    eval_process_latency_us += process_time_us;
   }
   return bc;
 }
 
 // Return value: true if load a pending request to the batch
 bool RequestManager::load_pending_request_to_batch() {
+  static double load_request_start = 0.0;
+  if (get_eval_overhead_breakdown()) {
+    load_request_start = Realm::Clock::current_time_in_microseconds();
+  }
   if (num_running_requests >= get_max_requests_per_batch()) {
+    if (get_eval_overhead_breakdown()) {
+      eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                  load_request_start;
+    }
     return false;
   }
   std::unique_lock<std::mutex> lock(request_queue_mutex);
@@ -730,6 +740,10 @@ bool RequestManager::load_pending_request_to_batch() {
     if (num_running_requests > 0) {
       // No pending request to process, but there are running requests in the
       // batch. Do nothing and return
+      if (get_eval_overhead_breakdown()) {
+        eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                    load_request_start;
+      }
       return false;
     }
     // Wait until there is a pending request or the background server is
@@ -740,6 +754,10 @@ bool RequestManager::load_pending_request_to_batch() {
     });
     // If the background server has been terminated, exit
     if (is_background_server_terminated()) {
+      if (get_eval_overhead_breakdown()) {
+        eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                    load_request_start;
+      }
       return false;
     }
   }
@@ -779,6 +797,10 @@ bool RequestManager::load_pending_request_to_batch() {
     profiling_requests[guid].start_time =
         Realm::Clock::current_time_in_microseconds();
   }
+  if (get_eval_overhead_breakdown()) {
+    eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
+                                load_request_start;
+  }
   return true;
 }
 
@@ -801,6 +823,10 @@ bool isPrefixAndRemove(std::vector<int> const &prefix, std::vector<int> &vec) {
 }
 
 void RequestManager::request_complete_clean_up(int batch_index) {
+  static double request_complete_start = 0.0;
+  if (get_eval_overhead_breakdown()) {
+    request_complete_start = Realm::Clock::current_time_in_microseconds();
+  }
   RequestGuid guid = guid_of_requests[batch_index];
   profiling_requests[guid].finish_time =
       Realm::Clock::current_time_in_microseconds();
@@ -910,6 +936,10 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   //         std::to_string(profile_info.ssm_decoding_steps) + ")";
   // }
   // write_to_output_file("", str);
+  if (get_eval_overhead_breakdown()) {
+    eval_other_latency_us +=
+        Realm::Clock::current_time_in_microseconds() - request_complete_start;
+  }
 }
 
 void RequestManager::request_offload_from_batch(int batch_index) {
@@ -3189,7 +3219,7 @@ void RequestManager::terminate_background_server() {
     str += goodput_str;
 
     if (get_eval_overhead_breakdown()) {
-      eval_process_latency_us -= eval_schedule_latency_us;
+      eval_process_latency_us -= eval_schedule_latency_us + eval_other_latency_us;
       std::string eval_overhead_breakdown_str = "\n eval_overhead_breakdown( ";
       eval_overhead_breakdown_str += "\n  ssm_prefill_us: " +
                                      std::to_string(eval_ssm_prefill_latency_us);
@@ -3203,6 +3233,8 @@ void RequestManager::terminate_background_server() {
                                      std::to_string(eval_process_latency_us);
       eval_overhead_breakdown_str += "\n  scheduling_us: " +
                                      std::to_string(eval_schedule_latency_us);
+      eval_overhead_breakdown_str += "\n  other_us: " +
+                                     std::to_string(eval_other_latency_us);
       eval_overhead_breakdown_str += ")";
       str += eval_overhead_breakdown_str;
     }

From a21f9fbb7b6f02bf441de09c374b85a4d2577698 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Mon, 9 Dec 2024 18:45:49 -0800
Subject: [PATCH 660/667] style: format

---
 include/flexflow/request_manager.h |  2 +-
 src/runtime/request_manager.cc     | 64 +++++++++++++++---------------
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index fc79401cf..b95cc37e4 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -472,7 +472,7 @@ class RequestManager {
   bool greedy_schedule = false;
   bool equal_schedule = false;
   bool fcfs_slo = false;
-  bool stta = false; // The smallest time to attain policy
+  bool stta = false;                    // The smallest time to attain policy
   bool eval_overhead_breakdown = false; // for evaluation purpose
   double eval_ssm_prefill_latency_us = 0.0;
   double eval_llm_prefill_latency_us = 0.0;
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 5a4e1c6c4..9da88d8ed 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -374,7 +374,8 @@ bool RequestManager::get_stta() {
   return stta;
 }
 
-void RequestManager::set_eval_overhead_breakdown(bool eval_overhead_breakdown_) {
+void RequestManager::set_eval_overhead_breakdown(
+    bool eval_overhead_breakdown_) {
   eval_overhead_breakdown = eval_overhead_breakdown_;
 }
 
@@ -703,8 +704,7 @@ BatchConfig
               process_this_start_us - process_last_end_us;
         }
       } else if (request_manager_status == SSM_SPEC) {
-        eval_ssm_spec_latency_us +=
-            process_this_start_us - process_last_end_us;
+        eval_ssm_spec_latency_us += process_this_start_us - process_last_end_us;
       } else if (request_manager_status == LLM_VERIFY) {
         eval_llm_verify_latency_us +=
             process_this_start_us - process_last_end_us;
@@ -730,8 +730,8 @@ bool RequestManager::load_pending_request_to_batch() {
   }
   if (num_running_requests >= get_max_requests_per_batch()) {
     if (get_eval_overhead_breakdown()) {
-      eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                  load_request_start;
+      eval_other_latency_us +=
+          Realm::Clock::current_time_in_microseconds() - load_request_start;
     }
     return false;
   }
@@ -741,8 +741,8 @@ bool RequestManager::load_pending_request_to_batch() {
       // No pending request to process, but there are running requests in the
       // batch. Do nothing and return
       if (get_eval_overhead_breakdown()) {
-        eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                    load_request_start;
+        eval_other_latency_us +=
+            Realm::Clock::current_time_in_microseconds() - load_request_start;
       }
       return false;
     }
@@ -755,8 +755,8 @@ bool RequestManager::load_pending_request_to_batch() {
     // If the background server has been terminated, exit
     if (is_background_server_terminated()) {
       if (get_eval_overhead_breakdown()) {
-        eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                    load_request_start;
+        eval_other_latency_us +=
+            Realm::Clock::current_time_in_microseconds() - load_request_start;
       }
       return false;
     }
@@ -798,8 +798,8 @@ bool RequestManager::load_pending_request_to_batch() {
         Realm::Clock::current_time_in_microseconds();
   }
   if (get_eval_overhead_breakdown()) {
-    eval_other_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                load_request_start;
+    eval_other_latency_us +=
+        Realm::Clock::current_time_in_microseconds() - load_request_start;
   }
   return true;
 }
@@ -2158,8 +2158,8 @@ bool RequestManager::update_ssm_inference_results(
     }
     add_tokens_to_spec_token_tree(ssm_inference_result);
     if (get_eval_overhead_breakdown()) {
-      eval_schedule_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                  schedule_start;
+      eval_schedule_latency_us +=
+          Realm::Clock::current_time_in_microseconds() - schedule_start;
     }
   } else {
     add_tokens_to_spec_token_tree_old_version(ssm_inference_result);
@@ -2209,8 +2209,8 @@ bool RequestManager::update_ssm_inference_results(
       }
       prune_token_tree();
       if (get_eval_overhead_breakdown()) {
-        eval_schedule_latency_us += Realm::Clock::current_time_in_microseconds() -
-                                    schedule_start;
+        eval_schedule_latency_us +=
+            Realm::Clock::current_time_in_microseconds() - schedule_start;
       }
     }
     // Update profiling statistics before returning
@@ -3219,22 +3219,23 @@ void RequestManager::terminate_background_server() {
     str += goodput_str;
 
     if (get_eval_overhead_breakdown()) {
-      eval_process_latency_us -= eval_schedule_latency_us + eval_other_latency_us;
+      eval_process_latency_us -=
+          eval_schedule_latency_us + eval_other_latency_us;
       std::string eval_overhead_breakdown_str = "\n eval_overhead_breakdown( ";
-      eval_overhead_breakdown_str += "\n  ssm_prefill_us: " +
-                                     std::to_string(eval_ssm_prefill_latency_us);
-      eval_overhead_breakdown_str += "\n  ssm_spec_us: " +
-                                     std::to_string(eval_ssm_spec_latency_us);
-      eval_overhead_breakdown_str += "\n  llm_prefill_us: " +
-                                     std::to_string(eval_llm_prefill_latency_us);
-      eval_overhead_breakdown_str += "\n  llm_verify_us: " +
-                                     std::to_string(eval_llm_verify_latency_us);
-      eval_overhead_breakdown_str += "\n  process_us: " +
-                                     std::to_string(eval_process_latency_us);
-      eval_overhead_breakdown_str += "\n  scheduling_us: " +
-                                     std::to_string(eval_schedule_latency_us);
-      eval_overhead_breakdown_str += "\n  other_us: " +
-                                     std::to_string(eval_other_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  ssm_prefill_us: " + std::to_string(eval_ssm_prefill_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  ssm_spec_us: " + std::to_string(eval_ssm_spec_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  llm_prefill_us: " + std::to_string(eval_llm_prefill_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  llm_verify_us: " + std::to_string(eval_llm_verify_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  process_us: " + std::to_string(eval_process_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  scheduling_us: " + std::to_string(eval_schedule_latency_us);
+      eval_overhead_breakdown_str +=
+          "\n  other_us: " + std::to_string(eval_other_latency_us);
       eval_overhead_breakdown_str += ")";
       str += eval_overhead_breakdown_str;
     }
@@ -3495,7 +3496,8 @@ void RequestManager::prune_token_tree() {
         max(1.0,
             num_tokens_to_decode_per_step + expected_num_tokens_decoded -
                 request.decode_length());
-    num_tokens_to_decode = min(num_tokens_to_decode, (double)ssm_tree_depth + 1);
+    num_tokens_to_decode =
+        min(num_tokens_to_decode, (double)ssm_tree_depth + 1);
     num_tokens_to_decode_2_request_index.push_back(
         std::make_pair(num_tokens_to_decode, request_index));
   }

From a5b7de6ca052a1e654c5705a668d34d1ea07af14 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro <goliaro@cs.cmu.edu>
Date: Sun, 26 Jan 2025 13:07:55 +0000
Subject: [PATCH 661/667] fix: minor

---
 .../inc_multihead_self_attention_kernels.h    |  2 +-
 src/ops/inc_multihead_self_attention.cu       | 43 ++++++++++---------
 .../inc_multihead_self_attention_kernels.cu   |  6 +--
 src/ops/tree_inc_multihead_self_attention.cu  |  2 +-
 src/runtime/request_manager.cc                |  4 +-
 5 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 9f886ffec..16fe78cc1 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -113,7 +113,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          cudaStream_t stream);
 
 template <typename DT>
-void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream,
                          bool is_spec);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index ac3a4b24e..9af7c09cb 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -275,7 +275,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
     apply_pos_encoding_to_tokens_in_batch(
         m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
     // Move the batch qkv values to where took by attention
-    update_qkv_in_batch<DT>(m, bc, stream, false);
+    update_qkv_in_batch_paged<DT>(m, bc, stream, false);
   }
 
   // phase 4: Attention computation
@@ -515,7 +515,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         }
         break;
       }
-      case TREE_SEARCH_MODE:
+      case TREE_SEARCH_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
@@ -524,29 +524,30 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_kv_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
-        if (streaming_cache) {
-          size_t max_post_pos_enc_pages =
-              round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                             BatchConfig::get_max_tree_depth() +
-                             max(max_tokens_per_batch,
-                                 BatchConfig::max_spec_tree_token_num()));
-          key_cache_size = num_kv_heads * qk_dim *
-                           BatchConfig::max_requests_per_batch() *
-                           max_post_pos_enc_pages * kPagesize;
-          value_cache_size = num_kv_heads * v_dim *
-                             BatchConfig::max_requests_per_batch() *
-                             max_post_pos_enc_pages * kPagesize;
-          streaming_pre_pos_enc_size =
-              num_kv_heads * (qk_dim + v_dim) *
-              BatchConfig::max_requests_per_batch() *
-              round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                             BatchConfig::get_max_tree_depth()) *
-              kPagesize;
-        }
         break;
+      }
       default:
         assert(false && "Unkown inference mode");
     }
+    if (streaming_cache) {
+      size_t max_post_pos_enc_pages =
+          round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                          BatchConfig::get_max_tree_depth() +
+                          max(max_tokens_per_batch,
+                              BatchConfig::max_spec_tree_token_num()));
+      key_cache_size = num_kv_heads * qk_dim *
+                        BatchConfig::max_requests_per_batch() *
+                        max_post_pos_enc_pages * kPagesize;
+      value_cache_size = num_kv_heads * v_dim *
+                          BatchConfig::max_requests_per_batch() *
+                          max_post_pos_enc_pages * kPagesize;
+      streaming_pre_pos_enc_size =
+          num_kv_heads * (qk_dim + v_dim) *
+          BatchConfig::max_requests_per_batch() *
+          round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                          BatchConfig::get_max_tree_depth()) *
+          kPagesize;
+    }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
     size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim;
     size_t complex_size =
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 55edaed6b..dc75eb4ee 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -599,7 +599,7 @@ __global__ void update_qkv_in_batch_paged_kernel(
 }
 
 template <typename DT>
-void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream,
                          bool is_spec) {
@@ -1050,13 +1050,13 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     BatchConfig const *bc,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream,
     bool is_spec);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a5c98e414..dc1860310 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -428,7 +428,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventRecord(t_start, stream);
 
   // Update key-val cache, compact q array
-  update_qkv_in_batch<DT>(m, bc, stream, true);
+  update_qkv_in_batch_paged<DT>(m, bc, stream, true);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b608b6e94..7be8cbc1a 100755
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1396,8 +1396,6 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   int num_tokens = 0;
   for (Request *request : prefilling_requests) {
     int request_index = request->batch_index;
-    // Only set the prefilling request to be available
-    bc.request_available[request_index] = true;
 
     // Request Info
     bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens;
@@ -1430,6 +1428,8 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
+      // Only set the prefilling request to be available
+      bc.request_available[request_index] = true;
     }
 
     // Record prefilling start time

From 9c042f5adc40967cea3f049868c5878f0129e330 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 26 Jan 2025 20:13:57 +0000
Subject: [PATCH 662/667] fix: merge page_manager, also fix some issues

---
 include/flexflow/page_manager.h              | 162 ++++++++++++
 include/flexflow/request_manager.h           |  17 ++
 src/ops/inc_multihead_self_attention.cu      |  20 +-
 src/ops/tree_inc_multihead_self_attention.cu |   9 +-
 src/runtime/page_manager.cc                  | 246 +++++++++++++++++++
 src/runtime/request_manager.cc               | 137 ++++++++++-
 src/runtime/request_manager.cu               |  17 +-
 7 files changed, 589 insertions(+), 19 deletions(-)
 create mode 100644 include/flexflow/page_manager.h
 create mode 100644 src/runtime/page_manager.cc

diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h
new file mode 100644
index 000000000..c0d6df085
--- /dev/null
+++ b/include/flexflow/page_manager.h
@@ -0,0 +1,162 @@
+#pragma once
+
+#include "flexflow/batch_config.h"
+#include "flexflow/config.h"
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/utils/file_loader.h"
+#include <deque>
+#include <future>
+#include <mutex>
+#include <tokenizers_cpp.h>
+
+namespace FlexFlow {
+
+using TokenId = BatchConfig::TokenId;
+
+/**
+ * @class LogicalTokenBlock
+ * @brief A class to represent a sequence of tokens for each request
+ */
+class LogicalTokenBlock {
+public:
+  using TokenId = BatchConfig::TokenId;
+
+  // Constructor
+  LogicalTokenBlock(int block_number, uint32_t block_size);
+
+  // Method to check if the block is empty
+  bool is_empty() const;
+
+  // Method to check if the block is full
+  bool is_full() const;
+
+  // Method to get the number of empty slots
+  int get_num_empty_slots() const;
+
+  // Method to get the number of allocated slots
+  int get_num_alloc_slots() const;
+
+  // Used to clean up the spec tokens in a block since these spec tokens may not
+  // be committed after use
+  void reset_num_spec_tokens();
+
+  // Method to append tokens
+  void append_tokens(std::vector<TokenId> const &token_ids_to_append,
+                     bool committed);
+
+  int get_num_tokens() const {
+    return num_tokens;
+  }
+  int get_num_commit_tokens() const {
+    return num_commit_tokens;
+  }
+  int get_num_spec_tokens() const {
+    return num_spec_tokens;
+  }
+
+  std::vector<TokenId> get_token_ids() const;
+
+private:
+  int block_number;      // the index of the logical token block
+  int block_size;        // the size of the block
+  int num_tokens;        // the number of tokens currently stored in the block
+  int num_commit_tokens; // the number of tokens inside this block that are
+                         // already committed
+  int num_spec_tokens;   // the number of tokens inside this block that are
+                         // speculative tokens, which is stored temporarily
+  std::vector<TokenId> token_ids; // store the token ids in a order that
+                                  // corresponds to the inference sequence
+};
+
+/**
+ * @class PhysicalTokenBlock
+ * @brief A class to represent a physical block of tokens similar to physical
+ * memory address It keeps track of the location of the tokens stored on GPU
+ * memory
+ */
+class PhysicalTokenBlock {
+public:
+  // Constructor
+  PhysicalTokenBlock(int block_number, int block_size);
+
+  // Method to get the block number
+  int get_block_number() const {
+    return block_number;
+  }
+  void incr_ref_count() {
+    ref_count++;
+  }
+  void decr_ref_count() {
+    ref_count--;
+  }
+  int ref_count; // reference count, TODO: move to private
+
+private:
+  int block_number; // the index of the physical token block
+  int block_size;   // the size of the block
+};
+
+/**
+ * @class BlockAllocator
+ * @brief A Block Manager that is reponsible for maintaining a pool of free
+ * blocks
+ */
+class BlockAllocator {
+public:
+  // Constructor
+  BlockAllocator(int block_size, int num_total_blocks);
+
+  // Allocate a block
+  PhysicalTokenBlock allocate();
+
+  // Free a block
+  void free(PhysicalTokenBlock &block);
+
+  // Get the number of free blocks
+  int get_num_free_blocks() const;
+
+private:
+  int block_size;
+  size_t num_total_blocks;
+  std::deque<PhysicalTokenBlock> free_blocks;
+};
+
+/*
+ * @class PageManager
+ * @brief A wrapper class that manages the kv cache allocation status
+ * notice that all the layers of model will share the same page manager because
+ * the position of kv cache will be the same
+ */
+class PageManager {
+public:
+  // Get the singleton instance of the PageManager as it will be shared in
+  // multiple places
+  static PageManager *get_page_manager();
+  static PageManager *get_page_manager(FFModel *ff, size_t kv_cache_size);
+  size_t get_kv_cache_size_per_layer();
+  using BlockTable = std::vector<PhysicalTokenBlock>;
+  using RequestGuid = BatchConfig::RequestGuid;
+  PageManager(int block_size, size_t num_total_blocks);
+  int allocate_one_block(RequestGuid const &request_guid);
+  void free_request(RequestGuid const &request_guid);
+  // used for the case that we want to free the last num_blocks that stores spec
+  // tokens(which are the tokens are not yet committed)
+  void free_multiple_blocks(RequestGuid const &request_guid, int num_blocks);
+  std::vector<int>
+      get_block_table_indices(RequestGuid const &request_guid) const;
+
+  void free_block_table(BlockTable &block_table);
+
+private:
+  size_t kv_cache_size_per_layer;
+  int block_size;       // the size of the block
+  int num_total_blocks; // the total number of blocks
+  BlockAllocator block_allocator;
+  std::unordered_map<RequestGuid, BlockTable> block_tables;
+
+  int get_num_total_free_blocks() const;
+  int get_num_allocated_blocks(RequestGuid const &request_guid) const;
+};
+
+}; // namespace FlexFlow
\ No newline at end of file
diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 33b0ff415..16b41285b 100755
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -18,6 +18,7 @@
 #include "flexflow/batch_config.h"
 #include "flexflow/inference.h"
 #include "flexflow/model.h"
+#include "flexflow/page_manager.h"
 #include "flexflow/utils/file_loader.h"
 #include <condition_variable>
 #include <future>
@@ -148,6 +149,12 @@ struct Request {
   Status status = PENDING;
   std::vector<BatchConfig::TokenId> tokens;
 
+  // page attention, page_last_committed should be -1 because there are no
+  // blocks at the beginning
+  int page_last_committed = -1;
+  std::vector<LogicalTokenBlock> blocks;
+
+  // TokenTree speculative_token_tree;
   std::vector<TokenTree> speculative_token_trees;
   // To make request manager stateful, we need to store the causal mask here
   BatchConfig::BitMask causal_mask;
@@ -576,6 +583,16 @@ class RequestManager {
   void init_bitmask_spec(RequestGuid guid);
   BatchConfig::BitMask create_llm_bitmask(RequestGuid guid);
 
+  // Page Attention related
+  int get_num_blocks_allocated(Request &request) const;
+  int get_len_last_block(Request &request) const;
+  int get_idx_last_logical_token(Request &request) const;
+  int idx_logical_to_physical(Request &request, int idx_logical);
+  void _append_block_to_request(Request &request, bool is_commit);
+  int append_token_to_block(Request &request, TokenId token, bool is_commit);
+  void reset_block_table(Request &request);
+  void print_num_tokens(Request &request);
+
   // Token tree related
   void init_token_tree(RequestGuid guid);
   void add_root_to_spec_token_tree(RequestGuid guid,
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 5d3e8ed06..b647437bc 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -22,6 +22,7 @@
 #include "flexflow/ops/kernels/decompress_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h"
 #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh"
+#include "flexflow/page_manager.h"
 #include "flexflow/utils/cuda_helper.h"
 #include <math_constants.h>
 
@@ -494,9 +495,26 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
     size_t max_num_pages =
         round_up_pages(BatchConfig::max_sequence_length() +
                        BatchConfig::max_spec_tree_token_num());
+    PageManager *pm = PageManager::get_page_manager();
+    size_t total_kv_cache_size_per_layer = pm->get_kv_cache_size_per_layer();
     switch (infer_mode) {
       case INC_DECODING_MODE:
-      case TREE_VERIFY_MODE:
+      case TREE_VERIFY_MODE: {
+        query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
+        // a K-ary tree max node is (k^n - 1) / 2
+        if (total_kv_cache_size_per_layer == 0) {
+          key_cache_size = num_kv_heads * qk_dim *
+                           BatchConfig::max_requests_per_batch() *
+                           max_num_pages * kPagesize;
+          value_cache_size = num_kv_heads * v_dim *
+                             BatchConfig::max_requests_per_batch() *
+                             max_num_pages * kPagesize;
+        } else {
+          key_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
+          value_cache_size = total_kv_cache_size_per_layer / 2 / size_of_dt;
+        }
+        break;
+      }
       case TREE_SEARCH_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6911bb3a3..6c5b58270 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -49,8 +49,6 @@ using flashinfer::QKVLayout;
 
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
-    int32_t *kv_indptr,
-    int32_t *kv_page_indices,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     bool const *request_available,
     int num_requests,
@@ -79,10 +77,9 @@ __global__ void commit_tokens_kernel(
         continue;
       }
 
-      int const start = kv_indptr[request_compact_idx];
-      int const page_to_idx = kv_page_indices[start + committedTokenInfos[i].token_depth / kPagesize];
+      int const page_to_idx = committedTokenInfos[i].token_depth / kPagesize;
       int const page_from_idx =
-          kv_page_indices[start + committedTokenInfos[i].index_in_kv_cache / kPagesize];
+          committedTokenInfos[i].index_in_kv_cache / kPagesize;
 
       size_t from_k_idx = get_k_entry_offset_verify(
                  committedTokenInfos[i].index_in_kv_cache,
@@ -129,8 +126,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                          0,
                          stream>>>(
       static_cast<half *>(m->kvCache),
-      m->handle.tree_verify_attention_metadata->kv_indptr,
-      m->handle.tree_verify_attention_metadata->kv_indices,
       m->committed_token_infos,
       m->request_available,
       num_requests,
diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc
new file mode 100644
index 000000000..7fbb16bcd
--- /dev/null
+++ b/src/runtime/page_manager.cc
@@ -0,0 +1,246 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flexflow/page_manager.h"
+
+namespace FlexFlow {
+
+// For all runtime functions, they share a single page manager for pages
+// information
+PageManager *page_manager_singleton = nullptr;
+
+// the interface of logicaltokenblock
+LogicalTokenBlock::LogicalTokenBlock(int block_number, uint32_t block_size)
+    : block_number(block_number), block_size(block_size), num_tokens(0),
+      num_commit_tokens(0), num_spec_tokens(0) {}
+
+bool LogicalTokenBlock::is_empty() const {
+  assert(num_spec_tokens == 0 && num_commit_tokens == 0);
+  assert(num_tokens <= block_size);
+  return num_tokens == 0;
+}
+
+bool LogicalTokenBlock::is_full() const {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return num_tokens == block_size;
+}
+
+int LogicalTokenBlock::get_num_empty_slots() const {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return block_size - num_tokens;
+}
+
+int LogicalTokenBlock::get_num_alloc_slots() const {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  return num_tokens;
+}
+
+void LogicalTokenBlock::reset_num_spec_tokens() {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+
+  num_tokens -= num_spec_tokens;
+  num_spec_tokens = 0;
+
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+}
+
+void LogicalTokenBlock::append_tokens(
+    std::vector<TokenId> const &token_ids_to_append, bool committed) {
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+  if (num_tokens + token_ids_to_append.size() > block_size) {
+    printf("block is full! Cannot append more tokens\n");
+    throw std::runtime_error("Block is full! Cannot append more tokens.");
+  }
+  token_ids.insert(
+      token_ids.end(), token_ids_to_append.begin(), token_ids_to_append.end());
+  num_tokens += token_ids_to_append.size();
+  if (committed) {
+    num_commit_tokens += token_ids_to_append.size();
+  } else {
+    num_spec_tokens += token_ids_to_append.size();
+  }
+  assert(num_spec_tokens + num_commit_tokens == num_tokens);
+  assert(num_tokens <= block_size);
+}
+
+std::vector<TokenId> LogicalTokenBlock::get_token_ids() const {
+  return token_ids;
+}
+
+PhysicalTokenBlock::PhysicalTokenBlock(int block_number, int block_size)
+    : block_number(block_number), block_size(block_size), ref_count(0) {}
+
+BlockAllocator::BlockAllocator(int block_size, int num_total_blocks) {
+  for (int block_number = 0; block_number < num_total_blocks; ++block_number) {
+    free_blocks.push_back(PhysicalTokenBlock(block_number, block_size));
+  }
+  num_total_blocks = num_total_blocks;
+}
+
+// Allocate a block
+PhysicalTokenBlock BlockAllocator::allocate() {
+  if (free_blocks.empty()) {
+    printf("no free blocks are available\n");
+    throw std::runtime_error("Out of memory! No free blocks are available.");
+  }
+  PhysicalTokenBlock block = free_blocks.front();
+  free_blocks.pop_front();
+  block.incr_ref_count();
+  return block;
+}
+
+// Free a block
+void BlockAllocator::free(PhysicalTokenBlock &block) {
+  if (block.ref_count == 0) {
+    printf("block is already freed\n");
+    throw std::runtime_error("Double free! Block is already freed.");
+  }
+  block.decr_ref_count();
+  if (block.ref_count == 0) {
+    free_blocks.push_back(block);
+  } else {
+    // in current implementation this should not be the case
+    printf("block is not freed. Ref count: %d\n", block.ref_count);
+    throw std::runtime_error("Block is not freed. Ref count: " +
+                             std::to_string(block.ref_count));
+  }
+}
+
+int BlockAllocator::get_num_free_blocks() const {
+  return free_blocks.size();
+}
+
+PageManager::PageManager(int block_size, size_t num_total_blocks)
+    : block_size(block_size), num_total_blocks(num_total_blocks),
+      block_allocator(block_size, num_total_blocks) {}
+
+// return the physical number of this block
+int PageManager::allocate_one_block(RequestGuid const &request_guid) {
+  BlockTable &block_table = block_tables[request_guid];
+
+  PhysicalTokenBlock block = block_allocator.allocate();
+  block_table.push_back(block);
+  block_tables[request_guid] = block_table;
+  return block.get_block_number();
+}
+
+void PageManager::free_block_table(BlockTable &block_table) {
+  // make it reverse order to free the last allocated block first
+  BlockTable::reverse_iterator rit = block_table.rbegin();
+  for (; rit != block_table.rend(); ++rit) {
+    block_allocator.free(*rit);
+  }
+  return;
+}
+
+void PageManager::free_request(RequestGuid const &request_guid) {
+  // we only free the blocks that are already used
+  BlockTable block_table = block_tables[request_guid];
+  free_block_table(block_table);
+  block_tables.erase(request_guid);
+  return;
+}
+
+// delete the last num_blocks in the request_guid
+void PageManager::free_multiple_blocks(RequestGuid const &request_guid,
+                                       int num_blocks) {
+  // assert(block_tables.find(request_guid) != block_tables.end());
+  auto &block_table = block_tables[request_guid];
+  // assert(num_blocks <= block_table.size());
+  int num_blocks_allocated = block_table.size();
+  for (int i = 0; i < num_blocks; i++) {
+    block_allocator.free(block_table[num_blocks_allocated - i - 1]);
+  }
+  // only keep the first num_blocks_allocated - num_blocks blocks
+  block_table.erase(block_table.begin() + num_blocks_allocated - num_blocks,
+                    block_table.end());
+  block_tables[request_guid] = block_table;
+  return;
+}
+
+std::vector<int> PageManager::get_block_table_indices(
+    RequestGuid const &request_guid) const {
+  std::vector<int> indices;
+  auto const &it = block_tables.find(request_guid);
+  if (it == block_tables.end()) {
+    return indices;
+  }
+  auto const &block_table = it->second;
+  for (auto const &block : block_table) {
+    indices.push_back(block.get_block_number());
+  }
+  return indices;
+}
+
+int PageManager::get_num_total_free_blocks() const {
+  return block_allocator.get_num_free_blocks();
+}
+
+int PageManager::get_num_allocated_blocks(
+    RequestGuid const &request_guid) const {
+  auto it = block_tables.find(request_guid);
+  if (it == block_tables.end()) {
+    return 0;
+  } else {
+    return it->second.size();
+  }
+}
+
+PageManager *PageManager::get_page_manager(FFModel *ff,
+                                           size_t total_kv_cache_size) {
+  int num_kv_heads = ff->num_kv_heads;
+  int size_dt = ff->size_dt;
+  int qkv_dim = ff->qkv_dim;
+  int num_transformer_layers = ff->num_transformer_layers;
+  int pipeline_parallelism_degree = ff->config.pipeline_parallelism_degree;
+  assert(num_kv_heads > 0 && size_dt > 0 && qkv_dim > 0 &&
+         num_transformer_layers > 0 &&
+         pipeline_parallelism_degree >
+             0); // needs to make sure that the model is initialized
+  if (page_manager_singleton == nullptr) {
+    size_t num_total_blocks = 0;
+    if (total_kv_cache_size == 0) {
+      num_total_blocks = (BatchConfig::max_spec_tree_token_num() +
+                          BatchConfig::max_sequence_length() + kPagesize - 1) /
+                         kPagesize * BatchConfig::max_requests_per_batch();
+    } else {
+      num_total_blocks = total_kv_cache_size * 1024 * 1024 / size_dt / qkv_dim /
+                         num_kv_heads / num_transformer_layers / kPagesize;
+    }
+    printf("page manager singleton is initialized with %d blocks\n",
+           num_total_blocks);
+    page_manager_singleton = new PageManager(kPagesize, num_total_blocks);
+    page_manager_singleton->kv_cache_size_per_layer =
+        total_kv_cache_size * 1024 * 1024 / num_transformer_layers;
+  }
+  return page_manager_singleton;
+}
+
+size_t PageManager::get_kv_cache_size_per_layer() {
+  return kv_cache_size_per_layer;
+}
+
+PageManager *PageManager::get_page_manager() {
+  assert(page_manager_singleton != nullptr);
+  return page_manager_singleton;
+}
+
+}; // namespace FlexFlow
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index aec2c8013..47c394f7e 100755
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -847,6 +847,10 @@ void RequestManager::request_complete_clean_up(int batch_index) {
   num_available_requests--;
   request.status = Request::COMPLETED;
 
+  // page attention: free the pages
+  PageManager *page_manager = PageManager::get_page_manager();
+  page_manager->free_request(guid);
+
   // Find the sos and eos in the sequence
   // auto bos_it = std::find(
   //     request.tokens.begin(), request.tokens.end(), this->bos_token_id);
@@ -1287,6 +1291,9 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
   assert(prefilling_requests.size() > 0 &&
          "No prefilling request to process in the prefilling phase.");
 
+  // get page manager
+  PageManager *page_manager = PageManager::get_page_manager();
+
   BatchConfig bc;
   if (decoding_mode == INCREMENTAL_DECODING) {
     bc.inference_mode = InferenceMode::INC_DECODING_MODE;
@@ -1328,12 +1335,16 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       assert(request->llm_prefill_len + idx < request->tokens.size());
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
+
+      append_token_to_block(
+          *request, request->tokens[request->llm_prefill_len + idx], true);
     }
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
       bc.request_available[request_index] = true;
     }
+    bc.requestsInfo[request_index].request_guid = request->guid;
 
     // Record prefilling start time. We don't do this for speculative decoding,
     // because in that case we start the timer in the ssm prefilling Step idx
@@ -1471,6 +1482,7 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
+    bc.requestsInfo[request_index].request_guid = request.guid;
 
     bc.num_tokens++;
 
@@ -1943,6 +1955,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
             std::begin(new_bc.request_available));
   new_bc.num_available_requests = num_available_requests;
 
+  // get page manager
+  PageManager *page_manager = PageManager::get_page_manager();
+
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -1952,6 +1967,12 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     Request &request = all_requests[guid];
     assert(request.status == Request::RUNNING);
 
+    // before commit token, reset the pages assigned by cleaning all the tokens
+    std::vector<int> block_table_before_commit =
+        page_manager->get_block_table_indices(guid);
+    // also need to reset the pages
+    reset_block_table(request);
+
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size() - 1; // Exclude the last token
@@ -1970,12 +1991,20 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
          committed_token_index++) {
       Request::CommittedToken &committed_token =
           committed_tokens.at(committed_token_index);
+
+      int idx_to_physical =
+          append_token_to_block(request, committed_token.token_id, true);
+      int idx_from_logical = committed_token.from_index;
+      int idx_from_physical =
+          block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
+          idx_from_logical % kPagesize;
+
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].index_in_kv_cache =
-          committed_token.from_index;
+          idx_from_physical;
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].token_depth =
-          committed_token.to_index;
+          idx_to_physical;
       new_bc.num_tokens_to_commit++;
     }
 
@@ -1995,6 +2024,9 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
           new_bc.tokensInfo[new_bc.num_tokens].token_id = tree_node->id;
           new_bc.num_tokens++;
           token_tree_index++;
+
+          // Append the token to the block
+          append_token_to_block(request, tree_node->id, false);
         }
       }
       layer_index++;
@@ -2016,6 +2048,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
 
     // Copy the streaming cache info
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
+    new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
   if (verbose) {
@@ -2371,6 +2404,103 @@ BatchConfig::BitMask RequestManager::create_llm_bitmask(RequestGuid guid) {
   return llm_bitmask;
 }
 
+/* --------- Page Attention Related Functions --------- */
+int RequestManager::get_num_blocks_allocated(Request &request) const {
+  // needs some assertion
+  return request.blocks.size();
+}
+
+int RequestManager::get_len_last_block(Request &request) const {
+  int num_tokens = request.blocks.back().get_num_tokens();
+  if (request.blocks.empty()) {
+    return 0;
+  }
+  return request.blocks.back().get_num_tokens();
+}
+
+// get the index of the last token in the request
+int RequestManager::get_idx_last_logical_token(Request &request) const {
+  if (request.blocks.empty()) {
+    printf("Error: request.blocks is empty\n");
+    return -1;
+  } else {
+    return (request.blocks.size() - 1) * kPagesize +
+           request.blocks.back().get_num_tokens() - 1;
+  }
+}
+
+int RequestManager::idx_logical_to_physical(Request &request, int idx_logical) {
+  // get physical indices
+  PageManager *page_manager = PageManager::get_page_manager();
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
+  return block_table_indices[idx_logical / kPagesize] * kPagesize +
+         idx_logical % kPagesize;
+}
+
+// this will allocate one logical block and one physical block to the request
+void RequestManager::_append_block_to_request(Request &request,
+                                              bool is_commit) {
+  PageManager *page_manager = PageManager::get_page_manager();
+  // Append the logical block to the request
+  // page attention: in this function we need to remember the last logical block
+  // number that still contains committed tokens
+  LogicalTokenBlock block(request.blocks.size(), kPagesize);
+  request.blocks.push_back(block);
+  page_manager->allocate_one_block(request.guid);
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
+  // update page_id_commit
+  if (is_commit) {
+    request.page_last_committed++;
+    int size_blocks = request.blocks.size();
+  }
+}
+
+// this function is used for appending a token to the last logical block and
+// also the last physical block it will return the physical position of this
+// token
+int RequestManager::append_token_to_block(Request &request,
+                                          TokenId token,
+                                          bool is_commit) {
+  PageManager *page_manager = PageManager::get_page_manager();
+  if (request.blocks.empty() || request.blocks.back().is_full()) {
+    // Append a new logical block
+    _append_block_to_request(request, is_commit);
+    // also allocate one physical page
+  }
+  // insert token to both logical block and physical block
+  request.blocks.back().append_tokens({token}, is_commit);
+  int idx_logical = get_idx_last_logical_token(request);
+  assert(idx_logical >= 0);
+  int idx_physical = idx_logical_to_physical(request, idx_logical);
+  assert(idx_physical >= 0);
+  return idx_physical;
+}
+
+void RequestManager::reset_block_table(Request &request) {
+  // get the indices of original physical block table for request
+  PageManager *page_manager = PageManager::get_page_manager();
+  assert(request.page_last_committed < static_cast<int>(request.blocks.size()));
+  std::vector<int> block_table_indices =
+      page_manager->get_block_table_indices(request.guid);
+  // reset the block table according to the request's page_last_commit
+  page_manager->free_multiple_blocks(request.guid,
+                                     block_table_indices.size() -
+                                         request.page_last_committed - 1);
+  // reset this request's logical block table
+  if (request.page_last_committed < static_cast<int>(request.blocks.size())) {
+    request.blocks.erase(request.blocks.begin() + request.page_last_committed +
+                             1,
+                         request.blocks.end());
+  }
+  request.blocks.back().reset_num_spec_tokens();
+  // the indices of block table should be the same as the number of blocks
+  std::vector<int> block_table =
+      page_manager->get_block_table_indices(request.guid);
+  return;
+}
+
 /* --------- Bitmask Related Functions --------- */
 void RequestManager::gumbel_conditioned_on_max(
     double target_max, std::vector<std::pair<double, int>> &logits) {
@@ -2824,6 +2954,9 @@ void RequestManager::background_serving_task(
       ssm->config.lg_ctx = ctx;
     }
   }
+  // page attention: initalize the page manager here
+  int kv_cache_size = rm->get_max_kv_cache_size();
+  PageManager::get_page_manager(llm, rm->get_max_kv_cache_size());
   if (rm->decoding_mode == INCREMENTAL_DECODING) {
     // No SSMs: perform incremental decoding
     rm->serve_decoding(llm);
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 42a1cca73..23b54d0c6 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -78,6 +78,7 @@ void RequestManager::load_tokens_task(
 }
 
 void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
+                                       PageManager *pm,
                                        AttentionMetaData *attention_metadata,
                                        cudaStream_t stream,
                                        uint32_t const max_num_pages,
@@ -109,8 +110,10 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
       kv_indptr_h[indptr_idx + 1] = round_up_pages(kv_len) +
           kv_indptr_h[indptr_idx];
+      std::vector<int32_t> kv_indices = pm->get_block_table_indices(
+          batch_config->requestsInfo[req_idx].request_guid);
       for (int i = indices_offset; i < indices_lens; i++) {
-        kv_indices_h[i] = max_num_pages * req_idx + (i - indices_offset);
+        kv_indices_h[i] = kv_indices[i - indices_offset];
       }
       kv_last_page_len_h[indptr_idx] = (kv_len - 1) % kPagesize + 1;
       qk_indptr_h[indptr_idx + 1] = qk_lens;
@@ -425,6 +428,7 @@ void RequestManager::load_batch_config_task(
 
   // load attention metadata
   if (batch_config->get_mode() == INC_DECODING_MODE) {
+    PageManager *pm = PageManager::get_page_manager();
     static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
         kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS *
@@ -434,20 +438,13 @@ void RequestManager::load_batch_config_task(
     if (handle.incr_attention_metadata->enabled()) {
       // calculate the attention meta data
       {
-        BatchConfig::PerRequestInfo *request_infos =
-            reinterpret_cast<BatchConfig::PerRequestInfo *>(
-                static_cast<char *>(handle.batch_config_metadata) +
-                sizeof(BatchConfig::tokensInfo));
-        bool *request_available = reinterpret_cast<bool *>(
-            static_cast<char *>(handle.batch_config_metadata) +
-            sizeof(BatchConfig::tokensInfo) +
-            sizeof(BatchConfig::requestsInfo));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +
                            BatchConfig::max_spec_tree_token_num());
 
         prepare_inference_params_kernel_h(batch_config,
+                                          pm,
                                           handle.incr_attention_metadata,
                                           stream,
                                           max_num_pages,
@@ -650,6 +647,7 @@ void RequestManager::load_batch_config_task(
       }
     }
   } else if (batch_config->get_mode() == TREE_VERIFY_MODE) {
+    PageManager *pm = PageManager::get_page_manager();
     static int32_t q_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1],
         kv_indptr_h[BatchConfig::MAX_NUM_REQUESTS + 1];
     static int32_t kv_indices_h[BatchConfig::MAX_NUM_REQUESTS *
@@ -687,6 +685,7 @@ void RequestManager::load_batch_config_task(
 
         // int parallelism = batch_size;
         prepare_inference_params_kernel_h(batch_config,
+                                          pm,
                                           handle.tree_verify_attention_metadata,
                                           stream,
                                           max_num_pages,

From 2a751fda6709848feeacfceb6440869b3a50232a Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 26 Jan 2025 20:27:02 +0000
Subject: [PATCH 663/667] style: format code

---
 include/flexflow/batch_config.h               |  4 +++-
 .../inc_multihead_self_attention_kernels.h    |  6 ++---
 src/ops/inc_multihead_self_attention.cu       | 22 +++++++++----------
 .../inc_multihead_self_attention_kernels.cu   |  6 ++---
 src/ops/tree_inc_multihead_self_attention.cu  | 17 +++++++-------
 src/runtime/request_manager.cu                |  4 ++--
 6 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 9397e7e2f..76521e5cf 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -118,7 +118,9 @@ class BatchConfig {
 
     static constexpr size_t request_guid_size = sizeof(RequestGuid);
     static constexpr size_t alignment = 16;
-    static constexpr size_t padding_size = (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) % alignment;
+    static constexpr size_t padding_size =
+        (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) %
+        alignment;
     static constexpr size_t padding_length = padding_size / sizeof(int);
     int padding[padding_length] = {}; // Padding for memory pointer alignment
   };
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 16fe78cc1..4c66c1f2c 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -114,9 +114,9 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream,
-                         bool is_spec);
+                               BatchConfig const *bc,
+                               cudaStream_t stream,
+                               bool is_spec);
 
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index b647437bc..30c0586a5 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -519,8 +519,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
-                         BatchConfig::max_requests_per_batch() *
-                         max_num_pages * kPagesize;
+                         BatchConfig::max_requests_per_batch() * max_num_pages *
+                         kPagesize;
         value_cache_size = num_kv_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
@@ -530,22 +530,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         assert(false && "Unkown inference mode");
     }
     if (streaming_cache) {
-      size_t max_post_pos_enc_pages =
-          round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                          BatchConfig::get_max_tree_depth() +
-                          max(max_tokens_per_batch,
-                              BatchConfig::max_spec_tree_token_num()));
+      size_t max_post_pos_enc_pages = round_up_pages(
+          BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
+          max(max_tokens_per_batch, BatchConfig::max_spec_tree_token_num()));
       key_cache_size = num_kv_heads * qk_dim *
-                        BatchConfig::max_requests_per_batch() *
-                        max_post_pos_enc_pages * kPagesize;
+                       BatchConfig::max_requests_per_batch() *
+                       max_post_pos_enc_pages * kPagesize;
       value_cache_size = num_kv_heads * v_dim *
-                          BatchConfig::max_requests_per_batch() *
-                          max_post_pos_enc_pages * kPagesize;
+                         BatchConfig::max_requests_per_batch() *
+                         max_post_pos_enc_pages * kPagesize;
       streaming_pre_pos_enc_size =
           num_kv_heads * (qk_dim + v_dim) *
           BatchConfig::max_requests_per_batch() *
           round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                          BatchConfig::get_max_tree_depth()) *
+                         BatchConfig::get_max_tree_depth()) *
           kPagesize;
     }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index dc75eb4ee..9bb58794a 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -600,9 +600,9 @@ __global__ void update_qkv_in_batch_paged_kernel(
 
 template <typename DT>
 void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream,
-                         bool is_spec) {
+                               BatchConfig const *bc,
+                               cudaStream_t stream,
+                               bool is_spec) {
   // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6c5b58270..058e223c4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -124,15 +124,14 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
-                         stream>>>(
-      static_cast<half *>(m->kvCache),
-      m->committed_token_infos,
-      m->request_available,
-      num_requests,
-      m->num_kv_heads,
-      m->qk_dim,
-      m->num_tokens_to_commit,
-      max_num_pages);
+                         stream>>>(static_cast<half *>(m->kvCache),
+                                   m->committed_token_infos,
+                                   m->request_available,
+                                   num_requests,
+                                   m->num_kv_heads,
+                                   m->qk_dim,
+                                   m->num_tokens_to_commit,
+                                   max_num_pages);
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
   //   float elapsed = 0;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 23b54d0c6..be09ee7b2 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -108,8 +108,8 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       indices_offset = indices_lens;
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
       q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
-      kv_indptr_h[indptr_idx + 1] = round_up_pages(kv_len) +
-          kv_indptr_h[indptr_idx];
+      kv_indptr_h[indptr_idx + 1] =
+          round_up_pages(kv_len) + kv_indptr_h[indptr_idx];
       std::vector<int32_t> kv_indices = pm->get_block_table_indices(
           batch_config->requestsInfo[req_idx].request_guid);
       for (int i = indices_offset; i < indices_lens; i++) {

From 3ed67e4bcba2322a6529b9114be4720bacbc508e Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 26 Jan 2025 13:07:55 +0000
Subject: [PATCH 664/667] fix: minor

---
 .../inc_multihead_self_attention_kernels.h    |  2 +-
 src/ops/inc_multihead_self_attention.cu       | 43 ++++++++++---------
 .../inc_multihead_self_attention_kernels.cu   |  6 +--
 src/ops/tree_inc_multihead_self_attention.cu  |  2 +-
 src/runtime/request_manager.cc                |  4 +-
 5 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 9f886ffec..16fe78cc1 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -113,7 +113,7 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
                          cudaStream_t stream);
 
 template <typename DT>
-void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream,
                          bool is_spec);
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index ac3a4b24e..9af7c09cb 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -275,7 +275,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m,
     apply_pos_encoding_to_tokens_in_batch(
         m, bc, static_cast<DT *>(m->devQKVProjArray), stream);
     // Move the batch qkv values to where took by attention
-    update_qkv_in_batch<DT>(m, bc, stream, false);
+    update_qkv_in_batch_paged<DT>(m, bc, stream, false);
   }
 
   // phase 4: Attention computation
@@ -515,7 +515,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         }
         break;
       }
-      case TREE_SEARCH_MODE:
+      case TREE_SEARCH_MODE: {
         query_tmp_size = num_q_heads * qk_dim * max_tokens_per_batch;
         // a K-ary tree max node is (k^n - 1) / 2
         key_cache_size = num_kv_heads * qk_dim *
@@ -524,29 +524,30 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         value_cache_size = num_kv_heads * v_dim *
                            BatchConfig::max_requests_per_batch() *
                            max_num_pages * kPagesize;
-        if (streaming_cache) {
-          size_t max_post_pos_enc_pages =
-              round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                             BatchConfig::get_max_tree_depth() +
-                             max(max_tokens_per_batch,
-                                 BatchConfig::max_spec_tree_token_num()));
-          key_cache_size = num_kv_heads * qk_dim *
-                           BatchConfig::max_requests_per_batch() *
-                           max_post_pos_enc_pages * kPagesize;
-          value_cache_size = num_kv_heads * v_dim *
-                             BatchConfig::max_requests_per_batch() *
-                             max_post_pos_enc_pages * kPagesize;
-          streaming_pre_pos_enc_size =
-              num_kv_heads * (qk_dim + v_dim) *
-              BatchConfig::max_requests_per_batch() *
-              round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                             BatchConfig::get_max_tree_depth()) *
-              kPagesize;
-        }
         break;
+      }
       default:
         assert(false && "Unkown inference mode");
     }
+    if (streaming_cache) {
+      size_t max_post_pos_enc_pages =
+          round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                          BatchConfig::get_max_tree_depth() +
+                          max(max_tokens_per_batch,
+                              BatchConfig::max_spec_tree_token_num()));
+      key_cache_size = num_kv_heads * qk_dim *
+                        BatchConfig::max_requests_per_batch() *
+                        max_post_pos_enc_pages * kPagesize;
+      value_cache_size = num_kv_heads * v_dim *
+                          BatchConfig::max_requests_per_batch() *
+                          max_post_pos_enc_pages * kPagesize;
+      streaming_pre_pos_enc_size =
+          num_kv_heads * (qk_dim + v_dim) *
+          BatchConfig::max_requests_per_batch() *
+          round_up_pages(BatchConfig::MAX_STREAMING_POS -
+                          BatchConfig::get_max_tree_depth()) *
+          kPagesize;
+    }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
     size_t output_tmp_size = max_tokens_per_batch * num_q_heads * v_dim;
     size_t complex_size =
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index 55edaed6b..dc75eb4ee 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -599,7 +599,7 @@ __global__ void update_qkv_in_batch_paged_kernel(
 }
 
 template <typename DT>
-void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
+void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
                          BatchConfig const *bc,
                          cudaStream_t stream,
                          bool is_spec) {
@@ -1050,13 +1050,13 @@ template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
     BatchConfig const *bc,
     cudaStream_t stream);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<float>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged<float>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream,
     bool is_spec);
 
-template void Kernels::IncMultiHeadAttention::update_qkv_in_batch<half>(
+template void Kernels::IncMultiHeadAttention::update_qkv_in_batch_paged<half>(
     IncMultiHeadSelfAttentionMeta const *m,
     BatchConfig const *bc,
     cudaStream_t stream,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index a5c98e414..dc1860310 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -428,7 +428,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
   //   cudaEventRecord(t_start, stream);
 
   // Update key-val cache, compact q array
-  update_qkv_in_batch<DT>(m, bc, stream, true);
+  update_qkv_in_batch_paged<DT>(m, bc, stream, true);
 
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index b608b6e94..7be8cbc1a 100755
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1396,8 +1396,6 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
   int num_tokens = 0;
   for (Request *request : prefilling_requests) {
     int request_index = request->batch_index;
-    // Only set the prefilling request to be available
-    bc.request_available[request_index] = true;
 
     // Request Info
     bc.requestsInfo[request_index].first_token_offset_in_batch = num_tokens;
@@ -1430,6 +1428,8 @@ BatchConfig RequestManager::prepare_ssm_prefilling_batch() {
     num_tokens += num_tokens_in_batch;
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
+      // Only set the prefilling request to be available
+      bc.request_available[request_index] = true;
     }
 
     // Record prefilling start time

From 69b9f7210b26116538920b934d309b8bd6a04ba5 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 26 Jan 2025 20:13:57 +0000
Subject: [PATCH 665/667] fix: merge page_manager, also fix some issues

---
 src/ops/tree_inc_multihead_self_attention.cu |  4 ---
 src/runtime/request_manager.cc               | 27 +-------------------
 src/runtime/request_manager.cu               | 14 ----------
 3 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index dc1860310..6c5b58270 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -49,8 +49,6 @@ using flashinfer::QKVLayout;
 
 __global__ void commit_tokens_kernel(
     half *kCache_ptr,
-    int32_t *kv_indptr,
-    int32_t *kv_page_indices,
     BatchConfig::CommittedTokensInfo const *committedTokenInfos,
     bool const *request_available,
     int num_requests,
@@ -128,8 +126,6 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
                          0,
                          stream>>>(
       static_cast<half *>(m->kvCache),
-      m->handle.tree_verify_attention_metadata->kv_indptr,
-      m->handle.tree_verify_attention_metadata->kv_indices,
       m->committed_token_infos,
       m->request_available,
       num_requests,
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 7be8cbc1a..f5af21a55 100755
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -1337,7 +1337,6 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
       bc.tokensInfo[token_idx].token_id =
           request->tokens[request->llm_prefill_len + idx];
 
-      assert(request->llm_prefill_len + idx < request->tokens.size());
       append_token_to_block(
           *request, request->tokens[request->llm_prefill_len + idx], true);
     }
@@ -1345,15 +1344,6 @@ BatchConfig RequestManager::prepare_llm_prefilling_batch() {
     if (num_tokens_in_batch > 0) {
       bc.num_available_requests++;
     }
-    // update related page info in batch config
-    bc.requestsInfo[request_index].num_kv_pages =
-        get_num_blocks_allocated(*request);
-    if (bc.requestsInfo[request_index].num_kv_pages == 0) {
-      // turn this request into not available for one round
-      bc.request_available[request_index] = false;
-    }
-    bc.requestsInfo[request_index].kv_last_page_len =
-        get_len_last_block(*request);
     bc.requestsInfo[request_index].request_guid = request->guid;
 
     // Record prefilling start time. We don't do this for speculative decoding,
@@ -1492,13 +1482,6 @@ BatchConfig RequestManager::prepare_decoding_batch() {
     bc.tokensInfo[bc.num_tokens].abs_index_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].abs_depth_in_request = request.llm_cache_size;
     bc.tokensInfo[bc.num_tokens].token_id = request.tokens.back();
-    // append the token here
-    int idx_to_physical =
-        append_token_to_block(request, request.tokens.back(), true);
-    bc.requestsInfo[request_index].num_kv_pages =
-        get_num_blocks_allocated(request);
-    bc.requestsInfo[request_index].kv_last_page_len =
-        get_len_last_block(request);
     bc.requestsInfo[request_index].request_guid = request.guid;
 
     bc.num_tokens++;
@@ -1990,8 +1973,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
     // also need to reset the pages
     reset_block_table(request);
 
-    int token_offset = request.first_token_offset_in_batch;
-
     // 1. Maintain requestsInfo
     new_bc.requestsInfo[request_index].first_token_index_in_request =
         request.tokens.size() - 1; // Exclude the last token
@@ -2016,7 +1997,7 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
       int idx_from_logical = committed_token.from_index;
       int idx_from_physical =
           block_table_before_commit[idx_from_logical / kPagesize] * kPagesize +
-          committed_token.from_index % kPagesize;
+          idx_from_logical % kPagesize;
 
       new_bc.committed_tokens[new_bc.num_tokens_to_commit].request_index =
           request_index;
@@ -2068,12 +2049,6 @@ BatchConfig RequestManager::prepare_verify_batch_config() {
 
     // Copy the streaming cache info
     new_bc.streamingCacheInfo[request_index] = request.streaming_cache_info;
-
-    // page attention information
-    new_bc.requestsInfo[request_index].num_kv_pages =
-        get_num_blocks_allocated(request);
-    new_bc.requestsInfo[request_index].kv_last_page_len =
-        get_len_last_block(request);
     new_bc.requestsInfo[request_index].request_guid = request.guid;
   }
 
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 903dadfcf..0033441dc 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -111,14 +111,8 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       kv_indptr_h[indptr_idx + 1] =
           batch_config->requestsInfo[req_idx].num_kv_pages +
           kv_indptr_h[indptr_idx];
-      assert(kv_indptr_h[indptr_idx] >= 0);
-
-      assert(batch_config->requestsInfo[req_idx].num_kv_pages ==
-             (kv_len + kPagesize - 1) / kPagesize);
-      assert(batch_config->requestsInfo[req_idx].kv_last_page_len <= kPagesize);
       std::vector<int32_t> kv_indices = pm->get_block_table_indices(
           batch_config->requestsInfo[req_idx].request_guid);
-      assert(kv_indices.size() == (kv_len + kPagesize - 1) / kPagesize);
       for (int i = indices_offset; i < indices_lens; i++) {
         kv_indices_h[i] = kv_indices[i - indices_offset];
       }
@@ -446,14 +440,6 @@ void RequestManager::load_batch_config_task(
     if (handle.incr_attention_metadata->enabled()) {
       // calculate the attention meta data
       {
-        BatchConfig::PerRequestInfo *request_infos =
-            reinterpret_cast<BatchConfig::PerRequestInfo *>(
-                static_cast<char *>(handle.batch_config_metadata) +
-                sizeof(BatchConfig::tokensInfo));
-        bool *request_available = reinterpret_cast<bool *>(
-            static_cast<char *>(handle.batch_config_metadata) +
-            sizeof(BatchConfig::tokensInfo) +
-            sizeof(BatchConfig::requestsInfo));
         int batch_size = batch_config->num_active_requests();
         uint32_t const max_num_pages =
             round_up_pages(BatchConfig::max_sequence_length() +

From e0eca51295fa60d91aa579ff18ad7e60e816bfb7 Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Sun, 26 Jan 2025 20:27:02 +0000
Subject: [PATCH 666/667] style: format code

---
 include/flexflow/batch_config.h                | 11 ++++++++---
 .../inc_multihead_self_attention_kernels.h     |  6 +++---
 src/ops/inc_multihead_self_attention.cu        | 18 ++++++++----------
 .../inc_multihead_self_attention_kernels.cu    |  6 +++---
 src/ops/tree_inc_multihead_self_attention.cu   | 17 ++++++++---------
 src/runtime/request_manager.cu                 |  3 +--
 6 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index 774039b4f..76521e5cf 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -114,10 +114,15 @@ class BatchConfig {
     int first_token_index_in_request = -1;
     int first_token_offset_in_batch = -1;
     int num_tokens_in_batch = 0;
-    int padding = 0;      // Padding for memory pointer alignment
-    int num_kv_pages;     // number of kv pages used
-    int kv_last_page_len; // last page length of kv
     RequestGuid request_guid;
+
+    static constexpr size_t request_guid_size = sizeof(RequestGuid);
+    static constexpr size_t alignment = 16;
+    static constexpr size_t padding_size =
+        (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) %
+        alignment;
+    static constexpr size_t padding_length = padding_size / sizeof(int);
+    int padding[padding_length] = {}; // Padding for memory pointer alignment
   };
 
   struct PerTokenInfo {
diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
index 16fe78cc1..4c66c1f2c 100644
--- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
+++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h
@@ -114,9 +114,9 @@ void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m,
 
 template <typename DT>
 void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream,
-                         bool is_spec);
+                               BatchConfig const *bc,
+                               cudaStream_t stream,
+                               bool is_spec);
 
 // [For the tokens in streaming cache]
 // Convert the out-of-order cache to in-order relative position.
diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu
index 9af7c09cb..30c0586a5 100644
--- a/src/ops/inc_multihead_self_attention.cu
+++ b/src/ops/inc_multihead_self_attention.cu
@@ -530,22 +530,20 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta(
         assert(false && "Unkown inference mode");
     }
     if (streaming_cache) {
-      size_t max_post_pos_enc_pages =
-          round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                          BatchConfig::get_max_tree_depth() +
-                          max(max_tokens_per_batch,
-                              BatchConfig::max_spec_tree_token_num()));
+      size_t max_post_pos_enc_pages = round_up_pages(
+          BatchConfig::MAX_STREAMING_POS - BatchConfig::get_max_tree_depth() +
+          max(max_tokens_per_batch, BatchConfig::max_spec_tree_token_num()));
       key_cache_size = num_kv_heads * qk_dim *
-                        BatchConfig::max_requests_per_batch() *
-                        max_post_pos_enc_pages * kPagesize;
+                       BatchConfig::max_requests_per_batch() *
+                       max_post_pos_enc_pages * kPagesize;
       value_cache_size = num_kv_heads * v_dim *
-                          BatchConfig::max_requests_per_batch() *
-                          max_post_pos_enc_pages * kPagesize;
+                         BatchConfig::max_requests_per_batch() *
+                         max_post_pos_enc_pages * kPagesize;
       streaming_pre_pos_enc_size =
           num_kv_heads * (qk_dim + v_dim) *
           BatchConfig::max_requests_per_batch() *
           round_up_pages(BatchConfig::MAX_STREAMING_POS -
-                          BatchConfig::get_max_tree_depth()) *
+                         BatchConfig::get_max_tree_depth()) *
           kPagesize;
     }
     size_t attn_heads_size = max_tokens_per_batch * num_q_heads * v_dim;
diff --git a/src/ops/kernels/inc_multihead_self_attention_kernels.cu b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
index dc75eb4ee..9bb58794a 100644
--- a/src/ops/kernels/inc_multihead_self_attention_kernels.cu
+++ b/src/ops/kernels/inc_multihead_self_attention_kernels.cu
@@ -600,9 +600,9 @@ __global__ void update_qkv_in_batch_paged_kernel(
 
 template <typename DT>
 void update_qkv_in_batch_paged(IncMultiHeadSelfAttentionMeta const *m,
-                         BatchConfig const *bc,
-                         cudaStream_t stream,
-                         bool is_spec) {
+                               BatchConfig const *bc,
+                               cudaStream_t stream,
+                               bool is_spec) {
   // printf("entered update_qkv_in_batch_verify\n");
   int num_new_tokens = bc->num_active_tokens();
   if (num_new_tokens == 0) {
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 6c5b58270..058e223c4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -124,15 +124,14 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m,
   commit_tokens_kernel<<<GET_BLOCKS(parallelism),
                          min(CUDA_NUM_THREADS, parallelism),
                          0,
-                         stream>>>(
-      static_cast<half *>(m->kvCache),
-      m->committed_token_infos,
-      m->request_available,
-      num_requests,
-      m->num_kv_heads,
-      m->qk_dim,
-      m->num_tokens_to_commit,
-      max_num_pages);
+                         stream>>>(static_cast<half *>(m->kvCache),
+                                   m->committed_token_infos,
+                                   m->request_available,
+                                   num_requests,
+                                   m->num_kv_heads,
+                                   m->qk_dim,
+                                   m->num_tokens_to_commit,
+                                   max_num_pages);
   //   cudaEventRecord(t_end, stream);
   //   checkCUDA(cudaEventSynchronize(t_end));
   //   float elapsed = 0;
diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu
index 0033441dc..b579bb73b 100644
--- a/src/runtime/request_manager.cu
+++ b/src/runtime/request_manager.cu
@@ -109,8 +109,7 @@ void prepare_inference_params_kernel_h(BatchConfig const *batch_config,
       indices_lens += (kv_len + kPagesize - 1) / kPagesize;
       q_indptr_h[indptr_idx + 1] = q_indptr_h[indptr_idx] + q_len;
       kv_indptr_h[indptr_idx + 1] =
-          batch_config->requestsInfo[req_idx].num_kv_pages +
-          kv_indptr_h[indptr_idx];
+          round_up_pages(kv_len) + kv_indptr_h[indptr_idx];
       std::vector<int32_t> kv_indices = pm->get_block_table_indices(
           batch_config->requestsInfo[req_idx].request_guid);
       for (int i = indices_offset; i < indices_lens; i++) {

From e2d6fc6abf7f8a4616cb0d4d1722749b6b2f826d Mon Sep 17 00:00:00 2001
From: fruitea <aetiurf@gmail.com>
Date: Fri, 31 Jan 2025 14:55:05 +0000
Subject: [PATCH 667/667] chore: remove outdated comments

---
 src/ops/add_bias_residual_layer_norm.cc      |  2 +-
 src/ops/aggregate.cc                         | 11 +++++-----
 src/ops/aggregate_spec.cc                    | 12 +++++-----
 src/ops/arg_topk.cc                          | 11 +++++-----
 src/ops/arg_topk.cpp                         | 21 +++++++++---------
 src/ops/arg_topk.cu                          | 23 ++++++++++----------
 src/ops/argmax.cc                            | 11 +++++-----
 src/ops/attention.cc                         |  2 +-
 src/ops/cast.cc                              | 11 +++++-----
 src/ops/element_binary.cc                    | 12 +++++-----
 src/ops/element_unary.cc                     | 12 +++++-----
 src/ops/embedding.cc                         | 11 +++++-----
 src/ops/experts.cc                           | 11 +++++-----
 src/ops/fused.cc                             | 11 +++++-----
 src/ops/fused.cpp                            |  3 +--
 src/ops/group_by.cc                          | 11 +++++-----
 src/ops/gumbel_topk.cc                       | 12 +++++-----
 src/ops/gumbel_topk.cu                       | 23 ++++++++++----------
 src/ops/inc_multihead_self_attention.cc      |  2 +-
 src/ops/layer_norm.cc                        | 11 +++++-----
 src/ops/linear.cc                            | 11 +++++-----
 src/ops/noop.cc                              | 11 +++++-----
 src/ops/residual_layer_norm.cc               |  2 +-
 src/ops/residual_rms_norm.cc                 | 12 +++++-----
 src/ops/rms_norm.cc                          | 11 +++++-----
 src/ops/sampling.cc                          | 11 +++++-----
 src/ops/sigmoid_silu_multi.cc                |  2 +-
 src/ops/softmax.cc                           | 11 +++++-----
 src/ops/spec_inc_multihead_self_attention.cc |  2 +-
 src/ops/split.cc                             | 11 +++++-----
 src/ops/topk.cc                              | 11 +++++-----
 src/ops/tree_inc_multihead_self_attention.cc |  2 +-
 32 files changed, 150 insertions(+), 170 deletions(-)

diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc
index b2039e30c..e67038090 100644
--- a/src/ops/add_bias_residual_layer_norm.cc
+++ b/src/ops/add_bias_residual_layer_norm.cc
@@ -526,7 +526,7 @@ void AddBiasResidualLayerNorm::backward(FFModel const &ff) {
 
 FutureMap AddBiasResidualLayerNorm::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc
index 3baf469d1..5f05458e3 100644
--- a/src/ops/aggregate.cc
+++ b/src/ops/aggregate.cc
@@ -296,12 +296,11 @@ void Aggregate::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Aggregate::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Aggregate::inference(FFModel const &ff,
+                               BatchConfigFuture const &bc,
+                               std::vector<ParallelTensor> const &batch_inputs,
+                               std::vector<ParallelTensor> const &batch_outputs,
+                               MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc
index 70f5508e0..1edd43088 100644
--- a/src/ops/aggregate_spec.cc
+++ b/src/ops/aggregate_spec.cc
@@ -264,12 +264,12 @@ void AggregateSpec::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap AggregateSpec::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap
+    AggregateSpec::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc
index 6a5248712..ebed5ab0c 100644
--- a/src/ops/arg_topk.cc
+++ b/src/ops/arg_topk.cc
@@ -287,12 +287,11 @@ void ArgTopK::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap ArgTopK::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap ArgTopK::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/arg_topk.cpp b/src/ops/arg_topk.cpp
index cc2b8e9ec..90dbb5909 100644
--- a/src/ops/arg_topk.cpp
+++ b/src/ops/arg_topk.cpp
@@ -371,17 +371,16 @@ __global__ void arg_topk_forward_kernel(T const *__restrict__ input,
 
 /*static*/
 template <typename DT>
-void ArgTopK::forward_kernel(
-    ArgTopKMeta const *m,
-    DT const *input_ptr,
-    float *output_ptr,
-    int *indices_ptr,
-    size_t batch_size,
-    int length,
-    int k,
-    bool sorted,
-    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
-    hipStream_t stream) {
+void ArgTopK::forward_kernel(ArgTopKMeta const *m,
+                             DT const *input_ptr,
+                             float *output_ptr,
+                             int *indices_ptr,
+                             size_t batch_size,
+                             int length,
+                             int k,
+                             bool sorted,
+                             BatchConfig const *bc,
+                             hipStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
   int num_shards = 0;
diff --git a/src/ops/arg_topk.cu b/src/ops/arg_topk.cu
index 7558fdbcc..0d4ea2045 100644
--- a/src/ops/arg_topk.cu
+++ b/src/ops/arg_topk.cu
@@ -83,18 +83,17 @@ __global__ void renormalize_kernel(DT *topk_values,
 
 /*static*/
 template <typename DT>
-void ArgTopK::forward_kernel(
-    ArgTopKMeta *m,
-    DT const *input_ptr,
-    DT *output_ptr,
-    int *indices_ptr,
-    size_t batch_size,
-    int length,
-    int k,
-    bool sorted,
-    bool renormalize,
-    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
-    cudaStream_t stream) {
+void ArgTopK::forward_kernel(ArgTopKMeta *m,
+                             DT const *input_ptr,
+                             DT *output_ptr,
+                             int *indices_ptr,
+                             size_t batch_size,
+                             int length,
+                             int k,
+                             bool sorted,
+                             bool renormalize,
+                             BatchConfig const *bc,
+                             cudaStream_t stream) {
   assert(bc->num_active_requests() >= 0);
   if (m->device_resources.find(stream) == m->device_resources.end()) {
     m->device_resources[stream] = new raft::device_resources(stream);
diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc
index fc6fec5b9..0524defce 100644
--- a/src/ops/argmax.cc
+++ b/src/ops/argmax.cc
@@ -260,12 +260,11 @@ void ArgMax::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap ArgMax::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap ArgMax::inference(FFModel const &ff,
+                            BatchConfigFuture const &bc,
+                            std::vector<ParallelTensor> const &batch_inputs,
+                            std::vector<ParallelTensor> const &batch_outputs,
+                            MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/attention.cc b/src/ops/attention.cc
index 37f971668..97afc9434 100644
--- a/src/ops/attention.cc
+++ b/src/ops/attention.cc
@@ -577,7 +577,7 @@ void MultiHeadAttention::forward(FFModel const &ff) {
 
 FutureMap MultiHeadAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/cast.cc b/src/ops/cast.cc
index 701f407de..e514236a3 100644
--- a/src/ops/cast.cc
+++ b/src/ops/cast.cc
@@ -226,12 +226,11 @@ void Cast::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Cast::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Cast::inference(FFModel const &ff,
+                          BatchConfigFuture const &bc,
+                          std::vector<ParallelTensor> const &batch_inputs,
+                          std::vector<ParallelTensor> const &batch_outputs,
+                          MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc
index 921800da1..4352f459b 100644
--- a/src/ops/element_binary.cc
+++ b/src/ops/element_binary.cc
@@ -540,12 +540,12 @@ void ElementBinary::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap ElementBinary::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap
+    ElementBinary::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc
index b657cf657..0e1d11555 100644
--- a/src/ops/element_unary.cc
+++ b/src/ops/element_unary.cc
@@ -420,12 +420,12 @@ void ElementUnary::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap ElementUnary::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap
+    ElementUnary::inference(FFModel const &ff,
+                            BatchConfigFuture const &bc,
+                            std::vector<ParallelTensor> const &batch_inputs,
+                            std::vector<ParallelTensor> const &batch_outputs,
+                            MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc
index 4e7b01dca..3cc8ceea0 100644
--- a/src/ops/embedding.cc
+++ b/src/ops/embedding.cc
@@ -455,12 +455,11 @@ void Embedding::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Embedding::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Embedding::inference(FFModel const &ff,
+                               BatchConfigFuture const &bc,
+                               std::vector<ParallelTensor> const &batch_inputs,
+                               std::vector<ParallelTensor> const &batch_outputs,
+                               MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/experts.cc b/src/ops/experts.cc
index bbcbcda91..8c66f9c7b 100644
--- a/src/ops/experts.cc
+++ b/src/ops/experts.cc
@@ -668,12 +668,11 @@ void Experts::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Experts::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Experts::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/fused.cc b/src/ops/fused.cc
index d95cf1469..a22873847 100644
--- a/src/ops/fused.cc
+++ b/src/ops/fused.cc
@@ -516,12 +516,11 @@ void FusedOp::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap FusedOp::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap FusedOp::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   // Set iter_config
   iter_config = ff.iter_config;
   ArgumentMap argmap;
diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp
index 8b39b8b37..6111a8fd0 100644
--- a/src/ops/fused.cpp
+++ b/src/ops/fused.cpp
@@ -525,8 +525,7 @@ __host__ void
   // const FusedOp* fused = (FusedOp*) task->args;
   FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args);
   FusedOp const *fused = metas->fused_op;
-  /* Reserved: BatchConfig Updated */ BatchConfig const *bc =
-      BatchConfig::from_future(task->futures[0]);
+  BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     return;
   }
diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc
index bb254a714..f2f402737 100644
--- a/src/ops/group_by.cc
+++ b/src/ops/group_by.cc
@@ -321,12 +321,11 @@ void Group_by::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Group_by::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Group_by::inference(FFModel const &ff,
+                              BatchConfigFuture const &bc,
+                              std::vector<ParallelTensor> const &batch_inputs,
+                              std::vector<ParallelTensor> const &batch_outputs,
+                              MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/gumbel_topk.cc b/src/ops/gumbel_topk.cc
index 99244efb2..fb7f8a978 100644
--- a/src/ops/gumbel_topk.cc
+++ b/src/ops/gumbel_topk.cc
@@ -296,12 +296,12 @@ void GumbelTopK::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap GumbelTopK::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap
+    GumbelTopK::inference(FFModel const &ff,
+                          BatchConfigFuture const &bc,
+                          std::vector<ParallelTensor> const &batch_inputs,
+                          std::vector<ParallelTensor> const &batch_outputs,
+                          MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/gumbel_topk.cu b/src/ops/gumbel_topk.cu
index 3635fda9c..1af6c5eab 100644
--- a/src/ops/gumbel_topk.cu
+++ b/src/ops/gumbel_topk.cu
@@ -430,18 +430,17 @@ __global__ void
 
 /*static*/
 template <typename DT>
-void GumbelTopK::forward_kernel(
-    GumbelTopKMeta const *m,
-    DT const *input_ptr,
-    float *log_probs_ptr,
-    float *perturbed_log_probs_ptr,
-    int *indices_ptr,
-    size_t batch_size,
-    int length,
-    int k,
-    bool sorted,
-    /* Reserved: BatchConfig Updated */ BatchConfig const *bc,
-    cudaStream_t stream) {
+void GumbelTopK::forward_kernel(GumbelTopKMeta const *m,
+                                DT const *input_ptr,
+                                float *log_probs_ptr,
+                                float *perturbed_log_probs_ptr,
+                                int *indices_ptr,
+                                size_t batch_size,
+                                int length,
+                                int k,
+                                bool sorted,
+                                BatchConfig const *bc,
+                                cudaStream_t stream) {
   // Adopted from TensorFlow's ArgTopK implementation
   // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h
   int num_shards = 0;
diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc
index bfcc7dc4c..b819b4936 100644
--- a/src/ops/inc_multihead_self_attention.cc
+++ b/src/ops/inc_multihead_self_attention.cc
@@ -767,7 +767,7 @@ void IncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap IncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc
index 745e6806c..2218ffe39 100644
--- a/src/ops/layer_norm.cc
+++ b/src/ops/layer_norm.cc
@@ -438,12 +438,11 @@ void LayerNorm::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap LayerNorm::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap LayerNorm::inference(FFModel const &ff,
+                               BatchConfigFuture const &bc,
+                               std::vector<ParallelTensor> const &batch_inputs,
+                               std::vector<ParallelTensor> const &batch_outputs,
+                               MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/linear.cc b/src/ops/linear.cc
index 6f01cf431..0c7a0f78f 100644
--- a/src/ops/linear.cc
+++ b/src/ops/linear.cc
@@ -555,12 +555,11 @@ void Linear::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap Linear::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Linear::inference(FFModel const &ff,
+                            BatchConfigFuture const &bc,
+                            std::vector<ParallelTensor> const &batch_inputs,
+                            std::vector<ParallelTensor> const &batch_outputs,
+                            MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/noop.cc b/src/ops/noop.cc
index a4b3222e7..da2d4922e 100644
--- a/src/ops/noop.cc
+++ b/src/ops/noop.cc
@@ -258,12 +258,11 @@ void NoOp::init(FFModel const &ff) {
 
 void NoOp::forward(FFModel const &ff) {}
 
-FutureMap NoOp::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap NoOp::inference(FFModel const &ff,
+                          BatchConfigFuture const &bc,
+                          std::vector<ParallelTensor> const &batch_inputs,
+                          std::vector<ParallelTensor> const &batch_outputs,
+                          MachineView const *mv) {
   FutureMap empty;
   return empty;
 }
diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc
index be4ac4833..ed9252c30 100644
--- a/src/ops/residual_layer_norm.cc
+++ b/src/ops/residual_layer_norm.cc
@@ -536,7 +536,7 @@ Op *ResidualLayerNorm::materialize(FFModel &ff,
 
 FutureMap ResidualLayerNorm::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc
index 9cea6421f..713486268 100644
--- a/src/ops/residual_rms_norm.cc
+++ b/src/ops/residual_rms_norm.cc
@@ -363,12 +363,12 @@ void ResidualRMSNorm::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap ResidualRMSNorm::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap
+    ResidualRMSNorm::inference(FFModel const &ff,
+                               BatchConfigFuture const &bc,
+                               std::vector<ParallelTensor> const &batch_inputs,
+                               std::vector<ParallelTensor> const &batch_outputs,
+                               MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc
index 3070368ff..bf07ee6bb 100644
--- a/src/ops/rms_norm.cc
+++ b/src/ops/rms_norm.cc
@@ -339,12 +339,11 @@ void RMSNorm::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap RMSNorm::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap RMSNorm::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc
index d273780c2..92db9a958 100644
--- a/src/ops/sampling.cc
+++ b/src/ops/sampling.cc
@@ -246,12 +246,11 @@ void Sampling::forward(FFModel const &ff) {
   assert(false);
 }
 
-FutureMap Sampling::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Sampling::inference(FFModel const &ff,
+                              BatchConfigFuture const &bc,
+                              std::vector<ParallelTensor> const &batch_inputs,
+                              std::vector<ParallelTensor> const &batch_outputs,
+                              MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index 9d1261123..b39a424c6 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -293,7 +293,7 @@ void SigmoidSiluMulti::backward(FFModel const &ff) {
 
 FutureMap SigmoidSiluMulti::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index decffaaad..4c94f3e5a 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -278,12 +278,11 @@ OpMeta *Softmax::init_task(Task const *task,
   return m;
 }
 
-FutureMap Softmax::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Softmax::inference(FFModel const &ff,
+                             BatchConfigFuture const &bc,
+                             std::vector<ParallelTensor> const &batch_inputs,
+                             std::vector<ParallelTensor> const &batch_outputs,
+                             MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc
index 2e5cc9fa7..421780dd4 100644
--- a/src/ops/spec_inc_multihead_self_attention.cc
+++ b/src/ops/spec_inc_multihead_self_attention.cc
@@ -710,7 +710,7 @@ void SpecIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap SpecIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {
diff --git a/src/ops/split.cc b/src/ops/split.cc
index f3deadd80..7c6b631b2 100644
--- a/src/ops/split.cc
+++ b/src/ops/split.cc
@@ -249,12 +249,11 @@ void Split::forward(FFModel const &ff) {
   }
   runtime->execute_index_space(ctx, launcher);
 }
-FutureMap Split::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap Split::inference(FFModel const &ff,
+                           BatchConfigFuture const &bc,
+                           std::vector<ParallelTensor> const &batch_inputs,
+                           std::vector<ParallelTensor> const &batch_outputs,
+                           MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/topk.cc b/src/ops/topk.cc
index b8d53fb24..7d30a8aff 100644
--- a/src/ops/topk.cc
+++ b/src/ops/topk.cc
@@ -269,12 +269,11 @@ void TopK::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
-FutureMap TopK::inference(
-    FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
-    std::vector<ParallelTensor> const &batch_inputs,
-    std::vector<ParallelTensor> const &batch_outputs,
-    MachineView const *mv) {
+FutureMap TopK::inference(FFModel const &ff,
+                          BatchConfigFuture const &bc,
+                          std::vector<ParallelTensor> const &batch_inputs,
+                          std::vector<ParallelTensor> const &batch_outputs,
+                          MachineView const *mv) {
   ArgumentMap argmap;
   Context ctx = ff.config.lg_ctx;
   Runtime *runtime = ff.config.lg_hlr;
diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc
index 0e1c83b6e..a69bf61b1 100644
--- a/src/ops/tree_inc_multihead_self_attention.cc
+++ b/src/ops/tree_inc_multihead_self_attention.cc
@@ -750,7 +750,7 @@ void TreeIncMultiHeadSelfAttention::forward(FFModel const &ff) {
 
 FutureMap TreeIncMultiHeadSelfAttention::inference(
     FFModel const &ff,
-    /* Reserved: BatchConfig Updated */ BatchConfigFuture const &bc,
+    BatchConfigFuture const &bc,
     std::vector<ParallelTensor> const &batch_inputs,
     std::vector<ParallelTensor> const &batch_outputs,
     MachineView const *mv) {