From 05f57b0026960cbd87e5ed7a40a554151a0c22a7 Mon Sep 17 00:00:00 2001
From: Chinmaya Andukuri <andukuri@stanford.edu>
Date: Tue, 12 Nov 2024 22:30:49 -0800
Subject: [PATCH 1/2] support large vocab sizes (e.g. llama3) for
 DraftRetriever datastore

---
 DraftRetriever/src/lib.rs | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/DraftRetriever/src/lib.rs b/DraftRetriever/src/lib.rs
index 024295f..d251a5b 100644
--- a/DraftRetriever/src/lib.rs
+++ b/DraftRetriever/src/lib.rs
@@ -1,4 +1,4 @@
-// The code for retrival is adapted from https://github.com/Intsights/PySubstringSearch; 
+// The code for retrieval is adapted from https://github.com/Intsights/PySubstringSearch;
 // The code for drafft buffer is adapted from https://github.com/FasterDecoding/Medusa/blob/main/medusa/model/utils.py#L31-L124
 use ahash::AHashSet;
 use byteorder::{ReadBytesExt, WriteBytesExt, ByteOrder, LittleEndian};
@@ -111,10 +111,11 @@ impl Writer {
             return Ok(());
         }
 
-        self.index_file.write_u32::<LittleEndian>((self.buffer.len() * 2) as u32)?;
+        self.index_file.write_u32::<LittleEndian>((self.buffer.len() * 4) as u32)?; // self.buffer.len() is the length of the buffer (in # of integers). This is variable because sometimes we dump_data() early, its not always self.buffer.capacity().
+        // * 4 because this value will actually tell us how much space is needed for this buffer in file, and we store each as 4 bytes
 
         for &item in &self.buffer {
-            self.index_file.write_u16::<LittleEndian>(item as u16)?;
+            self.index_file.write_i32::<LittleEndian>(item as i32)?;
         }
 
         let suffix_array = construct_suffix_array(&self.buffer, self.vocab_size);
@@ -127,6 +128,10 @@ impl Writer {
         Ok(())
     }
 
+    // Some personal notes:
+    // (1) max_chunk_len can be whatever we want it to be, but the draftretriever Reader() works fastest when we choose something large
+    // (2) vocab_size should be the size of the vocabulary + 1. This is used in the suffix array construction.
+
     fn finalize(
         &mut self,
     ) -> PyResult<()> {
@@ -188,8 +193,9 @@ impl Reader {
 
             let mut data: Vec<i32> = Vec::new();
 
-            for i in (0..data_u8.len()).step_by(2) {
-                let int = LittleEndian::read_u16(&data_u8[i..i+2]) as i32;
+            // Step by 4 to read in each 4-byte int (i32) from index file
+            for i in (0..data_u8.len()).step_by(4) {
+                let int = LittleEndian::read_i32(&data_u8[i..i+4]) as i32;
                 data.push(int);
             }
 
@@ -259,7 +265,7 @@ impl Reader {
                 if start_of_indices.is_none() {
                     return;
                 }
-                
+
                 // this binary search finds the end of the matching suffixes
                 let mut right_anchor = sub_index.suffixes_file_end - 4;
                 while left_anchor <= right_anchor {
@@ -300,7 +306,7 @@ impl Reader {
                     let data_index = LittleEndian::read_i32(suffix);
                     if matches_ranges.insert(data_index) {
                         let sub_string_plus = &sub_index.data[data_index as usize + substring_i32.len() ..std::cmp::min(data_index as usize + substring_i32.len() + long as usize,  sub_index.data.len())];
-                    
+
                         local_results.push(sub_string_plus.to_vec());
                         cnt += 1;
                         if cnt >= k as usize {
@@ -328,7 +334,7 @@ impl Reader {
                 *counter += 1;
             }
         }
-        
+
         let choices = choices.unwrap_or(64);
         // The items in the heap must be a Trie.
         let mut heap = BinaryHeap::new();
@@ -348,7 +354,7 @@ impl Reader {
         let verified: Vec<_> = verified.into_iter().collect();
 
         // Because multiple nodes in the Trie may have same weights around the threshold, the number of draft tokens may exceed choices
-        // We roughly cut nodes to be less than choices in most cases. 
+        // We roughly cut nodes to be less than choices in most cases.
         let paths = cut_to_choices(verified, choices);
 
         let (draft_choices, max_branch) = get_draft_choices(paths.clone());
@@ -562,4 +568,3 @@ fn draftretriever(
 
     Ok(())
 }
-

From 9b9cebde70ca98dbc4d381d2c8831e421bb9c3a0 Mon Sep 17 00:00:00 2001
From: Chinmaya Andukuri <andukuri@stanford.edu>
Date: Tue, 12 Nov 2024 22:38:08 -0800
Subject: [PATCH 2/2] updates comments to explain implementation changes

---
 DraftRetriever/src/lib.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/DraftRetriever/src/lib.rs b/DraftRetriever/src/lib.rs
index d251a5b..96f796b 100644
--- a/DraftRetriever/src/lib.rs
+++ b/DraftRetriever/src/lib.rs
@@ -67,6 +67,7 @@ impl Writer {
         let index_file = File::create(index_file_path)?;
         let index_file = BufWriter::new(index_file);
 
+        // max_chunk_len can be any value, but the DraftRetriever Reader seems to work fastest when it is large (e.g. 2^27).
         let max_chunk_len = max_chunk_len.unwrap_or(512 * 1024 * 1024);
         let vocab_size = vocab_size.unwrap_or(35000);
 
@@ -114,6 +115,8 @@ impl Writer {
         self.index_file.write_u32::<LittleEndian>((self.buffer.len() * 4) as u32)?; // self.buffer.len() is the length of the buffer (in # of integers). This is variable because sometimes we dump_data() early, its not always self.buffer.capacity().
         // * 4 because this value will actually tell us how much space is needed for this buffer in file, and we store each as 4 bytes
 
+        // For larger vocabularies (more than 65,535 tokens), token ids do not fit in a u16, so each one is written as an i32.
+        // i32 is kept instead of u32 so that negative values can be used as pad tokens (e.g. pad_path(path, max_length, -2)).
         for &item in &self.buffer {
             self.index_file.write_i32::<LittleEndian>(item as i32)?;
         }
@@ -128,10 +131,6 @@ impl Writer {
         Ok(())
     }
 
-    // Some personal notes:
-    // (1) max_chunk_len can be whatever we want it to be, but the draftretriever Reader() works fastest when we choose something large
-    // (2) vocab_size should be the size of the vocabulary + 1. This is used in the suffix array construction.
-
     fn finalize(
         &mut self,
     ) -> PyResult<()> {