From 05f57b0026960cbd87e5ed7a40a554151a0c22a7 Mon Sep 17 00:00:00 2001 From: Chinmaya Andukuri Date: Tue, 12 Nov 2024 22:30:49 -0800 Subject: [PATCH 1/2] support large vocab sizes (i.e. llama3) for DraftRetriever datastore --- DraftRetriever/src/lib.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/DraftRetriever/src/lib.rs b/DraftRetriever/src/lib.rs index 024295f..d251a5b 100644 --- a/DraftRetriever/src/lib.rs +++ b/DraftRetriever/src/lib.rs @@ -1,4 +1,4 @@ -// The code for retrival is adapted from https://github.com/Intsights/PySubstringSearch; +// The code for retrival is adapted from https://github.com/Intsights/PySubstringSearch; // The code for drafft buffer is adapted from https://github.com/FasterDecoding/Medusa/blob/main/medusa/model/utils.py#L31-L124 use ahash::AHashSet; use byteorder::{ReadBytesExt, WriteBytesExt, ByteOrder, LittleEndian}; @@ -111,10 +111,11 @@ impl Writer { return Ok(()); } - self.index_file.write_u32::((self.buffer.len() * 2) as u32)?; + self.index_file.write_u32::((self.buffer.len() * 4) as u32)?; // self.buffer.len() is the length of the buffer (in # of integers). This is variable because sometimes we dump_data() early, its not always self.buffer.capacity(). + // * 4 because this value will actually tell us how much space is needed for this buffer in file, and we store each as 4 bytes for &item in &self.buffer { - self.index_file.write_u16::(item as u16)?; + self.index_file.write_i32::(item as i32)?; } let suffix_array = construct_suffix_array(&self.buffer, self.vocab_size); @@ -127,6 +128,10 @@ impl Writer { Ok(()) } + // Some personal notes: + // (1) max_chunk_len can be whatever we want it to be, but the draftretriever Reader() works fastest when we choose something large + // (2) vocab_size should be the size of the vocabulary + 1. This is used in the suffix array construction. + fn finalize( &mut self, ) -> PyResult<()> { @@ -188,8 +193,9 @@ impl Reader { let mut data: Vec = Vec::new(); - for i in (0..data_u8.len()).step_by(2) { - let int = LittleEndian::read_u16(&data_u8[i..i+2]) as i32; + // Step by 4 to read in each 4-byte int (i32) from index file + for i in (0..data_u8.len()).step_by(4) { + let int = LittleEndian::read_i32(&data_u8[i..i+4]) as i32; data.push(int); } @@ -259,7 +265,7 @@ impl Reader { if start_of_indices.is_none() { return; } - + // this binary search finds the end of the matching suffixes let mut right_anchor = sub_index.suffixes_file_end - 4; while left_anchor <= right_anchor { @@ -300,7 +306,7 @@ impl Reader { let data_index = LittleEndian::read_i32(suffix); if matches_ranges.insert(data_index) { let sub_string_plus = &sub_index.data[data_index as usize + substring_i32.len() ..std::cmp::min(data_index as usize + substring_i32.len() + long as usize, sub_index.data.len())]; - + local_results.push(sub_string_plus.to_vec()); cnt += 1; if cnt >= k as usize { @@ -328,7 +334,7 @@ impl Reader { *counter += 1; } } - + let choices = choices.unwrap_or(64); // The items in the heap must be a Trie. let mut heap = BinaryHeap::new(); @@ -348,7 +354,7 @@ impl Reader { let verified: Vec<_> = verified.into_iter().collect(); // Because multiple nodes in the Trie may have same weights around the threshold, the number of draft tokens may exceed choices - // We roughly cut nodes to be less than choices in most cases. + // We roughly cut nodes to be less than choices in most cases. let paths = cut_to_choices(verified, choices); let (draft_choices, max_branch) = get_draft_choices(paths.clone()); @@ -562,4 +568,3 @@ fn draftretriever( Ok(()) } - From 9b9cebde70ca98dbc4d381d2c8831e421bb9c3a0 Mon Sep 17 00:00:00 2001 From: Chinmaya Andukuri Date: Tue, 12 Nov 2024 22:38:08 -0800 Subject: [PATCH 2/2] updates comments to explain implementation changes --- DraftRetriever/src/lib.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/DraftRetriever/src/lib.rs b/DraftRetriever/src/lib.rs index d251a5b..96f796b 100644 --- a/DraftRetriever/src/lib.rs +++ b/DraftRetriever/src/lib.rs @@ -67,6 +67,7 @@ impl Writer { let index_file = File::create(index_file_path)?; let index_file = BufWriter::new(index_file); + // max_chunk_len can be whatever we want it to be, but the draftretriever Reader() seems to work fastest when we choose something large (i.e. 2e27) let max_chunk_len = max_chunk_len.unwrap_or(512 * 1024 * 1024); let vocab_size = vocab_size.unwrap_or(35000); @@ -114,6 +115,8 @@ impl Writer { self.index_file.write_u32::((self.buffer.len() * 4) as u32)?; // self.buffer.len() is the length of the buffer (in # of integers). This is variable because sometimes we dump_data() early, its not always self.buffer.capacity(). // * 4 because this value will actually tell us how much space is needed for this buffer in file, and we store each as 4 bytes + // For larger vocabularies (ie > 65,535), we should write the integers as i32 instead of u16 + // Keeping i32 instead of u32 so negative values can be used as pad tokens (i.e. pad_path(path, max_length, -2)) for &item in &self.buffer { self.index_file.write_i32::(item as i32)?; } @@ -128,10 +131,6 @@ impl Writer { Ok(()) } - // Some personal notes: - // (1) max_chunk_len can be whatever we want it to be, but the draftretriever Reader() works fastest when we choose something large - // (2) vocab_size should be the size of the vocabulary + 1. This is used in the suffix array construction. - fn finalize( &mut self, ) -> PyResult<()> {