diff --git a/CMakeLists.txt b/CMakeLists.txt index 622f97a..6f7c933 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported/expected torch versions for CUDA. -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") # # Try to find python package with an executable that exactly matches @@ -193,38 +193,45 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c + GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() -# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization. -set(VLLM_PARENT_BUILD ON) - -# Ensure the gllm/vllm_flash_attn directory exists before installation -install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c) +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" ALL_COMPONENTS) # Make sure vllm-flash-attn install rules are nested under gllm/ -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c) -install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" COMPONENT vllm_flash_attn_c) +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. 
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" ALL_COMPONENTS) # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") # Restore the install prefix -install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c) -install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) -# Copy over the vllm-flash-attn python files +# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in +# case only one is built, in the case both are built redundant work is done) install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION gllm/vllm_flash_attn - COMPONENT vllm_flash_attn_c - FILES_MATCHING PATTERN "*.py" + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION gllm/vllm_flash_attn + COMPONENT _vllm_fa2_C + FILES_MATCHING PATTERN "*.py" ) -# Nothing after vllm-flash-attn, see comment about macros above +# install( +# DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ +# DESTINATION gllm/vllm_flash_attn +# COMPONENT _vllm_fa3_C +# FILES_MATCHING PATTERN "*.py" +# ) \ No newline at end of file diff --git a/README.md b/README.md index dd794b3..3369745 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Integreted with features like **continuous batching**, **paged attention**, **ch ## Install gLLM ``` -pip install torch==2.5.1 +pip install torch==2.7.0 pip install -v -e . 
``` diff --git a/gllm/input_data.py b/gllm/input_data.py index 50a8895..f18c28d 100644 --- a/gllm/input_data.py +++ b/gllm/input_data.py @@ -53,12 +53,10 @@ def get_position(self): def get_seq_len_loc(self): max_seqlen = 0 - cu_seqs_len_num = 0 - seq_start_loc = [0] + seq_start_loc = [] for seq in self.seqs: seq_len = seq.computed_token_num + seq.to_compute_token_num - cu_seqs_len_num += seq_len - seq_start_loc.append(cu_seqs_len_num) + seq_start_loc.append(seq_len) max_seqlen = max(seq_len, max_seqlen) return max_seqlen, async_tensor_h2d(seq_start_loc, torch.int32, 'cuda', True) diff --git a/gllm/layers/attention.py b/gllm/layers/attention.py index 5a6ba0e..976b46a 100644 --- a/gllm/layers/attention.py +++ b/gllm/layers/attention.py @@ -41,7 +41,7 @@ def forward(self, v_cache, cu_seqlens_q=input_data.query_start_loc, max_seqlen_q=input_data.max_query_len, - cu_seqlens_k=input_data.seq_start_loc, + seqused_k=input_data.seq_start_loc, max_seqlen_k=input_data.max_seq_len, softmax_scale=self.scaling, causal=True, diff --git a/pyproject.toml b/pyproject.toml index 1431240..70f5afa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.5.1", + "torch == 2.7.0", "wheel", "jinja2", ] diff --git a/requirements.txt b/requirements.txt index 209488e..dac505b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' uvicorn[standard] openai >= 1.52.0 logger -torch==2.5.1 +torch==2.7.0 zmq aiohttp datasets diff --git a/setup.py b/setup.py index dc1f475..16f25dc 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ class cmake_build_ext(build_ext): def compute_num_jobs(self): # `num_jobs` is either the value of the MAX_JOBS environment variable # (if defined) or the number of CPUs available. 
- num_jobs = None + num_jobs = os.environ.get('MAX_JOBS') if num_jobs is not None: num_jobs = int(num_jobs) logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) @@ -309,9 +309,12 @@ def _read_requirements(filename: str) -> List[str]: ext_modules.append(CMakeExtension(name="gllm._C")) ext_modules.append(CMakeExtension(name="gllm._moe_C")) -ext_modules.append( - CMakeExtension(name="gllm.vllm_flash_attn.vllm_flash_attn_c")) +ext_modules.append(CMakeExtension(name="gllm.vllm_flash_attn._vllm_fa2_C")) +# if get_nvcc_cuda_version() >= Version("12.3"): +# # FA3 requires CUDA 12.3 or later +# ext_modules.append( +# CMakeExtension(name="gllm.vllm_flash_attn._vllm_fa3_C")) setup( name="gllm",