Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")


# Supported/expected torch versions for CUDA.
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")

#
# Try to find python package with an executable that exactly matches
Expand Down Expand Up @@ -193,38 +193,45 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()

# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)

# Ensure the gllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
# Ensure the gllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" ALL_COMPONENTS)

# Make sure vllm-flash-attn install rules are nested under gllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" COMPONENT vllm_flash_attn_c)
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements are harmless when neither component is installed.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" ALL_COMPONENTS)

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

# Copy over the vllm-flash-attn python files
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3 in
# case only one is built; if both are built, some redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION gllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION gllm/vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)

# Nothing after vllm-flash-attn, see comment about macros above
# install(
# DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
# DESTINATION gllm/vllm_flash_attn
# COMPONENT _vllm_fa3_C
# FILES_MATCHING PATTERN "*.py"
# )
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Integreted with features like **continuous batching**, **paged attention**, **ch

## Install gLLM
```
pip install torch==2.5.1
pip install torch==2.7.0
pip install -v -e .
```

Expand Down
6 changes: 2 additions & 4 deletions gllm/input_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,10 @@ def get_position(self):

def get_seq_len_loc(self):
max_seqlen = 0
cu_seqs_len_num = 0
seq_start_loc = [0]
seq_start_loc = []
for seq in self.seqs:
seq_len = seq.computed_token_num + seq.to_compute_token_num
cu_seqs_len_num += seq_len
seq_start_loc.append(cu_seqs_len_num)
seq_start_loc.append(seq_len)
max_seqlen = max(seq_len, max_seqlen)
return max_seqlen, async_tensor_h2d(seq_start_loc, torch.int32, 'cuda', True)

Expand Down
2 changes: 1 addition & 1 deletion gllm/layers/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def forward(self,
v_cache,
cu_seqlens_q=input_data.query_start_loc,
max_seqlen_q=input_data.max_query_len,
cu_seqlens_k=input_data.seq_start_loc,
seqused_k=input_data.seq_start_loc,
max_seqlen_k=input_data.max_seq_len,
softmax_scale=self.scaling,
causal=True,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ requires = [
"packaging",
"setuptools>=61",
"setuptools-scm>=8.0",
"torch == 2.5.1",
"torch == 2.7.0",
"wheel",
"jinja2",
]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
uvicorn[standard]
openai >= 1.52.0
logger
torch==2.5.1
torch==2.7.0
zmq
aiohttp
datasets
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class cmake_build_ext(build_ext):
def compute_num_jobs(self):
# `num_jobs` is either the value of the MAX_JOBS environment variable
# (if defined) or the number of CPUs available.
num_jobs = None
num_jobs = os.environ['MAX_JOBS']
if num_jobs is not None:
num_jobs = int(num_jobs)
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
Expand Down Expand Up @@ -309,9 +309,12 @@ def _read_requirements(filename: str) -> List[str]:

ext_modules.append(CMakeExtension(name="gllm._C"))
ext_modules.append(CMakeExtension(name="gllm._moe_C"))
ext_modules.append(
CMakeExtension(name="gllm.vllm_flash_attn.vllm_flash_attn_c"))
ext_modules.append(CMakeExtension(name="gllm.vllm_flash_attn._vllm_fa2_C"))

# if get_nvcc_cuda_version() >= Version("12.3"):
# # FA3 requires CUDA 12.3 or later
# ext_modules.append(
# CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))

setup(
name="gllm",
Expand Down