Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")


# Supported/expected torch versions for CUDA.
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")

#
# Try to find python package with an executable that exactly matches
Expand Down Expand Up @@ -193,38 +193,45 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()

# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)

# Ensure the gllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
# Ensure the gllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/gllm/vllm_flash_attn\")" ALL_COMPONENTS)

# Make sure vllm-flash-attn install rules are nested under gllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" COMPONENT vllm_flash_attn_c)
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements are harmless when neither component is installed.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/gllm/\")" ALL_COMPONENTS)

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)

# Copy over the vllm-flash-attn python files
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3 in
# case only one is built; if both are built, some redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION gllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION gllm/vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)

# Nothing after vllm-flash-attn, see comment about macros above
# install(
# DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
# DESTINATION gllm/vllm_flash_attn
# COMPONENT _vllm_fa3_C
# FILES_MATCHING PATTERN "*.py"
# )
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Integreted with features like **continuous batching**, **paged attention**, **ch

## Install gLLM
```
pip install torch==2.5.1
pip install torch==2.7.0
pip install -v -e .
```

Expand Down
6 changes: 2 additions & 4 deletions gllm/input_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,10 @@ def get_position(self):

def get_seq_len_loc(self):
max_seqlen = 0
cu_seqs_len_num = 0
seq_start_loc = [0]
seq_start_loc = []
for seq in self.seqs:
seq_len = seq.computed_token_num + seq.to_compute_token_num
cu_seqs_len_num += seq_len
seq_start_loc.append(cu_seqs_len_num)
seq_start_loc.append(seq_len)
max_seqlen = max(seq_len, max_seqlen)
return max_seqlen, async_tensor_h2d(seq_start_loc, torch.int32, 'cuda', True)

Expand Down
2 changes: 1 addition & 1 deletion gllm/layers/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def forward(self,
v_cache,
cu_seqlens_q=input_data.query_start_loc,
max_seqlen_q=input_data.max_query_len,
cu_seqlens_k=input_data.seq_start_loc,
seqused_k=input_data.seq_start_loc,
max_seqlen_k=input_data.max_seq_len,
softmax_scale=self.scaling,
causal=True,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ requires = [
"packaging",
"setuptools>=61",
"setuptools-scm>=8.0",
"torch == 2.5.1",
"torch == 2.7.0",
"wheel",
"jinja2",
]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
uvicorn[standard]
openai >= 1.52.0
logger
torch==2.5.1
torch==2.7.0
zmq
aiohttp
datasets
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class cmake_build_ext(build_ext):
def compute_num_jobs(self):
# `num_jobs` is either the value of the MAX_JOBS environment variable
# (if defined) or the number of CPUs available.
num_jobs = None
num_jobs = os.environ['MAX_JOBS']
if num_jobs is not None:
num_jobs = int(num_jobs)
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
Expand Down Expand Up @@ -309,9 +309,12 @@ def _read_requirements(filename: str) -> List[str]:

ext_modules.append(CMakeExtension(name="gllm._C"))
ext_modules.append(CMakeExtension(name="gllm._moe_C"))
ext_modules.append(
CMakeExtension(name="gllm.vllm_flash_attn.vllm_flash_attn_c"))
ext_modules.append(CMakeExtension(name="gllm.vllm_flash_attn._vllm_fa2_C"))

# if get_nvcc_cuda_version() >= Version("12.3"):
# # FA3 requires CUDA 12.3 or later
# ext_modules.append(
# CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))

setup(
name="gllm",
Expand Down