-
Notifications
You must be signed in to change notification settings - Fork 14.4k
vulkan: Warptile tuning for Intel Xe2/Xe3 #18178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
0f82d0a
c908711
3441283
6b7f1e8
0cfa616
c40396a
dc14792
328116e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2838,18 +2838,25 @@ static void ggml_vk_load_shaders(vk_device& device) { | |
| m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; | ||
| s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; | ||
|
|
||
| l_mmq_wg_denoms = l_wg_denoms = { 128, 128, 1 }; | ||
| m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; | ||
| s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; | ||
| l_align = 128; | ||
| m_align = 64; | ||
| s_align = 32; | ||
|
|
||
| // chip specific tuning | ||
| if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { | ||
| m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; | ||
| m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; | ||
| } | ||
|
|
||
| l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; | ||
| m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; | ||
| s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; | ||
| l_align = 128; | ||
| m_align = 64; | ||
| s_align = 32; | ||
| else if (device->vendor_id == VK_VENDOR_ID_INTEL) { | ||
| if (device->coopmat_support && device->architecture == INTEL_XE2) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's stick to one if until we actually have more Intel cases that would justify a secondary branch. |
||
| // Xe2/Xe3 with coopmat enabled - warptile performance tuning | ||
| l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; | ||
| l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; | ||
| } | ||
| } | ||
|
|
||
| for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) { | ||
| ggml_type t = (ggml_type)i; | ||
|
|
@@ -3521,6 +3528,13 @@ static void ggml_vk_load_shaders(vk_device& device) { | |
| m_wg_denoms = { 64, 64, 1 }; | ||
| s_wg_denoms = { 32, 32, 1 }; | ||
|
|
||
| if (device->vendor_id == VK_VENDOR_ID_INTEL) { | ||
| if (device->architecture == INTEL_XE2) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above. |
||
| // Xe2/Xe3 - bf16 warptile performance tuning | ||
| l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 }; | ||
| } | ||
| } | ||
|
|
||
| CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); | ||
| CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4, _id, 0); | ||
| } | ||
|
|
@@ -4868,10 +4882,15 @@ static vk_device ggml_vk_get_device(size_t idx) { | |
| #ifndef GGML_VULKAN_RUN_TESTS | ||
| case VK_VENDOR_ID_AMD: | ||
| case VK_VENDOR_ID_INTEL: | ||
| device->mul_mat_l[i] = false; | ||
| if (!device->coopmat_support || device->architecture != INTEL_XE2) { | ||
| device->mul_mat_l[i] = false; | ||
| device->mul_mat_id_l[i] = false; | ||
| } else { | ||
| device->mul_mat_l[i] = true; // if coopmat & XE2+, allow large matmul warptile config for Intel | ||
| device->mul_mat_id_l[i] = true; | ||
| } | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're now diverging between Intel and AMD, please copy the original AMD code to the top and separate them from Intel. |
||
| device->mul_mat_m[i] = true; | ||
| device->mul_mat_s[i] = true; | ||
| device->mul_mat_id_l[i] = false; | ||
| device->mul_mat_id_m[i] = true; | ||
| device->mul_mat_id_s[i] = true; | ||
| break; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see a reason this needs to be set earlier, so please leave it as is.