-
Notifications
You must be signed in to change notification settings - Fork 802
Fix data race when terminating or waiting for a thread. #1924
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,6 +76,58 @@ traverse_list(bh_list *l, list_visitor visitor, void *user_data) | |
| } | ||
| } | ||
|
|
||
| /* Assumes cluster->lock is locked */ | ||
| static bool | ||
| safe_traverse_exec_env_list(WASMCluster *cluster, list_visitor visitor, | ||
| void *user_data) | ||
| { | ||
| Vector proc_nodes; | ||
| void *node; | ||
| bool ret = true; | ||
|
|
||
| if (!bh_vector_init(&proc_nodes, cluster->exec_env_list.len, sizeof(void *), | ||
| false)) { | ||
| ret = false; | ||
| goto final; | ||
| } | ||
|
|
||
| node = bh_list_first_elem(&cluster->exec_env_list); | ||
|
|
||
| while (node) { | ||
| bool already_processed = false; | ||
| void *proc_node; | ||
| for (size_t i = 0; i < bh_vector_size(&proc_nodes); i++) { | ||
| if (!bh_vector_get(&proc_nodes, i, &proc_node)) { | ||
| ret = false; | ||
| goto final; | ||
| } | ||
| if (proc_node == node) { | ||
| already_processed = true; | ||
| break; | ||
| } | ||
| } | ||
| if (already_processed) { | ||
| node = bh_list_elem_next(node); | ||
| continue; | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i guess for many (all?) of usage it's ok to visit nodes multiple times and thus it isn't worth to have this O(x^2) check.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, most likely that won't be an issue; having said that, I don't think |
||
|
|
||
| os_mutex_unlock(&cluster->lock); | ||
| visitor(node, user_data); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what keeps "node" from being freed while we release cluster->lock here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's visitor's responsibility. See |
||
| os_mutex_lock(&cluster->lock); | ||
| if (!bh_vector_append(&proc_nodes, &node)) { | ||
| ret = false; | ||
| goto final; | ||
| } | ||
|
|
||
| node = bh_list_first_elem(&cluster->exec_env_list); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will make it traversing from the first node again, nodes from the list head to current will be tested again whether they are in the vector, it is nearly O(n^2), and besides the top I think that we can check whether the most recent visited node is still in the exec_env list, if yes, we can get its next, and then continue to traverse, if not, traversing from the head of the list: static bool
safe_traverse_exec_env_list(WASMCluster *cluster, list_visitor visitor,
void *user_data)
{
Vector proc_nodes;
void *node;
bool ret = false;
if (!bh_vector_init(&proc_nodes, cluster->exec_env_list.len, sizeof(void *),
false)) {
goto final;
}
node = bh_list_first_elem(&cluster->exec_env_list);
while (node) {
void *proc_node;
bool found;
int i;
os_mutex_unlock(&cluster->lock);
visitor(node, user_data);
os_mutex_lock(&cluster->lock);
if (!bh_vector_append(&proc_nodes, &node)) {
goto final;
}
found = false;
/* Find the most recent visited and still existing node */
for (i = (int)bh_vector_size(&proc_nodes) - 1; i >= 0; i--) {
if (!bh_vector_get(&proc_nodes, i, &proc_node)) {
goto final;
}
node = bh_list_first_elem(&cluster->exec_env_list);
while (node) {
if (proc_node == node) {
found = true;
break;
}
node = bh_list_elem_next(node);
}
if (found)
break;
else {
/* The node has been removed from cluster->exec_env_list */
(void)bh_vector_remove(&proc_nodes, i, NULL);
}
}
if (found)
node = bh_list_elem_next(node);
else
node = bh_list_first_elem(&cluster->exec_env_list);
}
ret = true;
final:
bh_vector_destroy(&proc_nodes);
return ret;
}Could you help check whether it works?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is still not very efficient, seems to be O(n^2).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @wenyongh please note the critical operation in the loop is: and that in either of the implementations is executed only N times. Also, I think in the current implementation we only go beyond for loop (and therefore, goes at the beginning of the list) if the node is not in the vector, so the actual complexity is quadratic. I think there's a room for improvement (not only in the iteration itself, but e.g. by using hash set instead of vector), but I don't think this will significantly improve performance of this function.
Yes, they lock the cluster's lock internally. My suggestion is to push the fix as it is (unless you see bugs in it) to not block potential customers (please note the problem is also with the main branch, so I'm updating the PR to point to main) and we can tune the performance of the function later if needed. What do you think?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, let's merge this PR first. |
||
| } | ||
|
|
||
| final: | ||
| bh_vector_destroy(&proc_nodes); | ||
|
|
||
| return ret; | ||
| } | ||
|
|
||
| /* The caller must lock cluster->lock */ | ||
| static bool | ||
| allocate_aux_stack(WASMCluster *cluster, uint32 *start, uint32 *size) | ||
|
|
@@ -299,7 +351,6 @@ wasm_cluster_del_exec_env(WASMCluster *cluster, WASMExecEnv *exec_env) | |
| os_mutex_unlock(&cluster->debug_inst->wait_lock); | ||
| } | ||
| #endif | ||
|
|
||
| if (bh_list_remove(&cluster->exec_env_list, exec_env) != 0) | ||
| ret = false; | ||
|
|
||
|
|
@@ -724,16 +775,22 @@ wasm_cluster_join_thread(WASMExecEnv *exec_env, void **ret_val) | |
| korp_tid handle; | ||
|
|
||
| os_mutex_lock(&cluster_list_lock); | ||
| os_mutex_lock(&exec_env->cluster->lock); | ||
|
|
||
| if (!clusters_have_exec_env(exec_env) || exec_env->thread_is_detached) { | ||
| /* Invalid thread, thread has exited or thread has been detached */ | ||
| if (ret_val) | ||
| *ret_val = NULL; | ||
| os_mutex_unlock(&exec_env->cluster->lock); | ||
| os_mutex_unlock(&cluster_list_lock); | ||
| return 0; | ||
| } | ||
| exec_env->wait_count++; | ||
| handle = exec_env->handle; | ||
|
|
||
| os_mutex_unlock(&exec_env->cluster->lock); | ||
| os_mutex_unlock(&cluster_list_lock); | ||
|
|
||
| return os_thread_join(handle, ret_val); | ||
| } | ||
|
|
||
|
|
@@ -816,15 +873,22 @@ int32 | |
| wasm_cluster_cancel_thread(WASMExecEnv *exec_env) | ||
| { | ||
| os_mutex_lock(&cluster_list_lock); | ||
| os_mutex_lock(&exec_env->cluster->lock); | ||
|
|
||
| if (!exec_env->cluster) { | ||
| goto final; | ||
| } | ||
| if (!clusters_have_exec_env(exec_env)) { | ||
| /* Invalid thread or the thread has exited */ | ||
| os_mutex_unlock(&cluster_list_lock); | ||
| return 0; | ||
| goto final; | ||
| } | ||
| os_mutex_unlock(&cluster_list_lock); | ||
|
|
||
| set_thread_cancel_flags(exec_env); | ||
|
|
||
| final: | ||
| os_mutex_unlock(&exec_env->cluster->lock); | ||
| os_mutex_unlock(&cluster_list_lock); | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
|
|
@@ -846,11 +910,9 @@ wasm_cluster_terminate_all(WASMCluster *cluster) | |
| { | ||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = true; | ||
| os_mutex_unlock(&cluster->lock); | ||
|
|
||
| traverse_list(&cluster->exec_env_list, terminate_thread_visitor, NULL); | ||
| safe_traverse_exec_env_list(cluster, terminate_thread_visitor, NULL); | ||
|
|
||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = false; | ||
| os_mutex_unlock(&cluster->lock); | ||
| } | ||
|
|
@@ -861,12 +923,10 @@ wasm_cluster_terminate_all_except_self(WASMCluster *cluster, | |
| { | ||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = true; | ||
| os_mutex_unlock(&cluster->lock); | ||
|
|
||
| traverse_list(&cluster->exec_env_list, terminate_thread_visitor, | ||
| (void *)exec_env); | ||
| safe_traverse_exec_env_list(cluster, terminate_thread_visitor, | ||
| (void *)exec_env); | ||
|
|
||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = false; | ||
| os_mutex_unlock(&cluster->lock); | ||
| } | ||
|
|
@@ -888,11 +948,9 @@ wams_cluster_wait_for_all(WASMCluster *cluster) | |
| { | ||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = true; | ||
| os_mutex_unlock(&cluster->lock); | ||
|
|
||
| traverse_list(&cluster->exec_env_list, wait_for_thread_visitor, NULL); | ||
| safe_traverse_exec_env_list(cluster, wait_for_thread_visitor, NULL); | ||
|
|
||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = false; | ||
| os_mutex_unlock(&cluster->lock); | ||
| } | ||
|
|
@@ -903,12 +961,10 @@ wasm_cluster_wait_for_all_except_self(WASMCluster *cluster, | |
| { | ||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = true; | ||
| os_mutex_unlock(&cluster->lock); | ||
|
|
||
| traverse_list(&cluster->exec_env_list, wait_for_thread_visitor, | ||
| (void *)exec_env); | ||
| safe_traverse_exec_env_list(cluster, wait_for_thread_visitor, | ||
| (void *)exec_env); | ||
|
|
||
| os_mutex_lock(&cluster->lock); | ||
| cluster->processing = false; | ||
| os_mutex_unlock(&cluster->lock); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Had better declare
size_t i;with a single line, here it it C source code, we usually keep the coding style of C.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I'll update the code. It'd be good to have some sort of CI check to enforce that though (I guess compile with
-std=c89would do, but is the whole codebase c89 compliant?)