Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 74 additions & 18 deletions core/iwasm/libraries/thread-mgr/thread_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,58 @@ traverse_list(bh_list *l, list_visitor visitor, void *user_data)
}
}

/* Assumes cluster->lock is locked */
static bool
safe_traverse_exec_env_list(WASMCluster *cluster, list_visitor visitor,
void *user_data)
{
Vector proc_nodes;
void *node;
bool ret = true;

if (!bh_vector_init(&proc_nodes, cluster->exec_env_list.len, sizeof(void *),
false)) {
ret = false;
goto final;
}

node = bh_list_first_elem(&cluster->exec_env_list);

while (node) {
bool already_processed = false;
void *proc_node;
for (size_t i = 0; i < bh_vector_size(&proc_nodes); i++) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to declare size_t i; on a line of its own; this is C source code, and we usually keep to the C coding style.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I'll update the code. It'd be good to have some sort of CI check to enforce that though (I guess compile with -std=c89 would do, but is the whole codebase c89 compliant?)

if (!bh_vector_get(&proc_nodes, i, &proc_node)) {
ret = false;
goto final;
}
if (proc_node == node) {
already_processed = true;
break;
}
}
if (already_processed) {
node = bh_list_elem_next(node);
continue;
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess for many (all?) usages it's OK to visit nodes multiple times, and thus it isn't worth having this O(x^2) check.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, most likely that won't be an issue; having said that, I don't think x is going to be large enough to worry about (so I'd rather keep the implementation slow but always correct than make it a bit faster and risk somebody using it incorrectly). If you think that will have a significant performance impact (because x can be large), I can update the code.


os_mutex_unlock(&cluster->lock);
visitor(node, user_data);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what keeps "node" from being freed while we release cluster->lock here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the visitor's responsibility. See wait_for_thread_visitor and terminate_thread_visitor - they both check the exec_env with clusters_have_exec_env() while holding the lock.

os_mutex_lock(&cluster->lock);
if (!bh_vector_append(&proc_nodes, &node)) {
ret = false;
goto final;
}

node = bh_list_first_elem(&cluster->exec_env_list);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes the traversal start from the first node again: the nodes from the list head up to the current one are tested again for membership in the vector, which is nearly O(n^2), and together with the outer while (node) loop the algorithm's complexity is nearly O(n^3). That is not very efficient.

I think we can check whether the most recently visited node is still in the exec_env list: if it is, we take its next element and continue traversing; if not, we restart the traversal from the head of the list:

/*
 * Call `visitor(node, user_data)` for the nodes of cluster->exec_env_list,
 * releasing cluster->lock around each visitor call and re-acquiring it
 * afterwards.
 *
 * Every visited node pointer is recorded in a temporary vector. After each
 * visit the traversal resumes from the successor of the most recently
 * visited node that is still present in the list, or restarts from the list
 * head when no visited node remains in the list.
 *
 * The unlock-then-lock order inside the loop implies that the caller holds
 * cluster->lock on entry; the lock is held again on every return path.
 *
 * Returns true once the traversal runs off the end of the list, false if a
 * vector operation (init, append or get) fails.
 *
 * NOTE(review): "already visited" is decided purely by pointer equality, so
 * a node freed during the visit whose address is reused by a new list
 * element would be treated as visited - confirm this cannot happen.
 */
static bool
safe_traverse_exec_env_list(WASMCluster *cluster, list_visitor visitor,
                            void *user_data)
{
    Vector proc_nodes;
    void *node;
    bool ret = false;

    /* Pre-size the visited-node vector to the current list length. */
    if (!bh_vector_init(&proc_nodes, cluster->exec_env_list.len, sizeof(void *),
                        false)) {
        /* NOTE(review): `final` destroys a vector whose init just failed -
           confirm bh_vector_destroy tolerates that. */
        goto final;
    }

    node = bh_list_first_elem(&cluster->exec_env_list);

    while (node) {
        void *proc_node;
        bool found;
        int i;

        /* The visitor runs without cluster->lock held; the list may change
           while it is released. */
        os_mutex_unlock(&cluster->lock);
        visitor(node, user_data);
        os_mutex_lock(&cluster->lock);

        /* Remember the node that was just visited. */
        if (!bh_vector_append(&proc_nodes, &node)) {
            goto final;
        }

        found = false;
        /* Find the most recent visited and still existing node */
        for (i = (int)bh_vector_size(&proc_nodes) - 1; i >= 0; i--) {
            if (!bh_vector_get(&proc_nodes, i, &proc_node)) {
                goto final;
            }
            /* `node` is reused as the scan cursor here; when the scan breaks
               with found == true it points at the list element equal to
               proc_node. */
            node = bh_list_first_elem(&cluster->exec_env_list);
            while (node) {
                if (proc_node == node) {
                    found = true;
                    break;
                }
                node = bh_list_elem_next(node);
            }
            if (found)
                break;
            else {
                /* The node has been removed from cluster->exec_env_list */
                (void)bh_vector_remove(&proc_nodes, i, NULL);
            }
        }

        /* Resume after the surviving visited node, or restart from the head
           when none of the visited nodes is still in the list. */
        if (found)
            node = bh_list_elem_next(node);
        else
            node = bh_list_first_elem(&cluster->exec_env_list);
    }

    ret = true;
final:
    bh_vector_destroy(&proc_nodes);

    return ret;
}

Could you help check whether it works?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is still not very efficient, seems to be O(n^2).
Wondering why we need to unlock and lock again when visiting the node? Will the visiting lock the cluster->lock again?

Copy link
Copy Markdown
Contributor Author

@loganek loganek Feb 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wenyongh please note the critical operation in the loop is:

os_mutex_unlock(&cluster->lock);
visitor(node, user_data);
os_mutex_lock(&cluster->lock);

and in either implementation that is executed only N times. Also, I think in the current implementation we only get past the for loop (and therefore go back to the beginning of the list) if the node is not in the vector, so the actual complexity is quadratic. I think there's room for improvement (not only in the iteration itself, but also e.g. by using a hash set instead of a vector), but I don't think this will significantly improve the performance of this function.

Wondering why we need to unlock and lock again when visiting the node? Will the visiting lock the cluster->lock again?

Yes, they lock the cluster's lock internally.

My suggestion is to push the fix as it is (unless you see bugs in it) to not block potential customers (please note the problem is also with the main branch, so I'm updating the PR to point to main) and we can tune the performance of the function later if needed. What do you think?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, let's merge this PR first.

}

final:
bh_vector_destroy(&proc_nodes);

return ret;
}

/* The caller must lock cluster->lock */
static bool
allocate_aux_stack(WASMCluster *cluster, uint32 *start, uint32 *size)
Expand Down Expand Up @@ -299,7 +351,6 @@ wasm_cluster_del_exec_env(WASMCluster *cluster, WASMExecEnv *exec_env)
os_mutex_unlock(&cluster->debug_inst->wait_lock);
}
#endif

if (bh_list_remove(&cluster->exec_env_list, exec_env) != 0)
ret = false;

Expand Down Expand Up @@ -724,16 +775,22 @@ wasm_cluster_join_thread(WASMExecEnv *exec_env, void **ret_val)
korp_tid handle;

os_mutex_lock(&cluster_list_lock);
os_mutex_lock(&exec_env->cluster->lock);

if (!clusters_have_exec_env(exec_env) || exec_env->thread_is_detached) {
/* Invalid thread, thread has exited or thread has been detached */
if (ret_val)
*ret_val = NULL;
os_mutex_unlock(&exec_env->cluster->lock);
os_mutex_unlock(&cluster_list_lock);
return 0;
}
exec_env->wait_count++;
handle = exec_env->handle;

os_mutex_unlock(&exec_env->cluster->lock);
os_mutex_unlock(&cluster_list_lock);

return os_thread_join(handle, ret_val);
}

Expand Down Expand Up @@ -816,15 +873,22 @@ int32
wasm_cluster_cancel_thread(WASMExecEnv *exec_env)
{
    /*
     * Set the thread-cancel flags on exec_env via set_thread_cancel_flags(),
     * provided exec_env is attached to a cluster and
     * clusters_have_exec_env() still reports it.
     *
     * Lock order: cluster_list_lock first, then the owning cluster's lock.
     * Both are released, in reverse order, on every return path.
     *
     * Always returns 0, including when exec_env has no cluster or is no
     * longer known (invalid thread or the thread has exited).
     */
    WASMCluster *cluster;

    os_mutex_lock(&cluster_list_lock);

    /* Read and NULL-check the cluster pointer BEFORE touching its lock;
       locking exec_env->cluster->lock first would dereference NULL. */
    cluster = exec_env->cluster;
    if (!cluster) {
        os_mutex_unlock(&cluster_list_lock);
        return 0;
    }

    os_mutex_lock(&cluster->lock);

    if (clusters_have_exec_env(exec_env)) {
        /* Both locks stay held while the flags are set. */
        set_thread_cancel_flags(exec_env);
    }
    /* else: invalid thread or the thread has exited - nothing to cancel */

    /* Unlock through the saved pointer so the release always matches the
       lock that was actually taken above. */
    os_mutex_unlock(&cluster->lock);
    os_mutex_unlock(&cluster_list_lock);

    return 0;
}

Expand All @@ -846,11 +910,9 @@ wasm_cluster_terminate_all(WASMCluster *cluster)
{
os_mutex_lock(&cluster->lock);
cluster->processing = true;
os_mutex_unlock(&cluster->lock);

traverse_list(&cluster->exec_env_list, terminate_thread_visitor, NULL);
safe_traverse_exec_env_list(cluster, terminate_thread_visitor, NULL);

os_mutex_lock(&cluster->lock);
cluster->processing = false;
os_mutex_unlock(&cluster->lock);
}
Expand All @@ -861,12 +923,10 @@ wasm_cluster_terminate_all_except_self(WASMCluster *cluster,
{
os_mutex_lock(&cluster->lock);
cluster->processing = true;
os_mutex_unlock(&cluster->lock);

traverse_list(&cluster->exec_env_list, terminate_thread_visitor,
(void *)exec_env);
safe_traverse_exec_env_list(cluster, terminate_thread_visitor,
(void *)exec_env);

os_mutex_lock(&cluster->lock);
cluster->processing = false;
os_mutex_unlock(&cluster->lock);
}
Expand All @@ -888,11 +948,9 @@ wams_cluster_wait_for_all(WASMCluster *cluster)
{
os_mutex_lock(&cluster->lock);
cluster->processing = true;
os_mutex_unlock(&cluster->lock);

traverse_list(&cluster->exec_env_list, wait_for_thread_visitor, NULL);
safe_traverse_exec_env_list(cluster, wait_for_thread_visitor, NULL);

os_mutex_lock(&cluster->lock);
cluster->processing = false;
os_mutex_unlock(&cluster->lock);
}
Expand All @@ -903,12 +961,10 @@ wasm_cluster_wait_for_all_except_self(WASMCluster *cluster,
{
os_mutex_lock(&cluster->lock);
cluster->processing = true;
os_mutex_unlock(&cluster->lock);

traverse_list(&cluster->exec_env_list, wait_for_thread_visitor,
(void *)exec_env);
safe_traverse_exec_env_list(cluster, wait_for_thread_visitor,
(void *)exec_env);

os_mutex_lock(&cluster->lock);
cluster->processing = false;
os_mutex_unlock(&cluster->lock);
}
Expand Down