Skip to content

Commit 08d8893

Browse files
Modify code for unlimited number of threads.
1 parent 9be804c commit 08d8893

8 files changed

Lines changed: 67 additions & 53 deletions

File tree

src/Makefile

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,6 @@ STRIP_CMD = strip
3232
# DCLUSTER -- Compile MPI cluster code
3333
# DCLUSTER_TT_TYPE=n -- Distributed transposition table type
3434
# (0 - global, 1 - distributed, 2 - local) default=2
35-
# DMAX_CPUS=n -- Compile for maximum of n cpus
3635
# DYBW -- Compile with YBW
3736
# DTUNE -- Compile evaluation tuning code [NOT recommended]
3837
# DMYDEBUG -- Turn on some MPI debugging
@@ -44,7 +43,6 @@ ARCH_DEFINES = -DARC_64BIT -DHAS_POPCNT
4443

4544
# Feature options
4645
FEATURE_DEFINES = -DNNUE_INC
47-
#FEATURE_DEFINES += -DMAX_CPUS=1360
4846

4947
# Conditional options based on compiler
5048
CONDITIONAL_DEFINES =

src/mcts.cpp

Lines changed: 19 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -73,8 +73,9 @@ static std::atomic_int n_collisions = {0};
7373
static std::atomic_int n_terminals = {0};
7474

7575
/*Nodes and edges of tree*/
76-
std::vector<Node*> Node::mem_[MAX_CPUS];
77-
std::vector<uint16_t*> Edges::mem_[MAX_CPUS][MAX_MOVES_NN >> 3];
76+
std::vector<std::vector<Node*>> Node::mem_;
77+
std::vector<std::vector<Node*>> Node::gc;
78+
std::vector<std::array<std::vector<uint16_t*>, (MAX_MOVES_NN >> 3)>> Edges::mem_;
7879
std::atomic_uint Node::total_nodes = {0};
7980
unsigned int Node::max_tree_nodes = 0;
8081
unsigned int Node::max_tree_depth = 0;
@@ -270,12 +271,12 @@ void Node::reset_bounds(Node* n) {
270271
}
271272
}
272273

273-
void Node::split(Node* n, std::vector<Node*>* pn, const int S, int& T) {
274+
void Node::split(Node* n, const int S, int& T) {
274275
static int id = 0;
275276
Node* current = n->child;
276277
while(current) {
277278
if(current->visits <= (unsigned)S || !current->child) {
278-
pn[id].push_back(current);
279+
Node::gc[id].push_back(current);
279280
current->set_dead();
280281

281282
T += current->visits;
@@ -284,7 +285,7 @@ void Node::split(Node* n, std::vector<Node*>* pn, const int S, int& T) {
284285
if(++id >= PROCESSOR::n_processors) id = 0;
285286
}
286287
} else {
287-
split(current,pn,S,T);
288+
split(current,S,T);
288289
}
289290
current = current->next;
290291
}
@@ -1765,31 +1766,29 @@ void SEARCHER::search_mc(bool single, unsigned int nodes_limit) {
17651766
/*
17661767
Traverse tree in parallel
17671768
*/
1768-
static std::vector<Node*> gc[MAX_CPUS+1];
1769-
17701769
void CDECL gc_thread_proc(void* seid_) {
17711770
int* seid = (int*)seid_;
17721771
for(int proc_id = seid[0]; proc_id < seid[1]; proc_id++) {
1773-
for(unsigned int i = 0; i < gc[proc_id].size(); i++) {
1774-
Node::reclaim(gc[proc_id][i],proc_id);
1772+
for(unsigned int i = 0; i < Node::gc[proc_id].size(); i++) {
1773+
Node::reclaim(Node::gc[proc_id][i],proc_id);
17751774
}
17761775
}
17771776
}
17781777
void CDECL rank_reset_thread_proc(void* seid_) {
17791778
int* seid = (int*)seid_;
17801779
for(int proc_id = seid[0]; proc_id < seid[1]; proc_id++) {
1781-
for(unsigned int i = 0; i < gc[proc_id].size(); i++) {
1782-
Node::rank_children(gc[proc_id][i]);
1783-
Node::reset_bounds(gc[proc_id][i]);
1780+
for(unsigned int i = 0; i < Node::gc[proc_id].size(); i++) {
1781+
Node::rank_children(Node::gc[proc_id][i]);
1782+
Node::reset_bounds(Node::gc[proc_id][i]);
17841783
}
17851784
}
17861785
}
17871786

17881787
void CDECL convert_score_thread_proc(void* seid_) {
17891788
int* seid = (int*)seid_;
17901789
for(int proc_id = seid[0]; proc_id < seid[1]; proc_id++) {
1791-
for(unsigned int i = 0; i < gc[proc_id].size(); i++) {
1792-
Node::convert_score(gc[proc_id][i]);
1790+
for(unsigned int i = 0; i < Node::gc[proc_id].size(); i++) {
1791+
Node::convert_score(Node::gc[proc_id][i]);
17931792
}
17941793
}
17951794
}
@@ -1803,15 +1802,15 @@ void Node::parallel_job(Node* n, PTHREAD_PROC func, bool recursive) {
18031802
for(int i = 1;i < PROCESSOR::n_processors;i++)
18041803
PROCESSOR::park(i);
18051804

1806-
Node::split(n, gc, S, T);
1805+
Node::split(n, S, T);
18071806

18081807
int* seid = new int[2 * ncores];
18091808
std::thread* tid = new std::thread[ncores];
18101809

18111810
if(!recursive)
1812-
gc[0].push_back(n);
1811+
Node::gc[0].push_back(n);
18131812
else {
1814-
gc[nprocs].push_back(n);
1813+
Node::gc[nprocs].push_back(n);
18151814
seid[0] = nprocs;
18161815
seid[1] = nprocs + 1;
18171816
tid[0] = t_create(*func,&seid[0]);
@@ -1830,10 +1829,10 @@ void Node::parallel_job(Node* n, PTHREAD_PROC func, bool recursive) {
18301829
delete[] tid;
18311830

18321831
for(int i = 0; i < nprocs;i++) {
1833-
for(unsigned int j = 0; j < gc[i].size(); j++) {
1834-
gc[i][j]->clear_dead();
1832+
for(unsigned int j = 0; j < Node::gc[i].size(); j++) {
1833+
Node::gc[i][j]->clear_dead();
18351834
}
1836-
gc[i].clear();
1835+
Node::gc[i].clear();
18371836
}
18381837

18391838
for(int i = 1;i < PROCESSOR::n_processors;i++)

src/parallel.cpp

Lines changed: 27 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
* at each node will create enough threads to engage all its processors.
1010
*/
1111

12-
static std::thread threads[MAX_CPUS];
12+
static std::vector<std::thread> threads;
1313

1414
#ifdef CLUSTER
1515

@@ -904,6 +904,12 @@ void SEARCHER::update_master(int skip) {
904904
* Copy board and other relevant data..
905905
*/
906906
void SEARCHER::attach_processor(int new_proc_id) {
907+
if(workers == 0) {
908+
workers = new std::atomic<SEARCHER*>[PROCESSOR::n_processors];
909+
for(int i = 0; i < PROCESSOR::n_processors; i++)
910+
workers[i] = 0;
911+
}
912+
907913
int j = 0;
908914
for(j = 0; (j < MAX_SEARCHERS_PER_CPU) && processors[new_proc_id]->searchers[j].used; j++);
909915
if(j < MAX_SEARCHERS_PER_CPU) {
@@ -941,8 +947,6 @@ void SEARCHER::clear_block() {
941947
host_workers.clear();
942948
#endif
943949
n_workers = 0;
944-
for(int i = 0; i < PROCESSOR::n_processors;i++)
945-
workers[i] = 0;
946950
l_unlock(lock);
947951

948952
/*reset counts*/
@@ -961,12 +965,14 @@ void SEARCHER::clear_block() {
961965
*/
962966
void SEARCHER::stop_workers() {
963967
l_lock(lock);
964-
for(int i = 0; i < PROCESSOR::n_processors; i++) {
965-
SEARCHER* pworker = workers[i].load();
966-
if(pworker) {
967-
if(pworker->n_workers)
968-
pworker->stop_workers();
969-
pworker->stop_searcher = 1;
968+
if(workers) {
969+
for(int i = 0; i < PROCESSOR::n_processors; i++) {
970+
SEARCHER* pworker = workers[i].load();
971+
if(pworker) {
972+
if(pworker->n_workers)
973+
pworker->stop_workers();
974+
pworker->stop_searcher = 1;
975+
}
970976
}
971977
}
972978
#ifdef CLUSTER
@@ -1126,6 +1132,13 @@ void PROCESSOR::wait(int id) {
11261132
* Initialize mt number of threads by creating/deleting
11271133
* threads from the pool of processors.
11281134
*/
1135+
static void resize_threads(int mt) {
1136+
threads.resize(mt);
1137+
processors.resize(mt, 0);
1138+
Node::mem_.resize(mt);
1139+
Node::gc.resize(mt+1);
1140+
Edges::mem_.resize(mt);
1141+
}
11291142
void init_smp(int mt) {
11301143
PPROCESSOR proc = processors[0];
11311144
int n_procs = PROCESSOR::n_processors;
@@ -1134,7 +1147,8 @@ void init_smp(int mt) {
11341147
reset_tables(proc,0);
11351148

11361149
if(n_procs < mt) {
1137-
for(int i = 1; i < MAX_CPUS;i++) {
1150+
resize_threads(mt);
1151+
for(int i = 1; i < mt;i++) {
11381152
if(n_procs < mt) {
11391153
if(processors[i] == 0) {
11401154
PROCESSOR::create(i);
@@ -1145,14 +1159,15 @@ void init_smp(int mt) {
11451159
while(PROCESSOR::n_idle_processors < mt - 1)
11461160
t_yield();
11471161
} else if(n_procs > mt) {
1148-
for(int i = MAX_CPUS - 1; i >= 1;i--) {
1162+
for(int i = n_procs - 1; i >= 1;i--) {
11491163
if(n_procs > mt) {
11501164
if(processors[i] != 0) {
11511165
PROCESSOR::kill(i);
11521166
n_procs--;
11531167
}
11541168
}
11551169
}
1170+
resize_threads(mt);
11561171
}
11571172
}
11581173

@@ -1165,5 +1180,6 @@ void PROCESSOR::set_main() {
11651180
proc->searcher->used = true;
11661181
proc->searcher->processor_id = 0;
11671182
proc->state = GO;
1183+
resize_threads(1);
11681184
processors[0] = proc;
11691185
}

src/probe.cpp

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -384,10 +384,10 @@ bool SEARCHER::bitbase_cutoff() {
384384
Neural network
385385
*/
386386

387-
static float* inp_planes[MAX_CPUS];
388-
static unsigned short* all_pindex[MAX_CPUS];
389-
static float* all_policy[MAX_CPUS];
390-
static float all_wdl[MAX_CPUS][3];
387+
static std::vector<float*> inp_planes;
388+
static std::vector<unsigned short*> all_pindex;
389+
static std::vector<float*> all_policy;
390+
static std::vector<std::array<float,3>> all_wdl;
391391

392392
void init_input_planes() {
393393
static bool init_done = false;
@@ -399,6 +399,11 @@ void init_input_planes() {
399399
(8 * 8 * net_channels[NNUE] * 2) :
400400
(8 * 8 * net_channels[LCZERO]);
401401

402+
inp_planes.reserve(PROCESSOR::n_processors);
403+
all_pindex.reserve(PROCESSOR::n_processors);
404+
all_policy.reserve(PROCESSOR::n_processors);
405+
all_wdl.reserve(PROCESSOR::n_processors);
406+
402407
aligned_reserve<float>(planes, PROCESSOR::n_processors * N_PLANE);
403408
aligned_reserve<unsigned short>(index, PROCESSOR::n_processors * MAX_MOVES_NN);
404409
aligned_reserve<float>(policy, PROCESSOR::n_processors * MAX_MOVES_NN);

src/scorpio.cpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ int move_overhead = 1500;
3434
/*
3535
parallel search
3636
*/
37-
PPROCESSOR processors[MAX_CPUS] = {0};
37+
std::vector<PPROCESSOR> processors;
3838
int PROCESSOR::n_processors;
3939
int PROCESSOR::n_cores;
4040
std::atomic_int PROCESSOR::n_idle_processors;
@@ -437,7 +437,7 @@ static void print_options() {
437437
print_check("log",log_on);
438438
print_button("clear_hash");
439439
print_spin("resign",SEARCHER::resign_value,100,30000);
440-
print_spin("mt",PROCESSOR::n_processors,1,MAX_CPUS);
440+
print_spin("mt",PROCESSOR::n_processors,1,(1<<20));
441441
print_spin("ht",ht,1,131072);
442442
print_spin("eht",eht,1,16384);
443443
print_spin("pht",pht,1,256);
@@ -537,7 +537,6 @@ int internal_commands(char** commands,char* command,int& command_num) {
537537
*/
538538
} else if(!strcmp(command,"affinity")) {
539539
int affinity = atoi(commands[command_num]);
540-
affinity = MIN(affinity, MAX_CPUS);
541540
PROCESSOR::n_cores = set_affinity(affinity);
542541
command_num++;
543542
} else if(!strcmp(command,"mt") || !strcmp(command,"cores") || !strcmp(command,"Threads") ) {
@@ -553,7 +552,6 @@ int internal_commands(char** commands,char* command,int& command_num) {
553552
mt = PROCESSOR::n_cores / r;
554553
} else
555554
mt = atoi(commands[command_num]);
556-
mt = MIN(mt, MAX_CPUS);
557555
ht_setting_changed = true;
558556
}
559557
command_num++;

src/scorpio.h

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ Some definitions to include/remove code
3333
# include <sys/time.h>
3434
#endif
3535
#include <vector>
36+
#include <array>
3637
#ifdef CLUSTER
3738
# include <list>
3839
# include "mpi.h"
@@ -42,9 +43,6 @@ Some definitions to include/remove code
4243
/*
4344
parallel search options
4445
*/
45-
#if !defined(MAX_CPUS)
46-
# define MAX_CPUS 512
47-
#endif
4846
#if defined(YBW)
4947
# define MAX_SEARCHERS_PER_CPU 32
5048
# define MAX_CPUS_PER_SPLIT 8
@@ -425,7 +423,7 @@ class Edges {
425423

426424
static void allocate(Edges&, int, int);
427425
static void reclaim(Edges&, int);
428-
static std::vector<uint16_t*> mem_[MAX_CPUS][MAX_MOVES_NN >> 3];
426+
static std::vector<std::array<std::vector<uint16_t*>, (MAX_MOVES_NN >> 3)>> mem_;
429427
};
430428
/*
431429
* Nodes of the tree
@@ -517,9 +515,10 @@ struct Node {
517515
static unsigned int max_tree_nodes;
518516
static unsigned int max_tree_depth;
519517
static unsigned int sum_tree_depth;
520-
static std::vector<Node*> mem_[MAX_CPUS];
518+
static std::vector<std::vector<Node*>> mem_;
519+
static std::vector<std::vector<Node*>> gc;
521520
static Node* allocate(int);
522-
static void split(Node*, std::vector<Node*>*, const int, int&);
521+
static void split(Node*, const int, int&);
523522
static void reclaim(Node*,int);
524523
static void rank_children(Node*);
525524
static void convert_score(Node*);
@@ -882,7 +881,7 @@ typedef struct SEARCHER{
882881
void clear_block();
883882
LOCK lock;
884883
std::atomic_int n_workers;
885-
std::atomic<SEARCHER*> workers[MAX_CPUS];
884+
std::atomic<SEARCHER*>* workers;
886885

887886
int get_smp_move();
888887
int check_split();
@@ -1220,7 +1219,7 @@ typedef struct PROCESSOR {
12201219

12211220
} *PPROCESSOR;
12221221

1223-
extern PPROCESSOR processors[MAX_CPUS];
1222+
extern std::vector<PPROCESSOR> processors;
12241223

12251224
/*
12261225
multi processors

src/search.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,7 +1342,7 @@ void SEARCHER::search() {
13421342
/*alphabeta*/
13431343
} else {
13441344
bool split_work =
1345-
(use_abdada_smp && search_depth > PROCESSOR::SMP_SPLIT_DEPTH);
1345+
(PROCESSOR::n_processors > 1 && use_abdada_smp && search_depth > PROCESSOR::SMP_SPLIT_DEPTH);
13461346

13471347
/*attach helper processor here once for abdada*/
13481348
if(split_work) {
@@ -2035,7 +2035,7 @@ MOVE SEARCHER::find_best() {
20352035

20362036
/*park threads*/
20372037
int n_end = (montecarlo && montecarlo_skipped) ?
2038-
PROCESSOR::n_cores : PROCESSOR::n_processors;
2038+
MIN(PROCESSOR::n_cores, PROCESSOR::n_processors) : PROCESSOR::n_processors;
20392039
for(int i = 1;i < n_end;i++)
20402040
PROCESSOR::park(i);
20412041
#if defined(CLUSTER)

src/util.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1054,8 +1054,7 @@ SEARCHER::SEARCHER() : board(&temp_board[36])
10541054
l_create(lock);
10551055
used = false;
10561056
n_workers = 0;
1057-
for(int i = 0; i < MAX_CPUS;i++)
1058-
workers[i] = NULL;
1057+
workers = 0;
10591058
processor_id = 0;
10601059
#ifdef CLUSTER
10611060
host_workers.clear();

0 commit comments

Comments
 (0)