Skip to content

Commit 3151788

Browse files
SChernykh authored and malbit committed
Optimized dataset read (tevador#211)
* Optimized dataset read

  There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md, section 4.6.2 "Loop execution", steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPUs can start executing the read earlier.

  Results: https://i.imgur.com/Bpeq9mx.png (~1% speedup on modern Intel/AMD CPUs).

* ARMv8: optimized dataset read

  Break the dependency on readReg2 and readReg3.

* Fixed light mode hashing
1 parent ea839a6 commit 3151788

File tree

5 files changed

+20
-18
lines changed

5 files changed

+20
-18
lines changed

src/asm/program_prologue_linux.inc

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
mov rsi, rdx ;# uint8_t* scratchpad
1616

1717
mov rax, rbp
18+
ror rbp, 32
1819

1920
;# zero integer registers
2021
xor r8, r8

src/asm/program_prologue_win64.inc

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
mov rbx, r9 ;# loop counter
2929

3030
mov rax, rbp
31+
ror rbp, 32
3132

3233
;# zero integer registers
3334
xor r8, r8

src/asm/program_read_dataset.inc

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,16 @@
1+
mov ecx, ebp ;# ecx = ma
2+
and ecx, RANDOMX_DATASET_BASE_MASK
3+
xor r8, qword ptr [rdi+rcx]
4+
ror rbp, 32 ;# swap "ma" and "mx"
15
xor rbp, rax ;# modify "mx"
26
mov edx, ebp ;# edx = mx
37
and edx, RANDOMX_DATASET_BASE_MASK
48
prefetchnta byte ptr [rdi+rdx]
5-
ror rbp, 32 ;# swap "ma" and "mx"
6-
mov edx, ebp ;# edx = ma
7-
and edx, RANDOMX_DATASET_BASE_MASK
8-
lea rcx, [rdi+rdx] ;# dataset cache line
9-
xor r8, qword ptr [rcx+0]
10-
xor r9, qword ptr [rcx+8]
11-
xor r10, qword ptr [rcx+16]
12-
xor r11, qword ptr [rcx+24]
13-
xor r12, qword ptr [rcx+32]
14-
xor r13, qword ptr [rcx+40]
15-
xor r14, qword ptr [rcx+48]
16-
xor r15, qword ptr [rcx+56]
9+
xor r9, qword ptr [rdi+rcx+8]
10+
xor r10, qword ptr [rdi+rcx+16]
11+
xor r11, qword ptr [rdi+rcx+24]
12+
xor r12, qword ptr [rdi+rcx+32]
13+
xor r13, qword ptr [rdi+rcx+40]
14+
xor r14, qword ptr [rdi+rcx+48]
15+
xor r15, qword ptr [rdi+rcx+56]
1716

src/asm/program_read_dataset_sshash_init.inc

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,10 @@
88
mov qword ptr [rsp+16], r13
99
mov qword ptr [rsp+8], r14
1010
mov qword ptr [rsp+0], r15
11-
xor rbp, rax ;# modify "mx"
1211
ror rbp, 32 ;# swap "ma" and "mx"
13-
mov ebx, ebp ;# ecx = ma
14-
and ebx, RANDOMX_DATASET_BASE_MASK
15-
shr ebx, 6 ;# ebx = Dataset block number
12+
xor rbp, rax ;# modify "mx"
13+
mov rbx, rbp ;# ebx = ma
14+
shr rbx, 38
15+
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
1616
;# add ebx, datasetOffset / 64
1717
;# call 32768

src/jit_compiler_a64_static.S

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
307307
literal_v15: .fill 2,8,0
308308

309309
DECL(randomx_program_aarch64_vm_instructions_end):
310+
# Calculate dataset pointer for dataset read
311+
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
312+
lsr x10, x9, 32
310313

311314
# mx ^= r[readReg2] ^ r[readReg3];
312315
eor x9, x9, x18
@@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
324327
# mx <-> ma
325328
ror x9, x9, 32
326329

327-
# Calculate dataset pointer for dataset read
328-
mov w10, w9
329330
DECL(randomx_program_aarch64_cacheline_align_mask2):
330331
# Actual mask will be inserted by JIT compiler
331332
and x10, x10, 1

0 commit comments

Comments
 (0)