حمیدرضا قبادی; fib.asm
;void fib(int32_t *dest, uint32_t count);
; not-unrolled version. See below for a version which avoids all the mov instructions
global fib
fib:
; 64bit SysV register-call ABI:
; args: rdi: output buffer pointer. esi: count (and you can assume the upper32 are zeroed, so using rsi is safe)
;; locals: rsi: endp
;; eax: current edx: prev
;; ecx: tmp
;; all of these are caller-saved in the SysV ABI, like r8-r11
;; so we can use them without push/pop to save/restore them.
;; The Windows ABI is different.
test esi, esi ; test a reg against itself instead of cmp esi, 0
jz .early_out ; count == 0.
mov eax, 1 ; current = 1
xor edx, edx ; prev = 0
lea rsi, [rdi + rsi * 4] ; endp = &out[count]; // loop-end pointer
;; lea is very useful for combining add, shift, and non-destructive operation
;; this is equivalent to shl rsi, 4 / add rsi, rdi
align 16
.loop: ; do {
mov [rdi], eax ; *buf = current
add rdi, 4 ; buf++
lea ecx, [rax + rdx] ; tmp = curr+prev = next_cur
mov edx, eax ; prev = curr
mov eax, ecx ; curr=tmp
;; see below for an unrolled version that doesn't need any reg->reg mov instructions
; you might think this would be faster:
; add edx, eax ; but it isn't
; xchg eax, edx ; This is as slow as 3 mov instructions, but we only needed 2 thanks to using lea
cmp rdi, rsi ; } while(buf < endp);
jb .loop ; jump if (rdi BELOW rsi). unsigned compare
;; the LOOP instruction is very slow, avoid it
.early_out:
ret