/* 
 * Fast Fourier transform
 * 
 * Copyright (c) 2021 Project Nayuki. (MIT License)
 * https://www.nayuki.io/page/fast-fourier-transform-in-x86-assembly
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 * - The above copyright notice and this permission notice shall be included in
 *   all copies or substantial portions of the Software.
 * - The Software is provided "as is", without warranty of any kind, express or
 *   implied, including but not limited to the warranties of merchantability,
 *   fitness for a particular purpose and noninfringement. In no event shall the
 *   authors or copyright holders be liable for any claim, damages or other
 *   liability, whether in an action of contract, tort or otherwise, arising from,
 *   out of or in connection with the Software or the use or other dealings in the
 *   Software.
 */


/* 
 * Storage usage:
 *    Bytes  Location  Description
 *        8  rcx       Size of FFT (i.e. number of elements in the vector) (a power of 2), or this times 16
 *        8  rdi       Base of complex vector array, constant (64-bit floats)
 *        8  rsi       Base of bit reversal array (64-bit ints), then base of complex exponential array (64-bit floats)
 *        8  rax       Primary loop counter
 *        8  rdx       Temporary, loop counter
 *        8  r8        Temporary, loop counter
 *        8  r9        Temporary
 *        8  r10       Temporary
 *      224  ymm0-6    Temporary (64-bit float vectors)
 *       16  xmm7      Temporary (64-bit float vector)
 *       32  ymm8      Negation constants (64-bit float vector)
 */

# void Fft_transformImpl(uint64_t n, const uint64_t *bitReversal, const double *expTable, double *vec)
.globl Fft_transformImpl
Fft_transformImpl:
	# Permute register names for aesthetics:
	# rcx = n, rdi = vec, rsi = bitReversal, r9 = expTable
	xchgq  %rcx, %rdi
	movq   %rdx, %r9
	
	# Bit-reversed addressing permutation: swap vec[i] and vec[bitReversal[i]],
	# doing each out-of-order pair exactly once (only when i < bitReversal[i])
	movq   $0, %rax
.bitrevloop:
	movq   (%rsi,%rax,8), %rdx
	cmpq   %rdx, %rax
	jae    .bitrevskip
	movq   %rax, %r8
	shlq   $4, %r8             # Scale indexes by 16 bytes per complex element
	shlq   $4, %rdx
	vmovupd  (%rdi,%r8 ), %xmm0
	vmovupd  (%rdi,%rdx), %xmm1
	vmovupd  %xmm1, (%rdi,%r8 )
	vmovupd  %xmm0, (%rdi,%rdx)
.bitrevskip:
	incq   %rax
	cmpq   %rcx, %rax
	jb     .bitrevloop
	
	# Size-2 merge (special): butterflies with twiddle factor 1
	shlq   $4, %rcx            # rcx = n * 16 = array size in bytes
	movq   $0, %rax
.size2loop:
	vmovupd   0(%rdi,%rax), %xmm0
	vmovupd  16(%rdi,%rax), %xmm1
	vaddpd   %xmm1, %xmm0, %xmm2
	vsubpd   %xmm1, %xmm0, %xmm3
	vmovupd  %xmm2,  0(%rdi,%rax)
	vmovupd  %xmm3, 16(%rdi,%rax)
	addq   $32, %rax
	cmpq   %rcx, %rax
	jb     .size2loop
	
	# Size-4 merge (special): twiddle factors are 1 and -i
	vmovapd  .size4negation(%rip), %xmm8
	movq   $0, %rax
.size4loop:
	vmovupd   0(%rdi,%rax), %xmm0
	vmovupd  16(%rdi,%rax), %xmm1
	vmovupd  32(%rdi,%rax), %xmm2
	vmovupd  48(%rdi,%rax), %xmm3
	vaddpd   %xmm2, %xmm0, %xmm4
	vsubpd   %xmm2, %xmm0, %xmm6
	vpermilpd  $1, %xmm3, %xmm3  # Multiply by -i: (re,im) -> (im,-re), via swap
	vxorpd   %xmm3, %xmm8, %xmm3 # then sign flip of the imaginary slot
	vaddpd   %xmm3, %xmm1, %xmm5
	vsubpd   %xmm3, %xmm1, %xmm7
	vmovupd  %xmm4,  0(%rdi,%rax)
	vmovupd  %xmm5, 16(%rdi,%rax)
	vmovupd  %xmm6, 32(%rdi,%rax)
	vmovupd  %xmm7, 48(%rdi,%rax)
	addq   $64, %rax
	cmpq   %rcx, %rax
	jb     .size4loop
	shrq   $4, %rcx            # Restore rcx = n
	
	# Size-8 and larger merges (general)
	vmovapd  .generalnegation(%rip), %ymm8
	movq   %r9, %rsi           # rsi now points to the complex exponential table
	movq   $8, %r8             # r8 = merge size, in units of 8-byte doubles
	jmp    .outerloop1
.outerloop0:
	leaq   (%rsi,%r8,8), %rsi  # Advance the trigonometric tables
	shlq   %r8                 # One-operand shift by 1: double the merge size
.outerloop1:
	movq   $0, %rdx
.middleloop:
	movq   $0, %rax
.innerloop:
	leaq   (%rdx,%rax), %r9
	leaq   (%r9,%r8), %r10
	vmovupd  (%rdi,%r9 ,8), %ymm0
	vmovupd  (%rdi,%r10,8), %ymm1
	vmovupd  (%rsi,%rax,8), %ymm2
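	# Complex multiply %ymm1 by the twiddle factors in %ymm2, two complex
	# values per register: duplicate the twiddles' imaginary parts into
	# ymm3 with the sign of the even slots flipped, duplicate the real
	# parts into ymm2, then one multiply plus one FMA produce
	# (br*re - bi*im, bi*re + br*im) in each 128-bit lane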
	vpermilpd  $15, %ymm2, %ymm3
	vpermilpd  $0, %ymm2, %ymm2
	vxorpd   %ymm3, %ymm8, %ymm3
	vmulpd   %ymm1, %ymm2, %ymm4
	vpermilpd  $5, %ymm1, %ymm1
	vfmadd231pd  %ymm1, %ymm3, %ymm4
	vaddpd   %ymm4, %ymm0, %ymm5   # a + w*b
	vsubpd   %ymm4, %ymm0, %ymm6   # a - w*b
	vmovupd  %ymm5, (%rdi,%r9 ,8)
	vmovupd  %ymm6, (%rdi,%r10,8)
	addq   $4, %rax
	cmpq   %r8, %rax
	jb     .innerloop          # End inner loop
	leaq   (%rdx,%r8,2), %rdx
	leaq   (%rcx,%rcx), %r9    # Total doubles in the array = 2 * n
	cmpq   %r9, %rdx
	jb     .middleloop         # End middle loop
	cmpq   %rcx, %r8
	jb     .outerloop0         # End outer loop
	
	# Clean up
	vzeroall
	retq

# Constants for XMM/YMM
.balign 16
.size4negation:    .quad 0x0000000000000000, 0x8000000000000000
.balign 32
.generalnegation:  .quad 0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000
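/*
 * Usage sketch (an assumption, not part of the original file): one plausible
 * way for a C caller to build the two tables this routine walks, inferred
 * from the addressing above. The helper name fftForward is hypothetical, and
 * the twiddle sign assumes a forward transform, matching the hard-coded
 * multiply by -i = exp(-2*pi*i/4) in the size-4 stage. n must be a power of
 * 2 and at least 8 here, since the general merge loop always performs one
 * size-8 pass.
 *
 * #include <math.h>
 * #include <stdint.h>
 * #include <stdlib.h>
 *
 * void Fft_transformImpl(uint64_t n, const uint64_t *bitReversal,
 *                        const double *expTable, double *vec);
 *
 * static void fftForward(uint64_t n, double *vec) {  // vec: n (re,im) pairs
 *     int levels = 0;  // log2(n)
 *     while ((UINT64_C(1) << levels) < n)
 *         levels++;
 *
 *     // bitReversal[i] = the bits of i reversed within 'levels' bits;
 *     // the permutation loop swaps vec[i] and vec[bitReversal[i]].
 *     uint64_t *bitRev = malloc(n * sizeof(uint64_t));
 *     for (uint64_t i = 0; i < n; i++) {
 *         uint64_t r = 0;
 *         for (int b = 0; b < levels; b++)
 *             r = (r << 1) | ((i >> b) & 1);
 *         bitRev[i] = r;
 *     }
 *
 *     // expTable: for each merge size s = 8, 16, ..., n, the s/2 complex
 *     // twiddles exp(-2*pi*i*k/s), k = 0..s/2-1, stored consecutively as
 *     // (re,im) pairs; the outer loop advances rsi past each sub-table.
 *     uint64_t total = 0;
 *     for (uint64_t s = 8; s <= n; s *= 2)
 *         total += s / 2;
 *     double *expTable = malloc(total * 2 * sizeof(double));
 *     double *p = expTable;
 *     for (uint64_t s = 8; s <= n; s *= 2) {
 *         for (uint64_t k = 0; k < s / 2; k++) {
 *             double ang = 2.0 * 3.14159265358979323846 * k / s;
 *             *p++ = cos(ang);   // real part
 *             *p++ = -sin(ang);  // imaginary part
 *         }
 *     }
 *
 *     Fft_transformImpl(n, bitRev, expTable, vec);
 *     free(bitRev);
 *     free(expTable);
 * }
 */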