# Introduction from the Author
#
# This file contains a sample of code produced by Eric Postpischil
# during a project to design and implement a high-performance FFT
# European Aeronautic Defence and Space Company (EADS).
#
# EADS added the legal notices below in the process of extracting
# and approving this file for use as a code sample. The grammar in
# the notices is theirs.
#
# I suggest you start reading by going to the first occurrence
# of "_fft_r4_1i". There you will find a thorough description of
# the function performed, followed by additional comments and
# declarations and then the code.
#
# There are two versions of the code in this module. The first is
# a straightforward implementation, written to illustrate the
# operations in a natural order. The second is a high-performance
# implementation, in which operations have been rearranged to perform
# as quickly as possible on the processor. This means that instructions
# operating on different sets of data are interwoven, and the flow of
# data is hard to follow.
#
# The hope is that another engineer can understand the illustration
# version and use it as a guide to understanding the high-performance
# version. In addition, any changes that are desired in the future can
# be developed relatively easily in the illustration version (which is
# fully functional except for speed), and then the new illustration
# version can be used as the basis for creating a new high-performance
# version.
#
# -- edp (Eric Postpischil)
##############################################################################
#
# EADS Deutschland GmbH is the sole owner of any exploitation rights of the
# copyright of this code.
# The code is supplied by EADS Deutschland GmbH on the express
# understanding that it is to be treated as confidential and that it
# may not be copied, used or disclosed to others in whole or in part
# for any purpose except as authorised in writing by EADS Deutschland GmbH.
#
# (c) EADS Deutschland GmbH 2003
#
# The code was generated as part of a contract fully payed by
# EADS Deutschland GmbH.
#
# EADS Deutschland GmbH grants the author "Eric Postpischil" the right
# to use the code as a code display only for the indication of reference.
# However, the code may not be made any use directed to any commercial ex-
# ploitation (including, but not limited to the sale, assignment, grants
# of rights, improvements, modifications) of the sample of the source code
# by the author or any other third person and in particular may it be used
# as part of a different contract.
#
##############################################################################
#
# Original Author: Eric Postpischil
#
##############################################################################
# Description
#
# This module contains a single high-performance FFT subroutine.
# See below.
#
##############################################################################
##============================================================================#
# Include header Files
#============================================================================##
.include "fft_interface.s"
##============================================================================#
# Macros and defines
#============================================================================##
##============================================================================#
# Private data
#============================================================================##
##============================================================================#
# Imported data
#============================================================================##
##============================================================================#
# Private functions
#============================================================================##
##============================================================================#
# Imported functions
#============================================================================##
##============================================================================#
# Function bodies
#============================================================================##
# The FFT routines must be placed near each other so that branch displacements
# will not exceed the field sizes in the branch instructions.
.section "fft_complex", "arx"
.balign 4
###############################################################################
#F
#F FUNCTION
#F
#F _fft_r4_1i
#F
#F PURPOSE
#F
#F High-performance subroutine to perform a radix-4 butterfly
#F with one weight per loop iteration.
#F
#F Let:
#F
#F h represent the input vector of complex numbers, which
#F is stored in the arrays of float dataR and dataI so that
#F h[k] = dataR[k] + dataI[k] * i,
#F
#F w[k] be weights[k].w1r + weights[k].w1r*weight[k].w1i * i
#F (given that weights is an array of CommonWeight), and
#F
#F H represent the output vector of complex numbers, which
#F is stored in the arrays of float dataR and dataI so that
#F H[k] = dataR[k] + dataI[k] * i.
#F
#F This routine calculates:
#F
#F for (k0 = 0; k0 < count; ++k0)
#F for (k1 = 0; k1 < 4; ++k1)
#F for (k2 = 0; k2 < 4; ++k2)
#F H[(k0*4 + k1) * 4 + k2] =
#F sum( 1^(j1*r(k1))
#F * w[k0]^j
#F * h[(k0*4 + j1) * 4 + k2],
#F 0 <= j1 < 4).
#F
#F provided that for each w[k0] used:
#F
#F weights[k0].w1r = Re(w[k0]),
#F weights[k0].w1i = Im(w[k0])/Re(w[k0]),
#F weights[k0].w2r = Re(w[k0]^2),
#F weights[k0].w2i = Im(w[k0]^2)/Re(w[k0]^2),
#F weights[k0].w3r = Re(w[k0]^3)/Re(w[k0]), and
#F weights[k0].w3i = Im(w[k0]^3)/Re(w[k0]^3).
#F
# [Note: Other project documentation explains that 1^x (used above) stands for
# exp(2*pi*i*x). See my _Construction of a High-Performance FFT_ paper.]
#F
#F These calculations are performed as though h and H are separate,
#F even though they use the same memory.
#F
#F INPUTS
#F
#F See fft_interface.s for documentation on the FFT subroutine interfaces.
#F
#F This routine uses these registers as input:
#F
#F dataR, dataI
#F Addresses of real and imaginary parts of
#F data, respectively. If this routine is used
#F on a part of the data, these are the addresses
#F of the part to be transformed, not the
#F addresses of the starts of the entire vectors.
#F
#F count
#F Number of iterations to perform. Each iteration
#F does four butterflies. Each butterfly transforms
#F four elements.
#F
#F weights
#F Address in CommonWeights. If this routine is
#F used on a part of the data, this is the address
#F of the start of the weights to be used, not the
#F address of the start of the entire table.
#F
#F w1r, w1i, w2r, w2i, w3r, w3i
#F Scratch registers to be changed by this routine.
#F
#F This routine uses these constants:
#F
#F BytesPerBlock
#F Offset in bytes between the AltiVec block containing
#F data for one set of four butterflies and the block
#F containing data for the next set of butterflies.
#F
#F SizeOfCommonWeight
#F Size in bytes of one structure of common weights.
#F (Six floating-point numbers.)
#F
#F Note that the address, offset, and count given as arguments
#F define locations in memory. The data in those locations is also
#F input to this routine.
#F
#F LOCAL DATA
#F None
#F
#F PROCESS CONTROL
#F None
#F
#F PROCESSING
#F
#F See the sample code below. This routine is documented by giving
#F a straightforward instruction sequence that performs the butterfly
#F calculations. This sequence is simple, commented, and fully
#F functional. Changing the expression in the .if statement from 0
#F to 1 will yield a module that produces exactly the same results
#F but not at high performance.
#F
#F The code in the .else section performs identical calculations but
#F uses duplicated and reordered instructions, unrolled loops, and
#F minor changes in address arithmetic needed to support the other
#F changes.
#F
#F UTILIZATION OF OTHER ELEMENTS
#F None
#F
#F LIMITATIONS
#F
#F The pointer into the weights must be aligned to an AltiVec
#F boundary, a multiple of 16 bytes. Since the weights are in
#F sets of six real numbers (which are four bytes each), this
#F means the pointer must point to an even-indexed set.
#F
#F This requirement is easily met by always performing an even
#F number of iterations. After performing an even number of
#F iterations, the weight pointer will be pointing to an even-indexed
#F set.
#F
#F OUTPUTS
#F
#F The input registers are modified:
#F
#F dataR, dataI
#F Unspecified modifications. Could perhaps be made
#F useful if desired, by being left pointing to the
#F following data.
#F
#F count
#F Unchanged.
#F
#F weights
#F Unspecified modifications. Could perhaps be made
#F useful if desired, by being left pointing to the
#F following weights.
#F
#F w1r, w1i, w2r, w2i, w3r, w3i
#F Unspecified modifications.
#F
#F The defined data in memory is transformed.
#F
#F RETURNS
#F None
#F
###############################################################################
.if 0
# This is a demonstration version of the routine. See comments
# under PROCESSING above.
.global _fft_r4_1i
_fft_r4_1i:
# Assign AltiVec registers to local symbol names.
# The naming convention is:
#
#
#
# starts at "a" for input data and advances to
# "b", "c", and "d" as calculations are performed.
#
# is 0, 1, 2, and 3 for the four elements needed
# for each butterfly.
#
# is "r" for real and "i" for imaginary.
#
# Thus b2i is the imaginary component of element 2 of the data
# in the second phase of the butterfly calculations.
.equiv a0r, volatileV0
.equiv a0i, volatileV1
.equiv a1r, volatileV2
.equiv a1i, volatileV3
.equiv a2r, volatileV4
.equiv a2i, volatileV5
.equiv a3r, volatileV6
.equiv a3i, volatileV7
.equiv b1r, volatileV10
.equiv b1i, volatileV11
.equiv b2r, volatileV12
.equiv b2i, volatileV13
.equiv b3r, volatileV14
.equiv b3i, volatileV15
.equiv c0r, volatileV16
.equiv c0i, volatileV17
.equiv c1r, volatileV18
.equiv c1i, volatileV19
.equiv c2r, volatileV20
.equiv c2i, volatileV21
.equiv c3r, volatileV22
.equiv c3i, volatileV23
.equiv d0r, volatileV0
.equiv d0i, volatileV1
.equiv d1r, volatileV2
.equiv d1i, volatileV3
.equiv d2r, volatileV4
.equiv d2i, volatileV5
.equiv d3r, volatileV6
.equiv d3i, volatileV7
.equiv t0, volatileV24
.equiv t1, volatileV25
# Assign some general registers to local symbol names.
.equiv LoadOffset0, 0 # Assigned to %r0 because always zero.
.equiv LoadOffset1, volatileR0
.equiv LoadOffset2, volatileR1
.equiv LoadOffset3, volatileR2
.equiv StoreOffset0, volatileR3
.equiv StoreOffset1, volatileR4
.equiv StoreOffset2, volatileR5
.equiv StoreOffset3, volatileR6
.equiv Minus16, volatileR7
# Set the byte offset between input elements for the butterfly.
li LoadOffset1, 1*BytesPerBlock
# Get the offset to the element 2 of the butterfly.
li LoadOffset2, 2*BytesPerBlock
# Set the number of iterations.
mtctr count
# Get the offset to element 3 of the butterfly.
li LoadOffset3, 3*BytesPerBlock
# Prepare the store offsets to be stage displacements from the
# load offsets. There is no stage displacement in a single-stage
# version, but these are different in the optimized version below.
addi StoreOffset0, LoadOffset0, -0*4*BytesPerBlock
addi StoreOffset1, LoadOffset1, -0*4*BytesPerBlock
addi StoreOffset2, LoadOffset2, -0*4*BytesPerBlock
addi StoreOffset3, LoadOffset3, -0*4*BytesPerBlock
# -16 is used as an offset when addressing weights. See below.
li Minus16, -16
# A complication in this routine is that the weights it must load are in
# groups of six, which is not good for AltiVec’s four-element registers.
# This routine requires that we start at an even-indexed weight set (see
# LIMITATIONS above), so the weight pointer points to a weight set beginning
# on an AltiVec boundary. For that set, we load the AltiVec block at the
# pointer and the block after it into registers t0 and t1. That gives us
# eight numbers. Using splat instructions, we get the six numbers we
# need, replicating each of them four times in its own register.
#
# Then we do one loop iteration (four butterflies). To do the next
# iteration, we need another weight set. We already have two of its numbers,
# left over in t1. So we load the next block from the weights into t0, and
# that gives us the other four. We splat those into weight registers, and
# we are ready for the next set of butterflies.
#
# Since we get the weights in two different ways, we have two copies of this
# loop with the different weight-loading instructions in each. We simply
# go through one and then the other repeatedly.
1:
# This code handles even iterations.
# Load the weights.
lvx t0, Zero, weights
lvx t1, Sixteen, weights
# Unpack the weights.
vspltw w1r, t0, 0
vspltw w1i, t0, 1
vspltw w2r, t0, 2
vspltw w2i, t0, 3
vspltw w3r, t1, 0
vspltw w3i, t1, 1
# Load the input data.
lvx a0r, LoadOffset0, dataR
lvx a0i, LoadOffset0, dataI
lvx a1r, LoadOffset1, dataR
lvx a1i, LoadOffset1, dataI
lvx a2r, LoadOffset2, dataR
lvx a2i, LoadOffset2, dataI
lvx a3r, LoadOffset3, dataR
lvx a3i, LoadOffset3, dataI
# Do the butterflies.
vnmsubfp b1r, a1i, w1i, a1r
vmaddfp b1i, a1r, w1i, a1i
vnmsubfp b2r, a2i, w2i, a2r
vmaddfp b2i, a2r, w2i, a2i
vnmsubfp b3r, a3i, w3i, a3r
vmaddfp b3i, a3r, w3i, a3i
vmaddfp c0r, b2r, w2r, a0r
vmaddfp c0i, b2i, w2r, a0i
vnmsubfp c2r, b2r, w2r, a0r
vnmsubfp c2i, b2i, w2r, a0i
vmaddfp c1r, b3r, w3r, b1r
vmaddfp c1i, b3i, w3r, b1i
vnmsubfp c3r, b3r, w3r, b1r
vnmsubfp c3i, b3i, w3r, b1i
vmaddfp d0r, c1r, w1r, c0r
vmaddfp d0i, c1i, w1r, c0i
vnmsubfp d1r, c1r, w1r, c0r
vnmsubfp d1i, c1i, w1r, c0i
vnmsubfp d2r, c3i, w1r, c2r
vmaddfp d2i, c3r, w1r, c2i
vmaddfp d3r, c3i, w1r, c2r
vnmsubfp d3i, c3r, w1r, c2i
stvx d0r, StoreOffset0, dataR
stvx d0i, StoreOffset0, dataI
stvx d1r, StoreOffset1, dataR
stvx d1i, StoreOffset1, dataI
stvx d2r, StoreOffset2, dataR
stvx d2i, StoreOffset2, dataI
stvx d3r, StoreOffset3, dataR
stvx d3i, StoreOffset3, dataI
addi dataR, dataR, 4*BytesPerBlock
addi dataI, dataI, 4*BytesPerBlock
# Decrement counter and branch out of loop if caller requested
# an odd number of iterations.
bdz 2f
# This code handles odd iterations.
# Advance the weight pointer. Note that we advance it by the size of
# two weight sets. This allows us to do only one addi instruction
# every two iterations, which is important in the high-performance
# version, but it complicates calculating the address of the
# weights. Having advanced the pointer beyond the weights we want,
# we need an offset of -16 to get those weights. Also note that
# 2*SizeOfCommonWeight must be a multiple of 16 bytes.
addi weights, weights, 2*SizeOfCommonWeight
# Load the weights. At this point, weights is pointing to the
# next set of weights beyond the set we want. We already have
# two numbers from the set in t1, because they are left over
# from the even iteration. We need to get the last four numbers
# of the set. They are four bytes each, so they are at an offset
# of -4*4 bytes from where the pointer points now.
lvx t0, Minus16, weights
# Unpack the weights.
vspltw w1r, t1, 2
vspltw w1i, t1, 3
vspltw w2r, t0, 0
vspltw w2i, t0, 1
vspltw w3r, t0, 2
vspltw w3i, t0, 3
# Load the input data.
lvx a0r, LoadOffset0, dataR
lvx a0i, LoadOffset0, dataI
lvx a1r, LoadOffset1, dataR
lvx a1i, LoadOffset1, dataI
lvx a2r, LoadOffset2, dataR
lvx a2i, LoadOffset2, dataI
lvx a3r, LoadOffset3, dataR
lvx a3i, LoadOffset3, dataI
vnmsubfp b1r, a1i, w1i, a1r
vmaddfp b1i, a1r, w1i, a1i
vnmsubfp b2r, a2i, w2i, a2r
vmaddfp b2i, a2r, w2i, a2i
vnmsubfp b3r, a3i, w3i, a3r
vmaddfp b3i, a3r, w3i, a3i
vmaddfp c0r, b2r, w2r, a0r
vmaddfp c0i, b2i, w2r, a0i
vnmsubfp c2r, b2r, w2r, a0r
vnmsubfp c2i, b2i, w2r, a0i
vmaddfp c1r, b3r, w3r, b1r
vmaddfp c1i, b3i, w3r, b1i
vnmsubfp c3r, b3r, w3r, b1r
vnmsubfp c3i, b3i, w3r, b1i
vmaddfp d0r, c1r, w1r, c0r
vmaddfp d0i, c1i, w1r, c0i
vnmsubfp d1r, c1r, w1r, c0r
vnmsubfp d1i, c1i, w1r, c0i
vnmsubfp d2r, c3i, w1r, c2r
vmaddfp d2i, c3r, w1r, c2i
vmaddfp d3r, c3i, w1r, c2r
vnmsubfp d3i, c3r, w1r, c2i
stvx d0r, StoreOffset0, dataR
stvx d0i, StoreOffset0, dataI
stvx d1r, StoreOffset1, dataR
stvx d1i, StoreOffset1, dataI
stvx d2r, StoreOffset2, dataR
stvx d2i, StoreOffset2, dataI
stvx d3r, StoreOffset3, dataR
stvx d3i, StoreOffset3, dataI
addi dataR, dataR, 4*BytesPerBlock
addi dataI, dataI, 4*BytesPerBlock
# Decrement the counter and go back to the even iteration.
bdnz 1b
2:
blr
.else
# Here is a high-performance version of the routine.
#
# The main loop runs in 31 cycles per iteration (62 per double iteration).
#
# For information about how this version of the routine was created
# from the straightforward version above, see the sections in
# ReadMe.html beginning with "The Structure of My Optimized Routines".
# This routine is complicated by having to deal with
# loading weights different for weights with even and odd
# indices. So there are two versions of the loop body, with
# slightly different instructions for loading the weights.
# This is described in the non-optimized version above and
# fortunately causes little problem here.
#
# Instructions that differ in even and odd iterations have
# been marked by adding "even" or "odd" to the comment on
# the line with the instruction. Most of those instructions
# just change form slightly, getting data from a different
# place in t0 or t1. However, two instructions are totally
# different in the two iterations:
#
# lvx t0, Zero, weights # S0 even
#
# and
#
# addi weights, weights, 2*SizeOfCommonWeight # S0 odd
.global _fft_r4_1i
_fft_r4_1i:
# Assign AltiVec registers to local symbol names.
# The naming convention is:
#
#
#
# starts at "a" for input data and advances to
# "b", "c", and "d" as calculations are performed.
#
# is 0, 1, 2, and 3 for the four elements needed
# for each butterfly.
#
# is "r" for real and "i" for imaginary.
#
# Thus b2i is the imaginary component of element 2 of the data
# in the second phase of the butterfly calculations.
.equiv a0r, volatileV0
.equiv a0i, volatileV1
.equiv a1r, volatileV2
.equiv a1i, volatileV3
.equiv a2r, volatileV4
.equiv a2i, volatileV5
.equiv a3r, volatileV6
.equiv a3i, volatileV7
.equiv b1r, volatileV8
.equiv b1i, volatileV9
.equiv b2r, volatileV10
.equiv b2i, volatileV11
.equiv b3r, volatileV12
.equiv b3i, volatileV13
.equiv c0r, volatileV14
.equiv c0i, volatileV15
.equiv c1r, volatileV16
.equiv c1i, volatileV17
.equiv c2r, volatileV18
.equiv c2i, volatileV19
.equiv c3r, b1r
.equiv c3i, b1i
.equiv d0r, volatileV20
.equiv d0i, volatileV21
.equiv d1r, volatileV22
.equiv d1i, volatileV23
.equiv d2r, volatileV20
.equiv d2i, volatileV21
.equiv d3r, volatileV22
.equiv d3i, volatileV23
.equiv t0, volatileV24
.equiv t1, volatileV25
# Assign some general registers to local symbol names.
.equiv LoadOffset0, 0 # Assigned to %r0 because always zero.
.equiv LoadOffset1, volatileR0
.equiv LoadOffset2, volatileR1
.equiv LoadOffset3, volatileR2
.equiv StoreOffset0, volatileR3
.equiv StoreOffset1, volatileR4
.equiv StoreOffset2, volatileR5
.equiv StoreOffset3, volatileR6
.equiv Minus16, volatileR7
# Note: For the in-cache routine, this routine is called only for
# the entire vector, so the first iteration is always for the first
# butterfly of the past, which has weight one. Part of that iteration
# is performed in the prologue, separate from other iterations, so
# it could be changed to omit some weight operations and save a few
# cycles. (Very few, maybe six or so out of the entire FFT.)
# Prologue, stage 0 for an even iteration.
lvx t0, Zero, weights # S0 even
li LoadOffset2, 2*BytesPerBlock
lvx t1, Sixteen, weights # S0 even
li LoadOffset1, 1*BytesPerBlock
lvx a2r, LoadOffset2, dataR # S0
li LoadOffset3, 3*BytesPerBlock
lvx a2i, LoadOffset2, dataI # S0
mtctr count
lvx a1r, LoadOffset1, dataR # S0
vspltw w2i, t0, 3 # S0 even
lvx a1i, LoadOffset1, dataI # S0
vspltw w1i, t0, 1 # S0 even
lvx a3r, LoadOffset3, dataR # S0
vspltw w2r, t0, 2 # S0 even
lvx a3i, LoadOffset3, dataI # S0
vspltw w3i, t1, 1 # S0 even
lvx a0r, LoadOffset0, dataR # S0
vspltw w3r, t1, 0 # S0 even
lvx a0i, LoadOffset0, dataI # S0
vnmsubfp b2r, a2i, w2i, a2r # S0
vmaddfp b2i, a2r, w2i, a2i # S0
vnmsubfp b1r, a1i, w1i, a1r # S0
addi StoreOffset0, LoadOffset0, -1*4*BytesPerBlock
vmaddfp b1i, a1r, w1i, a1i # S0
addi StoreOffset1, LoadOffset1, -1*4*BytesPerBlock
vnmsubfp b3r, a3i, w3i, a3r # S0
addi StoreOffset2, LoadOffset2, -1*4*BytesPerBlock
vmaddfp b3i, a3r, w3i, a3i # S0
addi StoreOffset3, LoadOffset3, -1*4*BytesPerBlock
vmaddfp c0r, b2r, w2r, a0r # S0
li Minus16, -16
vmaddfp c0i, b2i, w2r, a0i # S0
bdz 2f
.balign 8
# Main loop.
# Stage 0 is in an odd iteration, stage 1 is in an even iteration.
1:
vmaddfp c1r, b3r, w3r, b1r # S1
vmaddfp c1i, b3i, w3r, b1i # S1
addi weights, weights, 2*SizeOfCommonWeight # S0 odd
vnmsubfp c2r, b2r, w2r, a0r # S1
vnmsubfp c2i, b2i, w2r, a0i # S1
addi dataR, dataR, 4*BytesPerBlock
vnmsubfp c3r, b3r, w3r, b1r # S1
vnmsubfp c3i, b3i, w3r, b1i # S1
vspltw w1r, t0, 0 # S1 even
lvx t0, Minus16, weights # S0 odd
addi dataI, dataI, 4*BytesPerBlock
lvx a2r, LoadOffset2, dataR # S0
lvx a2i, LoadOffset2, dataI # S0
lvx a1r, LoadOffset1, dataR # S0
vspltw w1i, t1, 3 # S0 odd
lvx a1i, LoadOffset1, dataI # S0
vspltw w3i, t0, 3 # S0 odd
lvx a3r, LoadOffset3, dataR # S0
vspltw w3r, t0, 2 # S0 odd
lvx a3i, LoadOffset3, dataI # S0
vspltw w2i, t0, 1 # S0 odd
lvx a0r, LoadOffset0, dataR # S0
vspltw w2r, t0, 0 # S0 odd
lvx a0i, LoadOffset0, dataI # S0
vmaddfp d0r, c1r, w1r, c0r # S1
stvx d0r, StoreOffset0, dataR # S1
vmaddfp d0i, c1i, w1r, c0i # S1
vnmsubfp d1r, c1r, w1r, c0r # S1
stvx d0i, StoreOffset0, dataI # S1
vnmsubfp d1i, c1i, w1r, c0i # S1
vnmsubfp d2r, c3i, w1r, c2r # S1
stvx d1r, StoreOffset1, dataR # S1
vmaddfp d2i, c3r, w1r, c2i # S1
vmaddfp d3r, c3i, w1r, c2r # S1
stvx d1i, StoreOffset1, dataI # S1
vnmsubfp d3i, c3r, w1r, c2i # S1
vnmsubfp b2r, a2i, w2i, a2r # S0
stvx d2r, StoreOffset2, dataR # S1
vmaddfp b2i, a2r, w2i, a2i # S0
vnmsubfp b1r, a1i, w1i, a1r # S0
stvx d2i, StoreOffset2, dataI # S1
vmaddfp b1i, a1r, w1i, a1i # S0
vnmsubfp b3r, a3i, w3i, a3r # S0
stvx d3r, StoreOffset3, dataR # S1
vmaddfp b3i, a3r, w3i, a3i # S0
vmaddfp c0r, b2r, w2r, a0r # S0
stvx d3i, StoreOffset3, dataI # S1
vmaddfp c0i, b2i, w2r, a0i # S0
bdz 3f
# Stage 0 is in an even iteration, stage 1 is in an odd iteration.
vmaddfp c1r, b3r, w3r, b1r # S1
vmaddfp c1i, b3i, w3r, b1i # S1
lvx t0, Zero, weights # S0 even
vnmsubfp c2r, b2r, w2r, a0r # S1
vnmsubfp c2i, b2i, w2r, a0i # S1
addi dataR, dataR, 4*BytesPerBlock
vnmsubfp c3r, b3r, w3r, b1r # S1
vnmsubfp c3i, b3i, w3r, b1i # S1
vspltw w1r, t1, 2 # S1 odd
lvx t1, Sixteen, weights # S0 even
addi dataI, dataI, 4*BytesPerBlock
lvx a2r, LoadOffset2, dataR # S0
lvx a2i, LoadOffset2, dataI # S0
lvx a1r, LoadOffset1, dataR # S0
vspltw w2i, t0, 3 # S0 even
lvx a1i, LoadOffset1, dataI # S0
vspltw w1i, t0, 1 # S0 even
lvx a3r, LoadOffset3, dataR # S0
vspltw w2r, t0, 2 # S0 even
lvx a3i, LoadOffset3, dataI # S0
vspltw w3i, t1, 1 # S0 even
lvx a0r, LoadOffset0, dataR # S0
vspltw w3r, t1, 0 # S0 even
lvx a0i, LoadOffset0, dataI # S0
vmaddfp d0r, c1r, w1r, c0r # S1
stvx d0r, StoreOffset0, dataR # S1
vmaddfp d0i, c1i, w1r, c0i # S1
vnmsubfp d1r, c1r, w1r, c0r # S1
stvx d0i, StoreOffset0, dataI # S1
vnmsubfp d1i, c1i, w1r, c0i # S1
vnmsubfp d2r, c3i, w1r, c2r # S1
stvx d1r, StoreOffset1, dataR # S1
vmaddfp d2i, c3r, w1r, c2i # S1
vmaddfp d3r, c3i, w1r, c2r # S1
stvx d1i, StoreOffset1, dataI # S1
vnmsubfp d3i, c3r, w1r, c2i # S1
vnmsubfp b2r, a2i, w2i, a2r # S0
stvx d2r, StoreOffset2, dataR # S1
vmaddfp b2i, a2r, w2i, a2i # S0
vnmsubfp b1r, a1i, w1i, a1r # S0
stvx d2i, StoreOffset2, dataI # S1
vmaddfp b1i, a1r, w1i, a1i # S0
vnmsubfp b3r, a3i, w3i, a3r # S0
stvx d3r, StoreOffset3, dataR # S1
vmaddfp b3i, a3r, w3i, a3i # S0
vmaddfp c0r, b2r, w2r, a0r # S0
stvx d3i, StoreOffset3, dataI # S1
vmaddfp c0i, b2i, w2r, a0i # S0
bdnz 1b
# Epilogue, stage 1 for an even iteration.
2:
vmaddfp c1r, b3r, w3r, b1r # S1
vmaddfp c1i, b3i, w3r, b1i # S1
vspltw w1r, t0, 0 # S1 even
# The remaining code is the same for both epilogues.
b 4f
# Epilogue, stage 1 for an odd iteration.
3:
vmaddfp c1r, b3r, w3r, b1r # S1
vmaddfp c1i, b3i, w3r, b1i # S1
vspltw w1r, t1, 2 # S1 odd
4:
vnmsubfp c2r, b2r, w2r, a0r # S1
vnmsubfp c2i, b2i, w2r, a0i # S1
addi dataR, dataR, 4*BytesPerBlock
vnmsubfp c3r, b3r, w3r, b1r # S1
vnmsubfp c3i, b3i, w3r, b1i # S1
addi dataI, dataI, 4*BytesPerBlock
vmaddfp d0r, c1r, w1r, c0r # S1
stvx d0r, StoreOffset0, dataR # S1
vmaddfp d0i, c1i, w1r, c0i # S1
stvx d0i, StoreOffset0, dataI # S1
vnmsubfp d1r, c1r, w1r, c0r # S1
stvx d1r, StoreOffset1, dataR # S1
vnmsubfp d1i, c1i, w1r, c0i # S1
stvx d1i, StoreOffset1, dataI # S1
vnmsubfp d2r, c3i, w1r, c2r # S1
stvx d2r, StoreOffset2, dataR # S1
vmaddfp d2i, c3r, w1r, c2i # S1
stvx d2i, StoreOffset2, dataI # S1
vmaddfp d3r, c3i, w1r, c2r # S1
stvx d3r, StoreOffset3, dataR # S1
vnmsubfp d3i, c3r, w1r, c2i # S1
stvx d3i, StoreOffset3, dataI # S1
blr
.endif