#   Introduction from the Author
#
#   This file contains a sample of code produced by Eric Postpischil
#   during a project to design and implement a high-performance FFT
#   European Aeronautic Defence and Space Company (EADS).
#
#   EADS added the legal notices below in the process of extracting
#   and approving this file for use as a code sample.  The grammar in
#   the notices is theirs.
#
#   I suggest you start reading by going to the first occurrence
#   of "_fft_r4_1i".  There you will find a thorough description of
#   the function performed, followed by additional comments and
#   declarations and then the code.
#
#   There are two versions of the code in this module.  The first is
#   a straightforward implementation, written to illustrate the
#   operations in a natural order.  The second is a high-performance
#   implementation, in which operations have been rearranged to perform
#   as quickly as possible on the processor.  This means that instructions
#   operating on different sets of data are interwoven, and the flow of
#   data is hard to follow.
#
#   The hope is that another engineer can understand the illustration
#   version and use it as a guide to understanding the high-performance
#   version.  In addition, any changes that are desired in the future can
#   be developed relatively easily in the illustration version (which is
#   fully functional except for speed), and then the new illustration
#   version can be used as the basis for creating a new high-performance
#   version.
#
#               -- edp (Eric Postpischil)


##############################################################################
#
# EADS Deutschland GmbH is the sole owner of any exploitation rights of the
# copyright of this code.
# The code is supplied by EADS Deutschland GmbH on the express
# understanding that it is to be treated as confidential and that it
# may not be copied, used or disclosed to others in whole or in part
# for any purpose except as authorised in writing by EADS Deutschland GmbH.
#
# (c) EADS Deutschland GmbH 2003
#
# The code was generated as part of a contract fully payed by
# EADS Deutschland GmbH.
#
# EADS Deutschland GmbH grants the author "Eric Postpischil" the right
# to use the code as a code display only for the indication of reference.
# However, the code may not be made any use directed to any commercial ex-
# ploitation (including, but not limited to the sale, assignment, grants
# of rights, improvements, modifications) of the sample of the source code
# by the author or any other third person and in particular may it be used
# as part of a different contract.
#
##############################################################################
#
# Original Author:  Eric Postpischil
#
##############################################################################
# Description
#
#   This module contains a single high-performance FFT subroutine.
#   See below.
#
##############################################################################


##============================================================================#
 # Include header Files
 #============================================================================##

    .include    "fft_interface.s"

##============================================================================#
 # Macros and defines
 #============================================================================##


##============================================================================#
 # Private data
 #============================================================================##


##============================================================================#
 # Imported data
 #============================================================================##


##============================================================================#
 # Private functions
 #============================================================================##


##============================================================================#
 # Imported functions
 #============================================================================##


##============================================================================#
 # Function bodies
#============================================================================##


# The FFT routines must be placed near each other so that branch displacements
# will not exceed the field sizes in the branch instructions.
    .section    "fft_complex", "arx"
    .balign 4


###############################################################################
#F
#F  FUNCTION
#F
#F      _fft_r4_1i
#F
#F  PURPOSE
#F
#F      High-performance subroutine to perform a radix-4 butterfly
#F      with one weight per loop iteration.
#F
#F      Let:
#F
#F          h represent the input vector of complex numbers, which
#F          is stored in the arrays of float dataR and dataI so that
#F          h[k] = dataR[k] + dataI[k] * i,
#F
#F          w[k] be weights[k].w1r + weights[k].w1r*weight[k].w1i * i
#F          (given that weights is an array of CommonWeight), and
#F
#F          H represent the output vector of complex numbers, which
#F          is stored in the arrays of float dataR and dataI so that
#F          H[k] = dataR[k] + dataI[k] * i.
#F
#F      This routine calculates:
#F
#F          for (k0 = 0; k0 < count; ++k0)
#F          for (k1 = 0; k1 < 4; ++k1)
#F          for (k2 = 0; k2 < 4; ++k2)
#F              H[(k0*4 + k1) * 4 + k2] =
#F                  sum(    1^(j1*r(k1))
#F                      * w[k0]^j
#F                      * h[(k0*4 + j1) * 4 + k2],
#F                          0 <= j1 < 4).
#F
#F      provided that for each w[k0] used:
#F
#F          weights[k0].w1r = Re(w[k0]),
#F          weights[k0].w1i = Im(w[k0])/Re(w[k0]),
#F          weights[k0].w2r = Re(w[k0]^2),
#F          weights[k0].w2i = Im(w[k0]^2)/Re(w[k0]^2),
#F          weights[k0].w3r = Re(w[k0]^3)/Re(w[k0]), and
#F          weights[k0].w3i = Im(w[k0]^3)/Re(w[k0]^3).
#F
# [Note:  Other project documentation explains that 1^x (used above) stands for
# exp(2*pi*i*x).  See my _Construction of a High-Performance FFT_ paper.]
#F
#F      These calculations are performed as though h and H are separate,
#F      even though they use the same memory.
#F
#F  INPUTS
#F
#F      See fft_interface.s for documentation on the FFT subroutine interfaces.
#F
#F      This routine uses these registers as input:
#F
#F      dataR, dataI
#F          Addresses of real and imaginary parts of
#F          data, respectively. If this routine is used
#F          on a part of the data, these are the addresses
#F          of the part to be transformed, not the
#F          addresses of the starts of the entire vectors.
#F
#F      count
#F          Number of iterations to perform. Each iteration
#F          does four butterflies. Each butterfly transforms
#F          four elements.
#F
#F      weights
#F          Address in CommonWeights. If this routine is
#F          used on a part of the data, this is the address
#F          of the start of the weights to be used, not the
#F          address of the start of the entire table.
#F
#F      w1r, w1i, w2r, w2i, w3r, w3i
#F          Scratch registers to be changed by this routine.
#F
#F  This routine uses these constants:
#F
#F      BytesPerBlock
#F          Offset in bytes between the AltiVec block containing
#F          data for one set of four butterflies and the block
#F          containing data for the next set of butterflies.
#F
#F      SizeOfCommonWeight
#F          Size in bytes of one structure of common weights.
#F          (Six floating-point numbers.)
#F
#F      Note that the address, offset, and count given as arguments
#F      define locations in memory. The data in those locations is also
#F      input to this routine.
#F
#F  LOCAL DATA
#F      None
#F
#F  PROCESS CONTROL
#F      None
#F
#F  PROCESSING
#F
#F      See the sample code below. This routine is documented by giving
#F      a straightforward instruction sequence that performs the butterfly
#F      calculations. This sequence is simple, commented, and fully
#F      functional. Changing the expression in the .if statement from 0
#F      to 1 will yield a module that produces exactly the same results
#F      but not at high performance.
#F
#F      The code in the .else section performs identical calculations but
#F      uses duplicated and reordered instructions, unrolled loops, and
#F      minor changes in address arithmetic needed to support the other
#F      changes.
#F
#F  UTILIZATION OF OTHER ELEMENTS
#F      None
#F
#F  LIMITATIONS
#F
#F      The pointer into the weights must be aligned to an AltiVec
#F      boundary, a multiple of 16 bytes. Since the weights are in
#F      sets of six real numbers (which are four bytes each), this
#F      means the pointer must point to an even-indexed set.
#F
#F      This requirement is easily met by always performing an even
#F      number of iterations. After performing an even number of
#F      iterations, the weight pointer will be pointing to an even-indexed
#F      set.
#F
#F  OUTPUTS
#F
#F      The input registers are modified:
#F
#F      dataR, dataI
#F          Unspecified modifications. Could perhaps be made
#F          useful if desired, by being left pointing to the
#F          following data.
#F
#F      count
#F          Unchanged.
#F
#F      weights
#F          Unspecified modifications. Could perhaps be made
#F          useful if desired, by being left pointing to the
#F          following weights.
#F
#F      w1r, w1i, w2r, w2i, w3r, w3i
#F          Unspecified modifications.
#F
#F      The defined data in memory is transformed.
#F
#F  RETURNS
#F      None
#F
###############################################################################


    .if 0


    # This is a demonstration version of the routine. See comments
    # under PROCESSING above.

    .global _fft_r4_1i
_fft_r4_1i:

    # Assign AltiVec registers to local symbol names.

    # The naming convention is:
    #
    #   <phase><wing><component>
    #
    #   <phase> starts at "a" for input data and advances to
    #   "b", "c", and "d" as calculations are performed.
    #
    #   <wing> is 0, 1, 2, and 3 for the four elements needed
    #   for each butterfly.
    #
    #   <component> is "r" for real and "i" for imaginary.
    #
    # Thus b2i is the imaginary component of element 2 of the data
    # in the second phase of the butterfly calculations.

    .equiv  a0r,    volatileV0
    .equiv  a0i,    volatileV1
    .equiv  a1r,    volatileV2
    .equiv  a1i,    volatileV3
    .equiv  a2r,    volatileV4
    .equiv  a2i,    volatileV5
    .equiv  a3r,    volatileV6
    .equiv  a3i,    volatileV7
    .equiv  b1r,    volatileV10
    .equiv  b1i,    volatileV11
    .equiv  b2r,    volatileV12
    .equiv  b2i,    volatileV13
    .equiv  b3r,    volatileV14
    .equiv  b3i,    volatileV15
    .equiv  c0r,    volatileV16
    .equiv  c0i,    volatileV17
    .equiv  c1r,    volatileV18
    .equiv  c1i,    volatileV19
    .equiv  c2r,    volatileV20
    .equiv  c2i,    volatileV21
    .equiv  c3r,    volatileV22
    .equiv  c3i,    volatileV23
    .equiv  d0r,    volatileV0
    .equiv  d0i,    volatileV1
    .equiv  d1r,    volatileV2
    .equiv  d1i,    volatileV3
    .equiv  d2r,    volatileV4
    .equiv  d2i,    volatileV5
    .equiv  d3r,    volatileV6
    .equiv  d3i,    volatileV7

    .equiv  t0,     volatileV24
    .equiv  t1,     volatileV25

    # Assign some general registers to local symbol names.
    .equiv  LoadOffset0,    0           # Assigned to %r0 because always zero.
    .equiv  LoadOffset1,    volatileR0
    .equiv  LoadOffset2,    volatileR1
    .equiv  LoadOffset3,    volatileR2
    .equiv  StoreOffset0,   volatileR3
    .equiv  StoreOffset1,   volatileR4
    .equiv  StoreOffset2,   volatileR5
    .equiv  StoreOffset3,   volatileR6
    .equiv  Minus16,        volatileR7

    # Set the byte offset between input elements for the butterfly.
    li  LoadOffset1, 1*BytesPerBlock

    # Get the offset to the element 2 of the butterfly.
    li  LoadOffset2, 2*BytesPerBlock

    # Set the number of iterations.
    mtctr   count

    # Get the offset to element 3 of the butterfly.
    li  LoadOffset3, 3*BytesPerBlock

    # Prepare the store offsets to be stage displacements from the
    # load offsets. There is no stage displacement in a single-stage
    # version, but these are different in the optimized version below.
    addi    StoreOffset0, LoadOffset0, -0*4*BytesPerBlock
    addi    StoreOffset1, LoadOffset1, -0*4*BytesPerBlock
    addi    StoreOffset2, LoadOffset2, -0*4*BytesPerBlock
    addi    StoreOffset3, LoadOffset3, -0*4*BytesPerBlock

    # -16 is used as an offset when addressing weights. See below.
    li  Minus16, -16

# A complication in this routine is that the weights it must load are in
# groups of six, which is not good for AltiVec’s four-element registers.
# This routine requires that we start at an even-indexed weight set (see
# LIMITATIONS above), so the weight pointer points to a weight set beginning
# on an AltiVec boundary. For that set, we load the AltiVec block at the
# pointer and the block after it into registers t0 and t1. That gives us
# eight numbers. Using splat instructions, we get the six numbers we
# need, replicating each of them four times in its own register.
#
# Then we do one loop iteration (four butterflies). To do the next
# iteration, we need another weight set. We already have two of its numbers,
# left over in t1. So we load the next block from the weights into t0, and
# that gives us the other four. We splat those into weight registers, and
# we are ready for the next set of butterflies.
#
# Since we get the weights in two different ways, we have two copies of this
# loop with the different weight-loading instructions in each. We simply
# go through one and then the other repeatedly.

1:
# This code handles even iterations.

    # Load the weights.
    lvx t0, Zero,       weights
    lvx t1, Sixteen,    weights

    # Unpack the weights.
    vspltw  w1r, t0, 0
    vspltw  w1i, t0, 1
    vspltw  w2r, t0, 2
    vspltw  w2i, t0, 3
    vspltw  w3r, t1, 0
    vspltw  w3i, t1, 1

    # Load the input data.
    lvx a0r, LoadOffset0, dataR
    lvx a0i, LoadOffset0, dataI
    lvx a1r, LoadOffset1, dataR
    lvx a1i, LoadOffset1, dataI
    lvx a2r, LoadOffset2, dataR
    lvx a2i, LoadOffset2, dataI
    lvx a3r, LoadOffset3, dataR
    lvx a3i, LoadOffset3, dataI

    # Do the butterflies.
    vnmsubfp    b1r, a1i, w1i, a1r
    vmaddfp     b1i, a1r, w1i, a1i
    vnmsubfp    b2r, a2i, w2i, a2r
    vmaddfp     b2i, a2r, w2i, a2i
    vnmsubfp    b3r, a3i, w3i, a3r
    vmaddfp     b3i, a3r, w3i, a3i

    vmaddfp     c0r, b2r, w2r, a0r
    vmaddfp     c0i, b2i, w2r, a0i
    vnmsubfp    c2r, b2r, w2r, a0r
    vnmsubfp    c2i, b2i, w2r, a0i
    vmaddfp     c1r, b3r, w3r, b1r
    vmaddfp     c1i, b3i, w3r, b1i
    vnmsubfp    c3r, b3r, w3r, b1r
    vnmsubfp    c3i, b3i, w3r, b1i

    vmaddfp     d0r, c1r, w1r, c0r
    vmaddfp     d0i, c1i, w1r, c0i
    vnmsubfp    d1r, c1r, w1r, c0r
    vnmsubfp    d1i, c1i, w1r, c0i
    vnmsubfp    d2r, c3i, w1r, c2r
    vmaddfp     d2i, c3r, w1r, c2i
    vmaddfp     d3r, c3i, w1r, c2r
    vnmsubfp    d3i, c3r, w1r, c2i

    stvx    d0r, StoreOffset0, dataR
    stvx    d0i, StoreOffset0, dataI
    stvx    d1r, StoreOffset1, dataR
    stvx    d1i, StoreOffset1, dataI
    stvx    d2r, StoreOffset2, dataR
    stvx    d2i, StoreOffset2, dataI
    stvx    d3r, StoreOffset3, dataR
    stvx    d3i, StoreOffset3, dataI

    addi    dataR, dataR, 4*BytesPerBlock
    addi    dataI, dataI, 4*BytesPerBlock

    # Decrement counter and branch out of loop if caller requested
    # an odd number of iterations.
    bdz 2f

# This code handles odd iterations.

    # Advance the weight pointer. Note that we advance it by the size of
    # two weight sets. This allows us to do only one addi instruction
    # every two iterations, which is important in the high-performance
    # version, but it complicates calculating the address of the
    # weights. Having advanced the pointer beyond the weights we want,
    # we need an offset of -16 to get those weights. Also note that
    # 2*SizeOfCommonWeight must be a multiple of 16 bytes.
    addi    weights, weights, 2*SizeOfCommonWeight

    # Load the weights. At this point, weights is pointing to the
    # next set of weights beyond the set we want. We already have
    # two numbers from the set in t1, because they are left over
    # from the even iteration. We need to get the last four numbers
    # of the set. They are four bytes each, so they are at an offset
    # of -4*4 bytes from where the pointer points now.
    lvx t0, Minus16, weights

    # Unpack the weights.
    vspltw  w1r, t1, 2
    vspltw  w1i, t1, 3
    vspltw  w2r, t0, 0
    vspltw  w2i, t0, 1
    vspltw  w3r, t0, 2
    vspltw  w3i, t0, 3

    # Load the input data.
    lvx a0r, LoadOffset0, dataR
    lvx a0i, LoadOffset0, dataI
    lvx a1r, LoadOffset1, dataR
    lvx a1i, LoadOffset1, dataI
    lvx a2r, LoadOffset2, dataR
    lvx a2i, LoadOffset2, dataI
    lvx a3r, LoadOffset3, dataR
    lvx a3i, LoadOffset3, dataI

    vnmsubfp    b1r, a1i, w1i, a1r
    vmaddfp     b1i, a1r, w1i, a1i
    vnmsubfp    b2r, a2i, w2i, a2r
    vmaddfp     b2i, a2r, w2i, a2i
    vnmsubfp    b3r, a3i, w3i, a3r
    vmaddfp     b3i, a3r, w3i, a3i

    vmaddfp     c0r, b2r, w2r, a0r
    vmaddfp     c0i, b2i, w2r, a0i
    vnmsubfp    c2r, b2r, w2r, a0r
    vnmsubfp    c2i, b2i, w2r, a0i
    vmaddfp     c1r, b3r, w3r, b1r
    vmaddfp     c1i, b3i, w3r, b1i
    vnmsubfp    c3r, b3r, w3r, b1r
    vnmsubfp    c3i, b3i, w3r, b1i

    vmaddfp     d0r, c1r, w1r, c0r
    vmaddfp     d0i, c1i, w1r, c0i
    vnmsubfp    d1r, c1r, w1r, c0r
    vnmsubfp    d1i, c1i, w1r, c0i
    vnmsubfp    d2r, c3i, w1r, c2r
    vmaddfp     d2i, c3r, w1r, c2i
    vmaddfp     d3r, c3i, w1r, c2r
    vnmsubfp    d3i, c3r, w1r, c2i

    stvx    d0r, StoreOffset0, dataR
    stvx    d0i, StoreOffset0, dataI
    stvx    d1r, StoreOffset1, dataR
    stvx    d1i, StoreOffset1, dataI
    stvx    d2r, StoreOffset2, dataR
    stvx    d2i, StoreOffset2, dataI
    stvx    d3r, StoreOffset3, dataR
    stvx    d3i, StoreOffset3, dataI

    addi    dataR, dataR, 4*BytesPerBlock
    addi    dataI, dataI, 4*BytesPerBlock

    # Decrement the counter and go back to the even iteration.
    bdnz    1b
2:

    blr


    .else


# Here is a high-performance version of the routine.
#
# The main loop runs in 31 cycles per iteration (62 per double iteration).
#
# For information about how this version of the routine was created
# from the straightforward version above, see the sections in
# ReadMe.html beginning with "The Structure of My Optimized Routines".

#   This routine is complicated by having to deal with
#   loading weights different for weights with even and odd
#   indices. So there are two versions of the loop body, with
#   slightly different instructions for loading the weights.
#   This is described in the non-optimized version above and
#   fortunately causes little problem here.
#
#   Instructions that differ in even and odd iterations have
#   been marked by adding "even" or "odd" to the comment on
#   the line with the instruction. Most of those instructions
#   just change form slightly, getting data from a different
#   place in t0 or t1. However, two instructions are totally
#   different in the two iterations:
#
#   lvx t0, Zero, weights                       # S0 even
#
#   and
#
#   addi weights, weights, 2*SizeOfCommonWeight # S0 odd

    .global _fft_r4_1i
_fft_r4_1i:

    # Assign AltiVec registers to local symbol names.

    # The naming convention is:
    #
    #   <phase><wing><component>
    #
    #   <phase> starts at "a" for input data and advances to
    #   "b", "c", and "d" as calculations are performed.
    #
    #   <wing> is 0, 1, 2, and 3 for the four elements needed
    #   for each butterfly.
    #
    #   <component> is "r" for real and "i" for imaginary.
    #
    # Thus b2i is the imaginary component of element 2 of the data
    # in the second phase of the butterfly calculations.

    .equiv  a0r,    volatileV0
    .equiv  a0i,    volatileV1
    .equiv  a1r,    volatileV2
    .equiv  a1i,    volatileV3
    .equiv  a2r,    volatileV4
    .equiv  a2i,    volatileV5
    .equiv  a3r,    volatileV6
    .equiv  a3i,    volatileV7
    .equiv  b1r,    volatileV8
    .equiv  b1i,    volatileV9
    .equiv  b2r,    volatileV10
    .equiv  b2i,    volatileV11
    .equiv  b3r,    volatileV12
    .equiv  b3i,    volatileV13
    .equiv  c0r,    volatileV14
    .equiv  c0i,    volatileV15
    .equiv  c1r,    volatileV16
    .equiv  c1i,    volatileV17
    .equiv  c2r,    volatileV18
    .equiv  c2i,    volatileV19
    .equiv  c3r,    b1r
    .equiv  c3i,    b1i
    .equiv  d0r,    volatileV20
    .equiv  d0i,    volatileV21
    .equiv  d1r,    volatileV22
    .equiv  d1i,    volatileV23
    .equiv  d2r,    volatileV20
    .equiv  d2i,    volatileV21
    .equiv  d3r,    volatileV22
    .equiv  d3i,    volatileV23

    .equiv  t0,     volatileV24
    .equiv  t1,     volatileV25

    # Assign some general registers to local symbol names.
    .equiv  LoadOffset0,    0           # Assigned to %r0 because always zero.
    .equiv  LoadOffset1,    volatileR0
    .equiv  LoadOffset2,    volatileR1
    .equiv  LoadOffset3,    volatileR2
    .equiv  StoreOffset0,   volatileR3
    .equiv  StoreOffset1,   volatileR4
    .equiv  StoreOffset2,   volatileR5
    .equiv  StoreOffset3,   volatileR6
    .equiv  Minus16,        volatileR7


# Note: For the in-cache routine, this routine is called only for
# the entire vector, so the first iteration is always for the first
# butterfly of the past, which has weight one. Part of that iteration
# is performed in the prologue, separate from other iterations, so
# it could be changed to omit some weight operations and save a few
# cycles. (Very few, maybe six or so out of the entire FFT.)

# Prologue, stage 0 for an even iteration.
    lvx t0, Zero, weights                           # S0 even
    li  LoadOffset2, 2*BytesPerBlock

    lvx t1, Sixteen, weights                        # S0 even
    li  LoadOffset1, 1*BytesPerBlock

    lvx a2r, LoadOffset2, dataR                     # S0
    li  LoadOffset3, 3*BytesPerBlock

    lvx a2i, LoadOffset2, dataI                     # S0
    mtctr   count

    lvx a1r, LoadOffset1, dataR                     # S0
    vspltw  w2i, t0, 3                              # S0 even
    lvx a1i, LoadOffset1, dataI                     # S0
    vspltw  w1i, t0, 1                              # S0 even
    lvx a3r, LoadOffset3, dataR                     # S0
    vspltw  w2r, t0, 2                              # S0 even
    lvx a3i, LoadOffset3, dataI                     # S0
    vspltw  w3i, t1, 1                              # S0 even
    lvx a0r, LoadOffset0, dataR                     # S0
    vspltw  w3r, t1, 0                              # S0 even
    lvx a0i, LoadOffset0, dataI                     # S0
    vnmsubfp    b2r, a2i, w2i, a2r                  # S0
    vmaddfp     b2i, a2r, w2i, a2i                  # S0
    vnmsubfp    b1r, a1i, w1i, a1r                  # S0
    addi    StoreOffset0, LoadOffset0, -1*4*BytesPerBlock
    vmaddfp     b1i, a1r, w1i, a1i                  # S0
    addi    StoreOffset1, LoadOffset1, -1*4*BytesPerBlock
    vnmsubfp    b3r, a3i, w3i, a3r                  # S0
    addi    StoreOffset2, LoadOffset2, -1*4*BytesPerBlock
    vmaddfp     b3i, a3r, w3i, a3i                  # S0
    addi    StoreOffset3, LoadOffset3, -1*4*BytesPerBlock
    vmaddfp     c0r, b2r, w2r, a0r                  # S0
    li  Minus16, -16
    vmaddfp     c0i, b2i, w2r, a0i                  # S0


    bdz 2f

    .balign 8
# Main loop.

# Stage 0 is in an odd iteration, stage 1 is in an even iteration.
1:
    vmaddfp     c1r, b3r, w3r, b1r                      # S1
    vmaddfp     c1i, b3i, w3r, b1i                      # S1
    addi    weights, weights, 2*SizeOfCommonWeight  # S0 odd
    vnmsubfp    c2r, b2r, w2r, a0r                      # S1
    vnmsubfp    c2i, b2i, w2r, a0i                      # S1
    addi    dataR, dataR, 4*BytesPerBlock
    vnmsubfp    c3r, b3r, w3r, b1r                      # S1
    vnmsubfp    c3i, b3i, w3r, b1i                      # S1
    vspltw  w1r, t0, 0                                  # S1 even
    lvx t0, Minus16, weights                        # S0 odd
    addi    dataI, dataI, 4*BytesPerBlock
    lvx a2r, LoadOffset2, dataR                     # S0
    lvx a2i, LoadOffset2, dataI                     # S0
    lvx a1r, LoadOffset1, dataR                     # S0
    vspltw  w1i, t1, 3                              # S0 odd
    lvx a1i, LoadOffset1, dataI                     # S0
    vspltw  w3i, t0, 3                              # S0 odd
    lvx a3r, LoadOffset3, dataR                     # S0
    vspltw  w3r, t0, 2                              # S0 odd
    lvx a3i, LoadOffset3, dataI                     # S0
    vspltw  w2i, t0, 1                              # S0 odd
    lvx a0r, LoadOffset0, dataR                     # S0
    vspltw  w2r, t0, 0                              # S0 odd
    lvx a0i, LoadOffset0, dataI                     # S0
    vmaddfp     d0r, c1r, w1r, c0r                      # S1
    stvx    d0r, StoreOffset0, dataR                    # S1
    vmaddfp     d0i, c1i, w1r, c0i                      # S1
    vnmsubfp    d1r, c1r, w1r, c0r                      # S1
    stvx    d0i, StoreOffset0, dataI                    # S1
    vnmsubfp    d1i, c1i, w1r, c0i                      # S1
    vnmsubfp    d2r, c3i, w1r, c2r                      # S1
    stvx    d1r, StoreOffset1, dataR                    # S1
    vmaddfp     d2i, c3r, w1r, c2i                      # S1
    vmaddfp     d3r, c3i, w1r, c2r                      # S1
    stvx    d1i, StoreOffset1, dataI                    # S1
    vnmsubfp    d3i, c3r, w1r, c2i                      # S1
    vnmsubfp    b2r, a2i, w2i, a2r                  # S0
    stvx    d2r, StoreOffset2, dataR                    # S1
    vmaddfp     b2i, a2r, w2i, a2i                  # S0
    vnmsubfp    b1r, a1i, w1i, a1r                  # S0
    stvx    d2i, StoreOffset2, dataI                    # S1
    vmaddfp     b1i, a1r, w1i, a1i                  # S0
    vnmsubfp    b3r, a3i, w3i, a3r                  # S0
    stvx    d3r, StoreOffset3, dataR                    # S1
    vmaddfp     b3i, a3r, w3i, a3i                  # S0
    vmaddfp     c0r, b2r, w2r, a0r                  # S0
    stvx    d3i, StoreOffset3, dataI                    # S1
    vmaddfp     c0i, b2i, w2r, a0i                  # S0

    bdz 3f

# Stage 0 is in an even iteration, stage 1 is in an odd iteration.

    vmaddfp     c1r, b3r, w3r, b1r                      # S1
    vmaddfp     c1i, b3i, w3r, b1i                      # S1
    lvx t0, Zero, weights                           # S0 even
    vnmsubfp    c2r, b2r, w2r, a0r                      # S1
    vnmsubfp    c2i, b2i, w2r, a0i                      # S1
    addi    dataR, dataR, 4*BytesPerBlock
    vnmsubfp    c3r, b3r, w3r, b1r                      # S1
    vnmsubfp    c3i, b3i, w3r, b1i                      # S1
    vspltw  w1r, t1, 2                                  # S1 odd
    lvx t1, Sixteen, weights                        # S0 even
    addi    dataI, dataI, 4*BytesPerBlock
    lvx a2r, LoadOffset2, dataR                     # S0
    lvx a2i, LoadOffset2, dataI                     # S0
    lvx a1r, LoadOffset1, dataR                     # S0
    vspltw  w2i, t0, 3                              # S0 even
    lvx a1i, LoadOffset1, dataI                     # S0
    vspltw  w1i, t0, 1                              # S0 even
    lvx a3r, LoadOffset3, dataR                     # S0
    vspltw  w2r, t0, 2                              # S0 even
    lvx a3i, LoadOffset3, dataI                     # S0
    vspltw  w3i, t1, 1                              # S0 even
    lvx a0r, LoadOffset0, dataR                     # S0
    vspltw  w3r, t1, 0                              # S0 even
    lvx a0i, LoadOffset0, dataI                     # S0
    vmaddfp     d0r, c1r, w1r, c0r                      # S1
    stvx    d0r, StoreOffset0, dataR                    # S1
    vmaddfp     d0i, c1i, w1r, c0i                      # S1
    vnmsubfp    d1r, c1r, w1r, c0r                      # S1
    stvx    d0i, StoreOffset0, dataI                    # S1
    vnmsubfp    d1i, c1i, w1r, c0i                      # S1
    vnmsubfp    d2r, c3i, w1r, c2r                      # S1
    stvx    d1r, StoreOffset1, dataR                    # S1
    vmaddfp     d2i, c3r, w1r, c2i                      # S1
    vmaddfp     d3r, c3i, w1r, c2r                      # S1
    stvx    d1i, StoreOffset1, dataI                    # S1
    vnmsubfp    d3i, c3r, w1r, c2i                      # S1
    vnmsubfp    b2r, a2i, w2i, a2r                  # S0
    stvx    d2r, StoreOffset2, dataR                    # S1
    vmaddfp     b2i, a2r, w2i, a2i                  # S0
    vnmsubfp    b1r, a1i, w1i, a1r                  # S0
    stvx    d2i, StoreOffset2, dataI                    # S1
    vmaddfp     b1i, a1r, w1i, a1i                  # S0
    vnmsubfp    b3r, a3i, w3i, a3r                  # S0
    stvx    d3r, StoreOffset3, dataR                    # S1
    vmaddfp     b3i, a3r, w3i, a3i                  # S0
    vmaddfp     c0r, b2r, w2r, a0r                  # S0
    stvx    d3i, StoreOffset3, dataI                    # S1
    vmaddfp     c0i, b2i, w2r, a0i                  # S0

    bdnz    1b

# Epilogue, stage 1 for an even iteration.
2:

    vmaddfp     c1r, b3r, w3r, b1r                      # S1
    vmaddfp     c1i, b3i, w3r, b1i                      # S1
    vspltw  w1r, t0, 0                                  # S1 even

# The remaining code is the same for both epilogues.
    b   4f

# Epilogue, stage 1 for an odd iteration.
3:
    vmaddfp     c1r, b3r, w3r, b1r                      # S1
    vmaddfp     c1i, b3i, w3r, b1i                      # S1
    vspltw  w1r, t1, 2                                  # S1 odd
4:
    vnmsubfp    c2r, b2r, w2r, a0r                      # S1
    vnmsubfp    c2i, b2i, w2r, a0i                      # S1
    addi    dataR, dataR, 4*BytesPerBlock
    vnmsubfp    c3r, b3r, w3r, b1r                      # S1
    vnmsubfp    c3i, b3i, w3r, b1i                      # S1
    addi    dataI, dataI, 4*BytesPerBlock
    vmaddfp     d0r, c1r, w1r, c0r                      # S1
    stvx    d0r, StoreOffset0, dataR                    # S1
    vmaddfp     d0i, c1i, w1r, c0i                      # S1
    stvx    d0i, StoreOffset0, dataI                    # S1
    vnmsubfp    d1r, c1r, w1r, c0r                      # S1
    stvx    d1r, StoreOffset1, dataR                    # S1
    vnmsubfp    d1i, c1i, w1r, c0i                      # S1
    stvx    d1i, StoreOffset1, dataI                    # S1
    vnmsubfp    d2r, c3i, w1r, c2r                      # S1
    stvx    d2r, StoreOffset2, dataR                    # S1
    vmaddfp     d2i, c3r, w1r, c2i                      # S1
    stvx    d2i, StoreOffset2, dataI                    # S1
    vmaddfp     d3r, c3i, w1r, c2r                      # S1
    stvx    d3r, StoreOffset3, dataR                    # S1
    vnmsubfp    d3i, c3r, w1r, c2i                      # S1
    stvx    d3i, StoreOffset3, dataI                    # S1

    blr


    .endif